# TorchDynamo / torch.fx graph capture of a transformer-layer forward pass:
#   (1) pre-norm multi-head self-attention with positional embeddings, with a
#       residual add back onto the raw input, then
#   (2) a single-head attention of the normalized output against a long-term
#       memory bank (keys = memory[0], values = memory[1]).
# Reformatted from the single-line compiler dump; all runtime tokens are
# unchanged — only whitespace and comments were added.  The string annotations
# record the traced dtype/shape/stride/device, e.g.
# "f32[s0, 1, 256][256, 256, 1]cuda:0"; names like s0, s4, s10 are symbolic
# (dynamic) dimensions.  The trailing `; x = None` statements are Dynamo's
# liveness clearing (drop references as soon as a value is dead) — they are
# part of the generated code, not noise.
class GraphModule(torch.nn.Module):
    def forward(
        self,
        # norm1 (pre-self-attention LayerNorm) parameters
        L_self_modules_norm1_parameters_weight_: "f32[256][1]cuda:0",
        L_self_modules_norm1_parameters_bias_: "f32[256][1]cuda:0",
        s0: "Sym(s0)",  # sequence length of tgt
        L_tgt_: "f32[s0, 1, 256][256, 256, 1]cuda:0",       # input sequence (len, batch=1, d_model=256)
        L_self_pos_: "f32[s0, 1, 256][256, 256, 1]cuda:0",  # positional embedding added for Q/K
        L_self_modules_self_attn_num_head: "Sym(s4)",       # number of self-attention heads
        L_self_modules_self_attn_hidden_dim: "Sym(s5)",     # per-head hidden dim used for V
        # self-attention Q/K/V projection parameters
        L_self_modules_self_attn_modules_linear_Q_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_Q_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_modules_linear_K_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_K_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_modules_linear_V_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_V_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_d_att: "Sym(s6)",  # per-head attention dim used for Q/K
        # self-attention output projection
        L_self_modules_self_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0",
        # norm2 (LayerNorm before the long-term attention) parameters
        L_self_modules_norm2_parameters_weight_: "f32[256][1]cuda:0",
        L_self_modules_norm2_parameters_bias_: "f32[256][1]cuda:0",
        # fused Q/V projection for the long-term attention (512 = 2 * d_model)
        L_self_modules_linear_QV_parameters_weight_: "f32[512, 256][256, 1]cuda:0",
        L_self_modules_linear_QV_parameters_bias_: "f32[512][1]cuda:0",
        L_size_2d_0_: "Sym(s7)",  # spatial height h for seq_to_2d
        L_size_2d_1_: "Sym(s8)",  # spatial width w for seq_to_2d
        s9: "Sym(s10)",  # long-term memory length
        L_long_term_memory_0_: "f16[s10, 1, 256][256, 256, 1]cuda:0",  # memory keys
        L_long_term_memory_1_: "f16[s10, 1, 256][256, 256, 1]cuda:0",  # memory values
        # long-term attention output projection
        L_self_modules_long_term_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_long_term_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0",
    ):
        # --- alias graph inputs to lowercase local names (Dynamo convention) ---
        l_self_modules_norm1_parameters_weight_ = L_self_modules_norm1_parameters_weight_
        l_self_modules_norm1_parameters_bias_ = L_self_modules_norm1_parameters_bias_
        l_tgt_ = L_tgt_
        l_self_pos_ = L_self_pos_
        l_self_modules_self_attn_num_head = L_self_modules_self_attn_num_head
        l_self_modules_self_attn_hidden_dim = L_self_modules_self_attn_hidden_dim
        l_self_modules_self_attn_modules_linear_q_parameters_weight_ = L_self_modules_self_attn_modules_linear_Q_parameters_weight_
        l_self_modules_self_attn_modules_linear_q_parameters_bias_ = L_self_modules_self_attn_modules_linear_Q_parameters_bias_
        l_self_modules_self_attn_modules_linear_k_parameters_weight_ = L_self_modules_self_attn_modules_linear_K_parameters_weight_
        l_self_modules_self_attn_modules_linear_k_parameters_bias_ = L_self_modules_self_attn_modules_linear_K_parameters_bias_
        l_self_modules_self_attn_modules_linear_v_parameters_weight_ = L_self_modules_self_attn_modules_linear_V_parameters_weight_
        l_self_modules_self_attn_modules_linear_v_parameters_bias_ = L_self_modules_self_attn_modules_linear_V_parameters_bias_
        l_self_modules_self_attn_d_att = L_self_modules_self_attn_d_att
        l_self_modules_self_attn_modules_projection_parameters_weight_ = L_self_modules_self_attn_modules_projection_parameters_weight_
        l_self_modules_self_attn_modules_projection_parameters_bias_ = L_self_modules_self_attn_modules_projection_parameters_bias_
        l_self_modules_norm2_parameters_weight_ = L_self_modules_norm2_parameters_weight_
        l_self_modules_norm2_parameters_bias_ = L_self_modules_norm2_parameters_bias_
        l_self_modules_linear_qv_parameters_weight_ = L_self_modules_linear_QV_parameters_weight_
        l_self_modules_linear_qv_parameters_bias_ = L_self_modules_linear_QV_parameters_bias_
        l_size_2d_0_ = L_size_2d_0_
        l_size_2d_1_ = L_size_2d_1_
        l_long_term_memory_0_ = L_long_term_memory_0_
        l_long_term_memory_1_ = L_long_term_memory_1_
        l_self_modules_long_term_attn_modules_projection_parameters_weight_ = L_self_modules_long_term_attn_modules_projection_parameters_weight_
        l_self_modules_long_term_attn_modules_projection_parameters_bias_ = L_self_modules_long_term_attn_modules_projection_parameters_bias_

        # === Phase 1: pre-norm multi-head self-attention =====================
        # normalization.py:201 — norm1 on the input (pre-norm architecture)
        _tgt: "f32[s0, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(l_tgt_, (256,), l_self_modules_norm1_parameters_weight_, l_self_modules_norm1_parameters_bias_, 1e-05); l_self_modules_norm1_parameters_weight_ = l_self_modules_norm1_parameters_bias_ = None

        # transformer.py:752 with_pos_embed — add positional embedding
        q: "f32[s0, 1, 256][256, 256, 1]cuda:0" = _tgt + l_self_pos_; l_self_pos_ = None

        # linear.py:116 — Q projection; f32 in, f16 out per the traced
        # annotations (presumably autocast — TODO confirm against the caller)
        Q: "f16[s0, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(q, l_self_modules_self_attn_modules_linear_q_parameters_weight_, l_self_modules_self_attn_modules_linear_q_parameters_bias_); l_self_modules_self_attn_modules_linear_q_parameters_weight_ = l_self_modules_self_attn_modules_linear_q_parameters_bias_ = None
        # K projection — computed from the SAME pos-embedded tensor `q`, so
        # keys carry the positional embedding as well
        K: "f16[s0, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(q, l_self_modules_self_attn_modules_linear_k_parameters_weight_, l_self_modules_self_attn_modules_linear_k_parameters_bias_); q = l_self_modules_self_attn_modules_linear_k_parameters_weight_ = l_self_modules_self_attn_modules_linear_k_parameters_bias_ = None
        # V projection — from the normalized input WITHOUT positional embedding
        V: "f16[s0, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(_tgt, l_self_modules_self_attn_modules_linear_v_parameters_weight_, l_self_modules_self_attn_modules_linear_v_parameters_bias_); _tgt = l_self_modules_self_attn_modules_linear_v_parameters_weight_ = l_self_modules_self_attn_modules_linear_v_parameters_bias_ = None

        # attention.py:80 — temperature scaling; 5.656854... == sqrt(32),
        # presumably sqrt(d_att) baked in as a constant — TODO confirm
        Q_1: "f16[s0, 1, 256][256, 256, 1]cuda:0" = Q / 5.656854249492381; Q = None
        # attention.py:90-92 — reshape to (bs, head, len, dim) layouts; K is
        # permuted to (bs, head, dim, len) so QK is a plain matmul
        view: "f16[s0, 1, s4, (256//s4)][256, 256, (256//s4), 1]cuda:0" = Q_1.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_d_att); Q_1 = None
        Q_2: "f16[1, s4, s0, (256//s4)][256, (256//s4), 256, 1]cuda:0" = view.permute(1, 2, 0, 3); view = None
        view_1: "f16[s0, 1, s4, (256//s4)][256, 256, (256//s4), 1]cuda:0" = K.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_d_att); K = l_self_modules_self_attn_d_att = None
        K_1: "f16[1, s4, (256//s4), s0][256, (256//s4), 1, 256]cuda:0" = view_1.permute(1, 2, 3, 0); view_1 = None
        view_2: "f16[s0, 1, s4, (256//s4)][256, 256, (256//s4), 1]cuda:0" = V.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_hidden_dim); V = l_self_modules_self_attn_num_head = l_self_modules_self_attn_hidden_dim = None
        V_1: "f16[1, s4, s0, (256//s4)][256, (256//s4), 256, 1]cuda:0" = view_2.permute(1, 2, 0, 3); view_2 = None

        # attention.py:8 multiply_by_ychunks — attention logits (f16)
        QK: "f16[1, s4, s0, s0][s0**2*s4, s0**2, s0, 1]cuda:0" = Q_2 @ K_1; Q_2 = K_1 = None
        # attention.py:114 — softmax upcasts f16 logits to f32 per the trace
        attn: "f32[1, s4, s0, s0][s0**2*s4, s0**2, s0, 1]cuda:0" = torch.softmax(QK, dim = -1); QK = None
        # dropout.py:59 — p=0.0, training=False: traced as a no-op
        attn_1: "f32[1, s4, s0, s0][s0**2*s4, s0**2, s0, 1]cuda:0" = torch.nn.functional.dropout(attn, 0.0, False, False); attn = None
        # attention.py:14 multiply_by_xchunks — weighted sum of values
        matmul_1: "f16[1, s4, s0, (256//s4)][s0*s4*((256//s4)), s0*((256//s4)), (256//s4), 1]cuda:0" = attn_1 @ V_1; attn_1 = V_1 = None
        # attention.py:120-122 — back to (len, bs, d_model) and project out
        outputs: "f16[s0, 1, s4, (256//s4)][(256//s4), s0*s4*((256//s4)), s0*((256//s4)), 1]cuda:0" = matmul_1.permute(2, 0, 1, 3); matmul_1 = None
        outputs_1: "f16[s0, 1, s4*((256//s4))][s4*((256//s4)), s4*((256//s4)), 1]cuda:0" = outputs.reshape(-1, 1, 256); outputs = None
        outputs_2: "f16[s0, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_1, l_self_modules_self_attn_modules_projection_parameters_weight_, l_self_modules_self_attn_modules_projection_parameters_bias_); outputs_1 = l_self_modules_self_attn_modules_projection_parameters_weight_ = l_self_modules_self_attn_modules_projection_parameters_bias_ = None

        # transformer.py:772 — residual add onto the RAW input (droppath was
        # traced as identity: no dropout op appears here)
        tgt: "f32[s0, 1, 256][256, 256, 1]cuda:0" = l_tgt_ + outputs_2; l_tgt_ = outputs_2 = None

        # === Phase 2: long-term memory attention =============================
        # normalization.py:201 — norm2 before the long-term attention
        _tgt_1: "f32[s0, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(tgt, (256,), l_self_modules_norm2_parameters_weight_, l_self_modules_norm2_parameters_bias_, 1e-05); l_self_modules_norm2_parameters_weight_ = l_self_modules_norm2_parameters_bias_ = None

        # linear.py:116 — fused projection producing Q and V in one matmul
        curr_QV: "f16[s0, 1, 512][512, 512, 1]cuda:0" = torch._C._nn.linear(_tgt_1, l_self_modules_linear_qv_parameters_weight_, l_self_modules_linear_qv_parameters_bias_); l_self_modules_linear_qv_parameters_weight_ = l_self_modules_linear_qv_parameters_bias_ = None
        # transformer.py:778 — split the 512-wide output into two 256-wide
        # views (note the stride 512: these are non-contiguous views)
        split = torch.functional.split(curr_QV, 256, dim = 2); curr_QV = None
        curr_Q: "f16[s0, 1, 256][512, 512, 1]cuda:0" = split[0]
        curr_V: "f16[s0, 1, 256][512, 512, 1]cuda:0" = split[1]; split = None

        # basic.py:93 seq_to_2d — (h*w, n, c) -> contiguous (n, c, h, w)
        view_3: "f16[s7, (s0//s7), 1, 256][512*((s0//s7)), 512, 512, 1]cuda:0" = curr_Q.view(l_size_2d_0_, l_size_2d_1_, 1, 256); l_size_2d_0_ = l_size_2d_1_ = None
        permute_4: "f16[1, 256, s7, (s0//s7)][512, 1, 512*((s0//s7)), 512]cuda:0" = view_3.permute(2, 3, 0, 1); view_3 = None
        tensor: "f16[1, 256, s7, (s0//s7)][256*s7*((s0//s7)), s7*((s0//s7)), (s0//s7), 1]cuda:0" = permute_4.contiguous(); permute_4 = None

        # attention.py:80 — temperature scaling; 16.0 == sqrt(256),
        # presumably sqrt(d_att) for this single-head attention — TODO confirm
        Q_3: "f16[s0, 1, 256][256, 256, 1]cuda:0" = curr_Q / 16.0
        # attention.py:90-92 — single head (num_head=1, d_att=256); memory[0]
        # serves as keys, memory[1] as values, no projection applied to them
        view_4: "f16[s0, 1, 1, 256][256, 256, 256, 1]cuda:0" = Q_3.view(-1, 1, 1, 256); Q_3 = None
        Q_4: "f16[1, 1, s0, 256][256, 256, 256, 1]cuda:0" = view_4.permute(1, 2, 0, 3); view_4 = None
        view_5: "f16[s10, 1, 1, 256][256, 256, 256, 1]cuda:0" = l_long_term_memory_0_.view(-1, 1, 1, 256); l_long_term_memory_0_ = None
        K_2: "f16[1, 1, 256, s10][256, 256, 1, 256]cuda:0" = view_5.permute(1, 2, 3, 0); view_5 = None
        view_6: "f16[s10, 1, 1, 256][256, 256, 256, 1]cuda:0" = l_long_term_memory_1_.view(-1, 1, 1, 256); l_long_term_memory_1_ = None
        V_2: "f16[1, 1, s10, 256][256, 256, 256, 1]cuda:0" = view_6.permute(1, 2, 0, 3); view_6 = None

        # attention.py:8 — logits over the s10 memory slots
        QK_1: "f16[1, 1, s0, s10][s0*s10, s0*s10, s10, 1]cuda:0" = Q_4 @ K_2; Q_4 = K_2 = None
        # attention.py:114 — softmax again upcasts to f32
        attn_2: "f32[1, 1, s0, s10][s0*s10, s0*s10, s10, 1]cuda:0" = torch.softmax(QK_1, dim = -1); QK_1 = None
        # dropout.py:59 — p=0.0, training=False: no-op
        attn_3: "f32[1, 1, s0, s10][s0*s10, s0*s10, s10, 1]cuda:0" = torch.nn.functional.dropout(attn_2, 0.0, False, False); attn_2 = None
        # attention.py:14 — weighted sum of memory values
        matmul_3: "f16[1, 1, s0, 256][256*s0, 256*s0, 256, 1]cuda:0" = attn_3 @ V_2; attn_3 = V_2 = None
        # attention.py:120-122 — back to (len, bs, d_model) and project out
        outputs_3: "f16[s0, 1, 1, 256][256, 256*s0, 256*s0, 1]cuda:0" = matmul_3.permute(2, 0, 1, 3); matmul_3 = None
        outputs_4: "f16[s0, 1, 256][256, 256*s0, 1]cuda:0" = outputs_3.reshape(-1, 1, 256); outputs_3 = None
        outputs_5: "f16[s0, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_4, l_self_modules_long_term_attn_modules_projection_parameters_weight_, l_self_modules_long_term_attn_modules_projection_parameters_bias_); outputs_4 = l_self_modules_long_term_attn_modules_projection_parameters_weight_ = l_self_modules_long_term_attn_modules_projection_parameters_bias_ = None

        # Graph outputs: 2-D query map, self-attention residual result, its
        # normalized form, long-term attention output, and the raw Q/V splits
        # (presumably reused by the non-compiled remainder of the layer)
        return (tensor, tgt, _tgt_1, outputs_5, curr_Q, curr_V)