class GraphModule(torch.nn.Module):
    def forward(
        self,
        L_self_modules_norm1_parameters_weight_: "f32[256][1]cuda:0",
        L_self_modules_norm1_parameters_bias_: "f32[256][1]cuda:0",
        s2: "Sym(s13)",
        L_tgt_: "f32[s13, 1, 256][1, 256*s13, s13]cuda:0",
        L_self_pos_: "f32[s13, 1, 256][256, 256, 1]cuda:0",
        L_self_modules_self_attn_num_head: "Sym(s6)",
        L_self_modules_self_attn_hidden_dim: "Sym(s7)",
        L_self_modules_self_attn_modules_linear_Q_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_Q_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_modules_linear_K_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_K_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_modules_linear_V_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_linear_V_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_self_attn_d_att: "Sym(s8)",
        L_self_modules_self_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_self_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_norm2_parameters_weight_: "f32[256][1]cuda:0",
        L_self_modules_norm2_parameters_bias_: "f32[256][1]cuda:0",
        L_self_modules_linear_QV_parameters_weight_: "f32[512, 256][256, 1]cuda:0",
        L_self_modules_linear_QV_parameters_bias_: "f32[512][1]cuda:0",
        L_size_2d_0_: "Sym(s9)",
        L_size_2d_1_: "Sym(s10)",
        L_curr_id_emb_: "f32[s13, 1, 256][256, 256, 1]cuda:0",
        L_self_modules_linear_ID_KV_parameters_weight_: "f32[257, 256][256, 1]cuda:0",
        L_self_modules_linear_ID_KV_parameters_bias_: "f32[257][1]cuda:0",
        L_self_modules_long_term_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0",
        L_self_modules_long_term_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0",
    ):
        l_self_modules_norm1_parameters_weight_ = L_self_modules_norm1_parameters_weight_
        l_self_modules_norm1_parameters_bias_ = L_self_modules_norm1_parameters_bias_
        l_tgt_ = L_tgt_
        l_self_pos_ = L_self_pos_
        l_self_modules_self_attn_num_head = L_self_modules_self_attn_num_head
        l_self_modules_self_attn_hidden_dim = L_self_modules_self_attn_hidden_dim
        l_self_modules_self_attn_modules_linear_q_parameters_weight_ = L_self_modules_self_attn_modules_linear_Q_parameters_weight_
        l_self_modules_self_attn_modules_linear_q_parameters_bias_ = L_self_modules_self_attn_modules_linear_Q_parameters_bias_
        l_self_modules_self_attn_modules_linear_k_parameters_weight_ = L_self_modules_self_attn_modules_linear_K_parameters_weight_
        l_self_modules_self_attn_modules_linear_k_parameters_bias_ = L_self_modules_self_attn_modules_linear_K_parameters_bias_
        l_self_modules_self_attn_modules_linear_v_parameters_weight_ = L_self_modules_self_attn_modules_linear_V_parameters_weight_
        l_self_modules_self_attn_modules_linear_v_parameters_bias_ = L_self_modules_self_attn_modules_linear_V_parameters_bias_
        l_self_modules_self_attn_d_att = L_self_modules_self_attn_d_att
        l_self_modules_self_attn_modules_projection_parameters_weight_ = L_self_modules_self_attn_modules_projection_parameters_weight_
        l_self_modules_self_attn_modules_projection_parameters_bias_ = L_self_modules_self_attn_modules_projection_parameters_bias_
        l_self_modules_norm2_parameters_weight_ = L_self_modules_norm2_parameters_weight_
        l_self_modules_norm2_parameters_bias_ = L_self_modules_norm2_parameters_bias_
        l_self_modules_linear_qv_parameters_weight_ = L_self_modules_linear_QV_parameters_weight_
        l_self_modules_linear_qv_parameters_bias_ = L_self_modules_linear_QV_parameters_bias_
        l_size_2d_0_ = L_size_2d_0_
        l_size_2d_1_ = L_size_2d_1_
        l_curr_id_emb_ = L_curr_id_emb_
        l_self_modules_linear_id_kv_parameters_weight_ = L_self_modules_linear_ID_KV_parameters_weight_
        l_self_modules_linear_id_kv_parameters_bias_ = L_self_modules_linear_ID_KV_parameters_bias_
        l_self_modules_long_term_attn_modules_projection_parameters_weight_ = L_self_modules_long_term_attn_modules_projection_parameters_weight_
        l_self_modules_long_term_attn_modules_projection_parameters_bias_ = L_self_modules_long_term_attn_modules_projection_parameters_bias_

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/normalization.py:201 in forward, code: return F.layer_norm(
        _tgt: "f32[s13, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(l_tgt_, (256,), l_self_modules_norm1_parameters_weight_, l_self_modules_norm1_parameters_bias_, 1e-05); l_self_modules_norm1_parameters_weight_ = l_self_modules_norm1_parameters_bias_ = None

        # File: /workspace/networks/layers/transformer.py:752 in with_pos_embed, code: return tensor if pos is None else tensor + pos
        q: "f32[s13, 1, 256][256, 256, 1]cuda:0" = _tgt + l_self_pos_; l_self_pos_ = None

        # File: /workspace/networks/layers/transformer.py:768 in forward, code: k = k[::self.global_dilation,:,:]
        k: "f32[((s13 + 1)//2), 1, 256][512, 256, 1]cuda:0" = q[(slice(None, None, 2), slice(None, None, None), slice(None, None, None))]

        # File: /workspace/networks/layers/transformer.py:769 in forward, code: v = v[::self.global_dilation,:,:]
        v: "f32[((s13 + 1)//2), 1, 256][512, 256, 1]cuda:0" = _tgt[(slice(None, None, 2), slice(None, None, None), slice(None, None, None))]; _tgt = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        Q: "f16[s13, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(q, l_self_modules_self_attn_modules_linear_q_parameters_weight_, l_self_modules_self_attn_modules_linear_q_parameters_bias_); q = l_self_modules_self_attn_modules_linear_q_parameters_weight_ = l_self_modules_self_attn_modules_linear_q_parameters_bias_ = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        K: "f16[((s13 + 1)//2), 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(k, l_self_modules_self_attn_modules_linear_k_parameters_weight_, l_self_modules_self_attn_modules_linear_k_parameters_bias_); k = l_self_modules_self_attn_modules_linear_k_parameters_weight_ = l_self_modules_self_attn_modules_linear_k_parameters_bias_ = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        V: "f16[((s13 + 1)//2), 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(v, l_self_modules_self_attn_modules_linear_v_parameters_weight_, l_self_modules_self_attn_modules_linear_v_parameters_bias_); v = l_self_modules_self_attn_modules_linear_v_parameters_weight_ = l_self_modules_self_attn_modules_linear_v_parameters_bias_ = None

        # File: /workspace/networks/layers/attention.py:80 in forward, code: Q = Q / self.T
        Q_1: "f16[s13, 1, 256][256, 256, 1]cuda:0" = Q / 8.0; Q = None

        # File: /workspace/networks/layers/attention.py:90 in forward, code: Q = Q.view(-1, bs, num_head, self.d_att).permute(1, 2, 0, 3)
        view: "f16[s13, 1, s6, (256//s6)][256, 256, (256//s6), 1]cuda:0" = Q_1.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_d_att); Q_1 = None
        Q_2: "f16[1, s6, s13, (256//s6)][256, (256//s6), 256, 1]cuda:0" = view.permute(1, 2, 0, 3); view = None

        # File: /workspace/networks/layers/attention.py:91 in forward, code: K = K.view(-1, bs, num_head, self.d_att).permute(1, 2, 3, 0)
        view_1: "f16[((s13 + 1)//2), 1, s6, (256//s6)][256, 256, (256//s6), 1]cuda:0" = K.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_d_att); K = l_self_modules_self_attn_d_att = None
        K_1: "f16[1, s6, (256//s6), ((s13 + 1)//2)][256, (256//s6), 1, 256]cuda:0" = view_1.permute(1, 2, 3, 0); view_1 = None

        # File: /workspace/networks/layers/attention.py:92 in forward, code: V = V.view(-1, bs, num_head, hidden_dim).permute(1, 2, 0, 3)
        view_2: "f16[((s13 + 1)//2), 1, s6, (256//s6)][256, 256, (256//s6), 1]cuda:0" = V.view(-1, 1, l_self_modules_self_attn_num_head, l_self_modules_self_attn_hidden_dim); V = l_self_modules_self_attn_num_head = l_self_modules_self_attn_hidden_dim = None
        V_1: "f16[1, s6, ((s13 + 1)//2), (256//s6)][256, (256//s6), 256, 1]cuda:0" = view_2.permute(1, 2, 0, 3); view_2 = None

        # File: /workspace/networks/layers/attention.py:8 in multiply_by_ychunks, code: return x @ y
        QK: "f16[1, s6, s13, ((s13 + 1)//2)][s13*s6*(((s13 + 1)//2)), s13*(((s13 + 1)//2)), ((s13 + 1)//2), 1]cuda:0" = Q_2 @ K_1; Q_2 = K_1 = None

        # File: /workspace/networks/layers/attention.py:114 in forward, code: attn = torch.softmax(QK, dim=-1)
        attn: "f32[1, s6, s13, ((s13 + 1)//2)][s13*s6*(((s13 + 1)//2)), s13*(((s13 + 1)//2)), ((s13 + 1)//2), 1]cuda:0" = torch.softmax(QK, dim = -1); QK = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/dropout.py:59 in forward, code: return F.dropout(input, self.p, self.training, self.inplace)
        attn_1: "f32[1, s6, s13, ((s13 + 1)//2)][s13*s6*(((s13 + 1)//2)), s13*(((s13 + 1)//2)), ((s13 + 1)//2), 1]cuda:0" = torch.nn.functional.dropout(attn, 0.0, False, False); attn = None

        # File: /workspace/networks/layers/attention.py:14 in multiply_by_xchunks, code: return x @ y
        matmul_1: "f16[1, s6, s13, (256//s6)][s13*s6*((256//s6)), s13*((256//s6)), (256//s6), 1]cuda:0" = attn_1 @ V_1; attn_1 = V_1 = None

        # File: /workspace/networks/layers/attention.py:120 in forward, code: self.qk_chunks).permute(2, 0, 1, 3)
        outputs: "f16[s13, 1, s6, (256//s6)][(256//s6), s13*s6*((256//s6)), s13*((256//s6)), 1]cuda:0" = matmul_1.permute(2, 0, 1, 3); matmul_1 = None

        # File: /workspace/networks/layers/attention.py:122 in forward, code: outputs = outputs.reshape(-1, bs, self.d_model)
        outputs_1: "f16[s13, 1, s6*((256//s6))][s6*((256//s6)), s6*((256//s6)), 1]cuda:0" = outputs.reshape(-1, 1, 256); outputs = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        outputs_2: "f16[s13, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_1, l_self_modules_self_attn_modules_projection_parameters_weight_, l_self_modules_self_attn_modules_projection_parameters_bias_); outputs_1 = l_self_modules_self_attn_modules_projection_parameters_weight_ = l_self_modules_self_attn_modules_projection_parameters_bias_ = None

        # File: /workspace/networks/layers/transformer.py:772 in forward, code: tgt = tgt + self.droppath(tgt2)
        tgt: "f32[s13, 1, 256][1, 256*s13, s13]cuda:0" = l_tgt_ + outputs_2; l_tgt_ = outputs_2 = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/normalization.py:201 in forward, code: return F.layer_norm(
        _tgt_1: "f32[s13, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(tgt, (256,), l_self_modules_norm2_parameters_weight_, l_self_modules_norm2_parameters_bias_, 1e-05); l_self_modules_norm2_parameters_weight_ = l_self_modules_norm2_parameters_bias_ = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        curr_QV: "f16[s13, 1, 512][512, 512, 1]cuda:0" = torch._C._nn.linear(_tgt_1, l_self_modules_linear_qv_parameters_weight_, l_self_modules_linear_qv_parameters_bias_); l_self_modules_linear_qv_parameters_weight_ = l_self_modules_linear_qv_parameters_bias_ = None

        # File: /workspace/networks/layers/transformer.py:778 in forward, code: curr_QV = torch.split(curr_QV, self.d_model, dim=2)
        split = torch.functional.split(curr_QV, 256, dim = 2); curr_QV = None
        curr_Q: "f16[s13, 1, 256][512, 512, 1]cuda:0" = split[0]
        curr_V: "f16[s13, 1, 256][512, 512, 1]cuda:0" = split[1]; split = None

        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_3: "f16[s9, (s13//s9), 1, 256][512*((s13//s9)), 512, 512, 1]cuda:0" = curr_Q.view(l_size_2d_0_, l_size_2d_1_, 1, 256)
        permute_4: "f16[1, 256, s9, (s13//s9)][512, 1, 512*((s13//s9)), 512]cuda:0" = view_3.permute(2, 3, 0, 1); view_3 = None
        tensor: "f16[1, 256, s9, (s13//s9)][256*s9*((s13//s9)), s9*((s13//s9)), (s13//s9), 1]cuda:0" = permute_4.contiguous(); permute_4 = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        ID_KV: "f16[s13, 1, 257][257, 257, 1]cuda:0" = torch._C._nn.linear(l_curr_id_emb_, l_self_modules_linear_id_kv_parameters_weight_, l_self_modules_linear_id_kv_parameters_bias_); l_curr_id_emb_ = l_self_modules_linear_id_kv_parameters_weight_ = l_self_modules_linear_id_kv_parameters_bias_ = None

        # File: /workspace/networks/layers/transformer.py:855 in fuse_key_value_id, code: ID_K, ID_V = torch.split(ID_KV, [self.att_nhead, self.d_model], dim=2)
        split_1 = torch.functional.split(ID_KV, [1, 256], dim = 2); ID_KV = None
        ID_K: "f16[s13, 1, 1][257, 257, 1]cuda:0" = split_1[0]
        ID_V: "f16[s13, 1, 256][257, 257, 1]cuda:0" = split_1[1]; split_1 = None

        # File: /workspace/networks/layers/transformer.py:857 in fuse_key_value_id, code: K = key.view(-1, bs, self.att_nhead, self.d_model //
        view_4: "f16[s13, 1, 1, 256][512, 512, 256, 1]cuda:0" = curr_Q.view(-1, 1, 1, 256)

        # File: /workspace/networks/layers/transformer.py:858 in fuse_key_value_id, code: self.att_nhead) * (1 + torch.tanh(ID_K)).unsqueeze(-1)
        tanh: "f16[s13, 1, 1][1, 1, 1]cuda:0" = torch.tanh(ID_K); ID_K = None
        add_2: "f16[s13, 1, 1][1, 1, 1]cuda:0" = 1 + tanh; tanh = None
        unsqueeze: "f16[s13, 1, 1, 1][1, 1, 1, 1]cuda:0" = add_2.unsqueeze(-1); add_2 = None

        # File: /workspace/networks/layers/transformer.py:857 in fuse_key_value_id, code: K = key.view(-1, bs, self.att_nhead, self.d_model //
        K_2: "f16[s13, 1, 1, 256][256, 256, 256, 1]cuda:0" = view_4 * unsqueeze; view_4 = unsqueeze = None

        # File: /workspace/networks/layers/transformer.py:859 in fuse_key_value_id, code: K = K.view(-1, bs, self.d_model)
        K_3: "f16[s13, 1, 256][256, 256, 1]cuda:0" = K_2.view(-1, 1, 256); K_2 = None

        # File: /workspace/networks/layers/transformer.py:860 in fuse_key_value_id, code: V = value + ID_V
        V_2: "f16[s13, 1, 256][256, 256, 1]cuda:0" = curr_V + ID_V; ID_V = None

        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_6: "f16[s9, (s13//s9), 1, 256][256*((s13//s9)), 256, 256, 1]cuda:0" = K_3.view(l_size_2d_0_, l_size_2d_1_, 1, 256)
        permute_5: "f16[1, 256, s9, (s13//s9)][256, 1, 256*((s13//s9)), 256]cuda:0" = view_6.permute(2, 3, 0, 1); view_6 = None
        tensor_1: "f16[1, 256, s9, (s13//s9)][256*s9*((s13//s9)), s9*((s13//s9)), (s13//s9), 1]cuda:0" = permute_5.contiguous(); permute_5 = None

        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_7: "f16[s9, (s13//s9), 1, 256][256*((s13//s9)), 256, 256, 1]cuda:0" = V_2.view(l_size_2d_0_, l_size_2d_1_, 1, 256)
        permute_6: "f16[1, 256, s9, (s13//s9)][256, 1, 256*((s13//s9)), 256]cuda:0" = view_7.permute(2, 3, 0, 1); view_7 = None
        tensor_2: "f16[1, 256, s9, (s13//s9)][256*s9*((s13//s9)), s9*((s13//s9)), (s13//s9), 1]cuda:0" = permute_6.contiguous(); permute_6 = None

        # File: /workspace/networks/layers/transformer.py:805 in forward, code: unfold_K = global_K.view(size_2d[0],size_2d[1],bs,ck)
        unfold_K: "f16[s9, (s13//s9), 1, 256][256*((s13//s9)), 256, 256, 1]cuda:0" = K_3.view(l_size_2d_0_, l_size_2d_1_, 1, 256); K_3 = None

        # File: /workspace/networks/layers/transformer.py:806 in forward, code: unfold_V = global_V.view(size_2d[0],size_2d[1],bs,cv)
        unfold_V: "f16[s9, (s13//s9), 1, 256][256*((s13//s9)), 256, 256, 1]cuda:0" = V_2.view(l_size_2d_0_, l_size_2d_1_, 1, 256); V_2 = l_size_2d_0_ = l_size_2d_1_ = None

        # File: /workspace/networks/layers/transformer.py:807 in forward, code: global_K = unfold_K[::d,::d,:,:].reshape(-1,bs,ck)
        getitem_27: "f16[((s9 + 1)//2), (((s13//s9) + 1)//2), 1, 256][512*((s13//s9)), 512, 256, 1]cuda:0" = unfold_K[(slice(None, None, 2), slice(None, None, 2), slice(None, None, None), slice(None, None, None))]; unfold_K = None
        global_K: "f16[(((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1, 256][256, 256, 1]cuda:0" = getitem_27.reshape(-1, 1, 256); getitem_27 = None

        # File: /workspace/networks/layers/transformer.py:808 in forward, code: global_V = unfold_V[::d,::d,:,:].reshape(-1,bs,cv)
        getitem_28: "f16[((s9 + 1)//2), (((s13//s9) + 1)//2), 1, 256][512*((s13//s9)), 512, 256, 1]cuda:0" = unfold_V[(slice(None, None, 2), slice(None, None, 2), slice(None, None, None), slice(None, None, None))]; unfold_V = None
        global_V: "f16[(((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1, 256][256, 256, 1]cuda:0" = getitem_28.reshape(-1, 1, 256); getitem_28 = None

        # File: /workspace/networks/layers/attention.py:80 in forward, code: Q = Q / self.T
        Q_3: "f16[s13, 1, 256][256, 256, 1]cuda:0" = curr_Q / 16.0

        # File: /workspace/networks/layers/attention.py:90 in forward, code: Q = Q.view(-1, bs, num_head, self.d_att).permute(1, 2, 0, 3)
        view_10: "f16[s13, 1, 1, 256][256, 256, 256, 1]cuda:0" = Q_3.view(-1, 1, 1, 256); Q_3 = None
        Q_4: "f16[1, 1, s13, 256][256, 256, 256, 1]cuda:0" = view_10.permute(1, 2, 0, 3); view_10 = None

        # File: /workspace/networks/layers/attention.py:91 in forward, code: K = K.view(-1, bs, num_head, self.d_att).permute(1, 2, 3, 0)
        view_11: "f16[(((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1, 1, 256][256, 256, 256, 1]cuda:0" = global_K.view(-1, 1, 1, 256)
        K_4: "f16[1, 1, 256, (((s9 + 1)//2))*((((s13//s9) + 1)//2))][256, 256, 1, 256]cuda:0" = view_11.permute(1, 2, 3, 0); view_11 = None

        # File: /workspace/networks/layers/attention.py:92 in forward, code: V = V.view(-1, bs, num_head, hidden_dim).permute(1, 2, 0, 3)
        view_12: "f16[(((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1, 1, 256][256, 256, 256, 1]cuda:0" = global_V.view(-1, 1, 1, 256)
        V_3: "f16[1, 1, (((s9 + 1)//2))*((((s13//s9) + 1)//2)), 256][256, 256, 256, 1]cuda:0" = view_12.permute(1, 2, 0, 3); view_12 = None

        # File: /workspace/networks/layers/attention.py:8 in multiply_by_ychunks, code: return x @ y
        QK_1: "f16[1, 1, s13, (((s9 + 1)//2))*((((s13//s9) + 1)//2))][s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), (((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1]cuda:0" = Q_4 @ K_4; Q_4 = K_4 = None

        # File: /workspace/networks/layers/attention.py:114 in forward, code: attn = torch.softmax(QK, dim=-1)
        attn_2: "f32[1, 1, s13, (((s9 + 1)//2))*((((s13//s9) + 1)//2))][s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), (((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1]cuda:0" = torch.softmax(QK_1, dim = -1); QK_1 = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/dropout.py:59 in forward, code: return F.dropout(input, self.p, self.training, self.inplace)
        attn_3: "f32[1, 1, s13, (((s9 + 1)//2))*((((s13//s9) + 1)//2))][s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), s13*(((s9 + 1)//2))*((((s13//s9) + 1)//2)), (((s9 + 1)//2))*((((s13//s9) + 1)//2)), 1]cuda:0" = torch.nn.functional.dropout(attn_2, 0.0, False, False); attn_2 = None

        # File: /workspace/networks/layers/attention.py:14 in multiply_by_xchunks, code: return x @ y
        matmul_3: "f16[1, 1, s13, 256][256*s13, 256*s13, 256, 1]cuda:0" = attn_3 @ V_3; attn_3 = V_3 = None

        # File: /workspace/networks/layers/attention.py:120 in forward, code: self.qk_chunks).permute(2, 0, 1, 3)
        outputs_3: "f16[s13, 1, 1, 256][256, 256*s13, 256*s13, 1]cuda:0" = matmul_3.permute(2, 0, 1, 3); matmul_3 = None

        # File: /workspace/networks/layers/attention.py:122 in forward, code: outputs = outputs.reshape(-1, bs, self.d_model)
        outputs_4: "f16[s13, 1, 256][256, 256*s13, 1]cuda:0" = outputs_3.reshape(-1, 1, 256); outputs_3 = None

        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        outputs_5: "f16[s13, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_4, l_self_modules_long_term_attn_modules_projection_parameters_weight_, l_self_modules_long_term_attn_modules_projection_parameters_bias_); outputs_4 = l_self_modules_long_term_attn_modules_projection_parameters_weight_ = l_self_modules_long_term_attn_modules_projection_parameters_bias_ = None

        return (tensor, tensor_1, tensor_2, tgt, _tgt_1, outputs_5, curr_Q, curr_V, global_K, global_V)
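# For reference, below is a minimal, self-contained sketch (an illustration, not part of
# the captured graph above) of how a GraphModule listing with symbolic sizes such as
# Sym(s13) can be obtained: enable the "graph_code" logging artifact and compile the
# module with dynamic shapes. ToyBlock is a hypothetical stand-in for the real layer.
import torch


class ToyBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.norm = torch.nn.LayerNorm(256)
        self.proj = torch.nn.Linear(256, 256)

    def forward(self, x):
        # a tiny norm -> linear -> residual pattern, just to give Dynamo something to trace
        return x + self.proj(self.norm(x))


torch._logging.set_logs(graph_code=True)                  # same effect as TORCH_LOGS="graph_code"
compiled = torch.compile(ToyBlock(), dynamic=True)        # dynamic=True lets sizes appear as Sym(s*)
compiled(torch.randn(37, 1, 256))                         # first call traces and logs the captured GraphModule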