class GraphModule(torch.nn.Module):
    """Dynamo-captured FX graph of one transformer decoder layer step.

    NOTE(review): this is machine-generated torch.compile output whose original
    newlines were stripped; formatting is restored here statement-for-statement
    (same ops, same order, same intermediate releases) so the file parses again.
    The string annotations ("f32[...]...cuda:0") are shape/stride/device records
    emitted by Dynamo, not enforced types. The graph performs, in order:
    norm1 -> multi-head self-attention (8 heads, d_att=32) -> residual add ->
    norm2 -> QV projection -> ID-key/value fusion -> single-head long-term
    attention (d_att=256) -> output projection. Returns the 2D-reshaped
    Q/K/V maps, the residual tgt, the normed tgt, the long-term attention
    output, and the fused key/value sequences.
    """

    def forward(self, L_self_modules_norm1_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_norm1_parameters_bias_: "f32[256][1]cuda:0", L_tgt_: "f32[4624, 1, 256][1, 1183744, 4624]cuda:0", L_self_pos_: "f32[4624, 1, 256][256, 256, 1]cuda:0", L_self_modules_self_attn_modules_linear_Q_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_self_attn_modules_linear_Q_parameters_bias_: "f32[256][1]cuda:0", L_self_modules_self_attn_modules_linear_K_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_self_attn_modules_linear_K_parameters_bias_: "f32[256][1]cuda:0", L_self_modules_self_attn_modules_linear_V_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_self_attn_modules_linear_V_parameters_bias_: "f32[256][1]cuda:0", L_self_modules_self_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_self_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0", L_self_modules_norm2_parameters_weight_: "f32[256][1]cuda:0", L_self_modules_norm2_parameters_bias_: "f32[256][1]cuda:0", L_self_modules_linear_QV_parameters_weight_: "f32[512, 256][256, 1]cuda:0", L_self_modules_linear_QV_parameters_bias_: "f32[512][1]cuda:0", L_curr_id_emb_: "f32[4624, 1, 256][256, 256, 1]cuda:0", L_self_modules_linear_ID_KV_parameters_weight_: "f32[257, 256][256, 1]cuda:0", L_self_modules_linear_ID_KV_parameters_bias_: "f32[257][1]cuda:0", L_self_modules_long_term_attn_modules_projection_parameters_weight_: "f32[256, 256][256, 1]cuda:0", L_self_modules_long_term_attn_modules_projection_parameters_bias_: "f32[256][1]cuda:0"):
        # --- Dynamo input aliasing (graph-input names -> local names) ---
        l_self_modules_norm1_parameters_weight_ = L_self_modules_norm1_parameters_weight_
        l_self_modules_norm1_parameters_bias_ = L_self_modules_norm1_parameters_bias_
        l_tgt_ = L_tgt_
        l_self_pos_ = L_self_pos_
        l_self_modules_self_attn_modules_linear_q_parameters_weight_ = L_self_modules_self_attn_modules_linear_Q_parameters_weight_
        l_self_modules_self_attn_modules_linear_q_parameters_bias_ = L_self_modules_self_attn_modules_linear_Q_parameters_bias_
        l_self_modules_self_attn_modules_linear_k_parameters_weight_ = L_self_modules_self_attn_modules_linear_K_parameters_weight_
        l_self_modules_self_attn_modules_linear_k_parameters_bias_ = L_self_modules_self_attn_modules_linear_K_parameters_bias_
        l_self_modules_self_attn_modules_linear_v_parameters_weight_ = L_self_modules_self_attn_modules_linear_V_parameters_weight_
        l_self_modules_self_attn_modules_linear_v_parameters_bias_ = L_self_modules_self_attn_modules_linear_V_parameters_bias_
        l_self_modules_self_attn_modules_projection_parameters_weight_ = L_self_modules_self_attn_modules_projection_parameters_weight_
        l_self_modules_self_attn_modules_projection_parameters_bias_ = L_self_modules_self_attn_modules_projection_parameters_bias_
        l_self_modules_norm2_parameters_weight_ = L_self_modules_norm2_parameters_weight_
        l_self_modules_norm2_parameters_bias_ = L_self_modules_norm2_parameters_bias_
        l_self_modules_linear_qv_parameters_weight_ = L_self_modules_linear_QV_parameters_weight_
        l_self_modules_linear_qv_parameters_bias_ = L_self_modules_linear_QV_parameters_bias_
        l_curr_id_emb_ = L_curr_id_emb_
        l_self_modules_linear_id_kv_parameters_weight_ = L_self_modules_linear_ID_KV_parameters_weight_
        l_self_modules_linear_id_kv_parameters_bias_ = L_self_modules_linear_ID_KV_parameters_bias_
        l_self_modules_long_term_attn_modules_projection_parameters_weight_ = L_self_modules_long_term_attn_modules_projection_parameters_weight_
        l_self_modules_long_term_attn_modules_projection_parameters_bias_ = L_self_modules_long_term_attn_modules_projection_parameters_bias_

        # --- norm1: pre-attention LayerNorm over the channel dim (256) ---
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/normalization.py:201 in forward, code: return F.layer_norm(
        _tgt: "f32[4624, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(l_tgt_, (256,), l_self_modules_norm1_parameters_weight_, l_self_modules_norm1_parameters_bias_, 1e-05);  l_self_modules_norm1_parameters_weight_ = l_self_modules_norm1_parameters_bias_ = None

        # --- self-attention: positional embedding is added to Q/K inputs only, not V ---
        # File: /workspace/networks/layers/transformer.py:752 in with_pos_embed, code: return tensor if pos is None else tensor + pos
        q: "f32[4624, 1, 256][256, 256, 1]cuda:0" = _tgt + l_self_pos_;  l_self_pos_ = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        Q: "f16[4624, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(q, l_self_modules_self_attn_modules_linear_q_parameters_weight_, l_self_modules_self_attn_modules_linear_q_parameters_bias_);  l_self_modules_self_attn_modules_linear_q_parameters_weight_ = l_self_modules_self_attn_modules_linear_q_parameters_bias_ = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        K: "f16[4624, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(q, l_self_modules_self_attn_modules_linear_k_parameters_weight_, l_self_modules_self_attn_modules_linear_k_parameters_bias_);  q = l_self_modules_self_attn_modules_linear_k_parameters_weight_ = l_self_modules_self_attn_modules_linear_k_parameters_bias_ = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        V: "f16[4624, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(_tgt, l_self_modules_self_attn_modules_linear_v_parameters_weight_, l_self_modules_self_attn_modules_linear_v_parameters_bias_);  _tgt = l_self_modules_self_attn_modules_linear_v_parameters_weight_ = l_self_modules_self_attn_modules_linear_v_parameters_bias_ = None

        # --- scaled dot-product attention, 8 heads of width 32 (5.6568... = sqrt(32)) ---
        # File: /workspace/networks/layers/attention.py:80 in forward, code: Q = Q / self.T
        Q_1: "f16[4624, 1, 256][256, 256, 1]cuda:0" = Q / 5.656854249492381;  Q = None
        # File: /workspace/networks/layers/attention.py:90 in forward, code: Q = Q.view(-1, bs, num_head, self.d_att).permute(1, 2, 0, 3)
        view: "f16[4624, 1, 8, 32][256, 256, 32, 1]cuda:0" = Q_1.view(-1, 1, 8, 32);  Q_1 = None
        Q_2: "f16[1, 8, 4624, 32][256, 32, 256, 1]cuda:0" = view.permute(1, 2, 0, 3);  view = None
        # File: /workspace/networks/layers/attention.py:91 in forward, code: K = K.view(-1, bs, num_head, self.d_att).permute(1, 2, 3, 0)
        view_1: "f16[4624, 1, 8, 32][256, 256, 32, 1]cuda:0" = K.view(-1, 1, 8, 32);  K = None
        K_1: "f16[1, 8, 32, 4624][256, 32, 1, 256]cuda:0" = view_1.permute(1, 2, 3, 0);  view_1 = None
        # File: /workspace/networks/layers/attention.py:92 in forward, code: V = V.view(-1, bs, num_head, hidden_dim).permute(1, 2, 0, 3)
        view_2: "f16[4624, 1, 8, 32][256, 256, 32, 1]cuda:0" = V.view(-1, 1, 8, 32);  V = None
        V_1: "f16[1, 8, 4624, 32][256, 32, 256, 1]cuda:0" = view_2.permute(1, 2, 0, 3);  view_2 = None
        # File: /workspace/networks/layers/attention.py:8 in multiply_by_ychunks, code: return x @ y
        QK: "f16[1, 8, 4624, 4624][171051008, 21381376, 4624, 1]cuda:0" = Q_2 @ K_1;  Q_2 = K_1 = None
        # File: /workspace/networks/layers/attention.py:114 in forward, code: attn = torch.softmax(QK, dim=-1)
        attn: "f32[1, 8, 4624, 4624][171051008, 21381376, 4624, 1]cuda:0" = torch.softmax(QK, dim=-1);  QK = None
        # p=0.0, training=False: this dropout is an identity pass-through in the captured graph.
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/dropout.py:59 in forward, code: return F.dropout(input, self.p, self.training, self.inplace)
        attn_1: "f32[1, 8, 4624, 4624][171051008, 21381376, 4624, 1]cuda:0" = torch.nn.functional.dropout(attn, 0.0, False, False);  attn = None
        # File: /workspace/networks/layers/attention.py:14 in multiply_by_xchunks, code: return x @ y
        matmul_1: "f16[1, 8, 4624, 32][1183744, 147968, 32, 1]cuda:0" = attn_1 @ V_1;  attn_1 = V_1 = None
        # File: /workspace/networks/layers/attention.py:120 in forward, code: self.qk_chunks).permute(2, 0, 1, 3)
        outputs: "f16[4624, 1, 8, 32][32, 1183744, 147968, 1]cuda:0" = matmul_1.permute(2, 0, 1, 3);  matmul_1 = None
        # File: /workspace/networks/layers/attention.py:122 in forward, code: outputs = outputs.reshape(-1, bs, self.d_model)
        outputs_1: "f16[4624, 1, 256][256, 256, 1]cuda:0" = outputs.reshape(-1, 1, 256);  outputs = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        outputs_2: "f16[4624, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_1, l_self_modules_self_attn_modules_projection_parameters_weight_, l_self_modules_self_attn_modules_projection_parameters_bias_);  outputs_1 = l_self_modules_self_attn_modules_projection_parameters_weight_ = l_self_modules_self_attn_modules_projection_parameters_bias_ = None

        # --- residual connection + norm2 ---
        # File: /workspace/networks/layers/transformer.py:772 in forward, code: tgt = tgt + self.droppath(tgt2)
        tgt: "f32[4624, 1, 256][1, 1183744, 4624]cuda:0" = l_tgt_ + outputs_2;  l_tgt_ = outputs_2 = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/normalization.py:201 in forward, code: return F.layer_norm(
        _tgt_1: "f32[4624, 1, 256][256, 256, 1]cuda:0" = torch.nn.functional.layer_norm(tgt, (256,), l_self_modules_norm2_parameters_weight_, l_self_modules_norm2_parameters_bias_, 1e-05);  l_self_modules_norm2_parameters_weight_ = l_self_modules_norm2_parameters_bias_ = None

        # --- long-term attention: joint QV projection, then split into Q and V halves ---
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        curr_QV: "f16[4624, 1, 512][512, 512, 1]cuda:0" = torch._C._nn.linear(_tgt_1, l_self_modules_linear_qv_parameters_weight_, l_self_modules_linear_qv_parameters_bias_);  l_self_modules_linear_qv_parameters_weight_ = l_self_modules_linear_qv_parameters_bias_ = None
        # File: /workspace/networks/layers/transformer.py:778 in forward, code: curr_QV = torch.split(curr_QV, self.d_model, dim=2)
        split = torch.functional.split(curr_QV, 256, dim=2);  curr_QV = None
        curr_Q: "f16[4624, 1, 256][512, 512, 1]cuda:0" = split[0]
        curr_V: "f16[4624, 1, 256][512, 512, 1]cuda:0" = split[1];  split = None

        # --- seq_to_2d: (H*W, N, C) sequence -> (N, C, H, W) feature map, H = W = 68 ---
        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_3: "f16[68, 68, 1, 256][34816, 512, 512, 1]cuda:0" = curr_Q.view(68, 68, 1, 256)
        permute_4: "f16[1, 256, 68, 68][512, 1, 34816, 512]cuda:0" = view_3.permute(2, 3, 0, 1);  view_3 = None
        tensor: "f16[1, 256, 68, 68][1183744, 4624, 68, 1]cuda:0" = permute_4.contiguous();  permute_4 = None

        # --- fuse_key_value_id: ID embedding gates the key and offsets the value ---
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        ID_KV: "f16[4624, 1, 257][257, 257, 1]cuda:0" = torch._C._nn.linear(l_curr_id_emb_, l_self_modules_linear_id_kv_parameters_weight_, l_self_modules_linear_id_kv_parameters_bias_);  l_curr_id_emb_ = l_self_modules_linear_id_kv_parameters_weight_ = l_self_modules_linear_id_kv_parameters_bias_ = None
        # File: /workspace/networks/layers/transformer.py:855 in fuse_key_value_id, code: ID_K, ID_V = torch.split(ID_KV, [self.att_nhead, self.d_model], dim=2)
        split_1 = torch.functional.split(ID_KV, [1, 256], dim=2);  ID_KV = None
        ID_K: "f16[4624, 1, 1][257, 257, 1]cuda:0" = split_1[0]
        ID_V: "f16[4624, 1, 256][257, 257, 1]cuda:0" = split_1[1];  split_1 = None
        # File: /workspace/networks/layers/transformer.py:857 in fuse_key_value_id, code: K = key.view(-1, bs, self.att_nhead, self.d_model //
        view_4: "f16[4624, 1, 1, 256][512, 512, 256, 1]cuda:0" = curr_Q.view(-1, 1, 1, 256)
        # File: /workspace/networks/layers/transformer.py:858 in fuse_key_value_id, code: self.att_nhead) * (1 + torch.tanh(ID_K)).unsqueeze(-1)
        tanh: "f16[4624, 1, 1][1, 1, 1]cuda:0" = torch.tanh(ID_K);  ID_K = None
        add_2: "f16[4624, 1, 1][1, 1, 1]cuda:0" = 1 + tanh;  tanh = None
        unsqueeze: "f16[4624, 1, 1, 1][1, 1, 1, 1]cuda:0" = add_2.unsqueeze(-1);  add_2 = None
        # File: /workspace/networks/layers/transformer.py:857 in fuse_key_value_id, code: K = key.view(-1, bs, self.att_nhead, self.d_model //
        K_2: "f16[4624, 1, 1, 256][256, 256, 256, 1]cuda:0" = view_4 * unsqueeze;  view_4 = unsqueeze = None
        # File: /workspace/networks/layers/transformer.py:859 in fuse_key_value_id, code: K = K.view(-1, bs, self.d_model)
        K_3: "f16[4624, 1, 256][256, 256, 1]cuda:0" = K_2.view(-1, 1, 256);  K_2 = None
        # File: /workspace/networks/layers/transformer.py:860 in fuse_key_value_id, code: V = value + ID_V
        V_2: "f16[4624, 1, 256][256, 256, 1]cuda:0" = curr_V + ID_V;  ID_V = None

        # --- seq_to_2d for the fused key and value maps ---
        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_6: "f16[68, 68, 1, 256][17408, 256, 256, 1]cuda:0" = K_3.view(68, 68, 1, 256)
        permute_5: "f16[1, 256, 68, 68][256, 1, 17408, 256]cuda:0" = view_6.permute(2, 3, 0, 1);  view_6 = None
        tensor_1: "f16[1, 256, 68, 68][1183744, 4624, 68, 1]cuda:0" = permute_5.contiguous();  permute_5 = None
        # File: /workspace/networks/layers/basic.py:93 in seq_to_2d, code: tensor = tensor.view(h, w, n, c).permute(2, 3, 0, 1).contiguous()
        view_7: "f16[68, 68, 1, 256][17408, 256, 256, 1]cuda:0" = V_2.view(68, 68, 1, 256)
        permute_6: "f16[1, 256, 68, 68][256, 1, 17408, 256]cuda:0" = view_7.permute(2, 3, 0, 1);  view_7 = None
        tensor_2: "f16[1, 256, 68, 68][1183744, 4624, 68, 1]cuda:0" = permute_6.contiguous();  permute_6 = None

        # --- long-term attention proper: single head of width 256 (16.0 = sqrt(256)) ---
        # File: /workspace/networks/layers/attention.py:80 in forward, code: Q = Q / self.T
        Q_3: "f16[4624, 1, 256][256, 256, 1]cuda:0" = curr_Q / 16.0
        # File: /workspace/networks/layers/attention.py:90 in forward, code: Q = Q.view(-1, bs, num_head, self.d_att).permute(1, 2, 0, 3)
        view_8: "f16[4624, 1, 1, 256][256, 256, 256, 1]cuda:0" = Q_3.view(-1, 1, 1, 256);  Q_3 = None
        Q_4: "f16[1, 1, 4624, 256][256, 256, 256, 1]cuda:0" = view_8.permute(1, 2, 0, 3);  view_8 = None
        # File: /workspace/networks/layers/attention.py:91 in forward, code: K = K.view(-1, bs, num_head, self.d_att).permute(1, 2, 3, 0)
        view_9: "f16[4624, 1, 1, 256][256, 256, 256, 1]cuda:0" = K_3.view(-1, 1, 1, 256)
        K_4: "f16[1, 1, 256, 4624][256, 256, 1, 256]cuda:0" = view_9.permute(1, 2, 3, 0);  view_9 = None
        # File: /workspace/networks/layers/attention.py:92 in forward, code: V = V.view(-1, bs, num_head, hidden_dim).permute(1, 2, 0, 3)
        view_10: "f16[4624, 1, 1, 256][256, 256, 256, 1]cuda:0" = V_2.view(-1, 1, 1, 256)
        V_3: "f16[1, 1, 4624, 256][256, 256, 256, 1]cuda:0" = view_10.permute(1, 2, 0, 3);  view_10 = None
        # File: /workspace/networks/layers/attention.py:8 in multiply_by_ychunks, code: return x @ y
        QK_1: "f16[1, 1, 4624, 4624][21381376, 21381376, 4624, 1]cuda:0" = Q_4 @ K_4;  Q_4 = K_4 = None
        # File: /workspace/networks/layers/attention.py:114 in forward, code: attn = torch.softmax(QK, dim=-1)
        attn_2: "f32[1, 1, 4624, 4624][21381376, 21381376, 4624, 1]cuda:0" = torch.softmax(QK_1, dim=-1);  QK_1 = None
        # p=0.0, training=False: identity pass-through, kept for graph fidelity.
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/dropout.py:59 in forward, code: return F.dropout(input, self.p, self.training, self.inplace)
        attn_3: "f32[1, 1, 4624, 4624][21381376, 21381376, 4624, 1]cuda:0" = torch.nn.functional.dropout(attn_2, 0.0, False, False);  attn_2 = None
        # File: /workspace/networks/layers/attention.py:14 in multiply_by_xchunks, code: return x @ y
        matmul_3: "f16[1, 1, 4624, 256][1183744, 1183744, 256, 1]cuda:0" = attn_3 @ V_3;  attn_3 = V_3 = None
        # File: /workspace/networks/layers/attention.py:120 in forward, code: self.qk_chunks).permute(2, 0, 1, 3)
        outputs_3: "f16[4624, 1, 1, 256][256, 1183744, 1183744, 1]cuda:0" = matmul_3.permute(2, 0, 1, 3);  matmul_3 = None
        # File: /workspace/networks/layers/attention.py:122 in forward, code: outputs = outputs.reshape(-1, bs, self.d_model)
        outputs_4: "f16[4624, 1, 256][256, 1183744, 1]cuda:0" = outputs_3.reshape(-1, 1, 256);  outputs_3 = None
        # File: /opt/conda/lib/python3.11/site-packages/torch/nn/modules/linear.py:116 in forward, code: return F.linear(input, self.weight, self.bias)
        outputs_5: "f16[4624, 1, 256][256, 256, 1]cuda:0" = torch._C._nn.linear(outputs_4, l_self_modules_long_term_attn_modules_projection_parameters_weight_, l_self_modules_long_term_attn_modules_projection_parameters_bias_);  outputs_4 = l_self_modules_long_term_attn_modules_projection_parameters_weight_ = l_self_modules_long_term_attn_modules_projection_parameters_bias_ = None

        return (tensor, tensor_1, tensor_2, tgt, _tgt_1, outputs_5, curr_Q, curr_V, K_3, V_2)