diff --git a/.gitignore b/.gitignore
index 53db6cc..f0a9653 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,4 +158,13 @@ src/analysis/orca/tmp_XMYAR426.txt
 archive.zip
 logs/
 generated/
-data/processed/
\ No newline at end of file
+data/processed/
+*.pdf
+*.zip
+*.pth
+*.bck
+*.pt
+cifardata/
+*.meta.json
+*.joblib
+*.gz
\ No newline at end of file
diff --git a/configs/config.yaml b/configs/config.yaml
index 881f765..2a4da43 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -2,7 +2,7 @@ general:
   name: 'graph_dit'
   wandb: 'disabled'
   gpus: 1
-  gpu_number: 3
+  gpu_number: 2
   resume: null
   test_only: null
   sample_every_val: 2500
@@ -31,7 +31,8 @@ model:
   lambda_train: [1, 10] # node and edge training weight
   ensure_connected: True
 train:
-  n_epochs: 5000
+  # n_epochs: 5000
+  n_epochs: 10
   batch_size: 1200
   lr: 0.0002
   clip_grad: null
diff --git a/graph_dit/diffusion_model.py b/graph_dit/diffusion_model.py
index d286c71..f46871d 100644
--- a/graph_dit/diffusion_model.py
+++ b/graph_dit/diffusion_model.py
@@ -220,7 +220,7 @@ class Graph_DiT(pl.LightningModule):
         # self.sampling_metrics.reset()
         self.val_y_collection = []

-    @torch.no_grad()
+    # @torch.no_grad()
     def validation_step(self, data, i):
         data_x = F.one_hot(data.x, num_classes=8).float()[:, self.active_index]
         data_edge_attr = F.one_hot(data.edge_attr, num_classes=2).float()
@@ -313,7 +313,7 @@ class Graph_DiT(pl.LightningModule):
         self.test_E_logp.reset()
         self.test_y_collection = []

-    @torch.no_grad()
+    # @torch.no_grad()
     def test_step(self, data, i):
         data_x = F.one_hot(data.x, num_classes=8).float()[:, self.active_index]
         data_edge_attr = F.one_hot(data.edge_attr, num_classes=2).float()
@@ -573,7 +573,7 @@ class Graph_DiT(pl.LightningModule):

         return nll

-    @torch.no_grad()
+    # @torch.no_grad()
     def sample_batch(self, batch_id, batch_size, y, keep_chain, number_chain_steps, save_final, num_nodes=None):
         """
         :param batch_id: int
@@ -742,19 +742,24 @@ class Graph_DiT(pl.LightningModule):
                 if valid_rlt[i]:
                     nodes = [num_to_op[j] for j in x_list[i].cpu().numpy()]
                     # edges = e_list[i].cpu().numpy()
-                    score.append(get_nasbench201_nodes_score(nodes,train_loader=self.train_loader,searchspace=self.searchspace,device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu") , args=self.args))
+                    score.append(get_nasbench201_nodes_score(nodes,train_loader=self.train_loader,searchspace=self.searchspace,device=sampled_s.X.device , args=self.args))
                 else:
                     score.append(-1)
-            return torch.tensor(score, dtype=torch.float32, requires_grad=True).to(x_list[0].device)
+            # return torch.tensor(score, dtype=torch.float32, requires_grad=True).to(x_list[0].device)
+            target_score = torch.ones(100, dtype=torch.float32, device=sampled_s.X.device, requires_grad=True) * 2000.0
+            # target_score_list = [2000 for i in range(100)]
+            # return torch.tensor(score, device=sampled_s.X.device ,dtype=torch.float32, requires_grad=True), torch.tensor(target_score_list, device=sampled_s.X.device, dtype=torch.float32, requires_grad=True)
+            return torch.tensor(score, device=sampled_s.X.device ,dtype=torch.float32, requires_grad=True), target_score

         sample_num = 10
         best_arch = None
         best_score_int = -1e8
         score = torch.ones(100, dtype=torch.float32, requires_grad=True) * -1e8
+        print(f'score.requires_grad: {score.requires_grad}')
         for i in range(sample_num):
             sampled_s = diffusion_utils.sample_discrete_features(prob_X, prob_E, node_mask=node_mask, step=s[0,0].item())
-            score = get_score(sampled_s)
+            score, target_score = get_score(sampled_s)
             print(f'score: {score}')
             print(f'score.shape: {score.shape}')
             print(f'torch.sum(score): {torch.sum(score)}')
@@ -779,14 +784,19 @@ class Graph_DiT(pl.LightningModule):
         print(f'X_s: {X_s}, E_s: {E_s}')

         # NASWOT score
-        target_score = torch.ones(100, requires_grad=True) * 2000.0
-        target_score = target_score.to(X_s.device)
+        # target_score = torch.ones(100, requires_grad=True, device=X_s.device) * 2000.0
+        # target_score = torch.ones(100, requires_grad=True) * 2000.0
+        print(f'best_score: {best_score.shape}, target_score: {target_score.shape}')
+        print(f'best_score.requires_grad: {best_score.requires_grad}, target_score.requires_grad: {target_score.requires_grad}')
+        print(f'best_score.device: {best_score.device}, target_score.device: {target_score.device}')
+        # target_score = target_score.to(X_s.device)
+        # print(f'best_score: {best_score.shape}, target_score: {target_score.shape}')
+        # print(f'best_score.requires_grad: {best_score.requires_grad}, target_score.requires_grad: {target_score.requires_grad}')

         # compute loss mse(cur_score - target_score)
         mse_loss = torch.nn.MSELoss()
-        print(f'best_score: {best_score.shape}, target_score: {target_score.shape}')
-        print(f'best_score.requires_grad: {best_score.requires_grad}, target_score.requires_grad: {target_score.requires_grad}')
-        loss = mse_loss(best_score, target_score)
+        loss = mse_loss(target_score, best_score)
+        print(f'loss: {loss.requires_grad}')

         loss.backward(retain_graph=True)
         # loss backward = gradient
@@ -798,8 +808,8 @@ class Graph_DiT(pl.LightningModule):

         beta_ratio = 0.5
         # x_current = pred.X - beta_ratio * x_grad
         # e_current = pred.E - beta_ratio * e_grad
-        E_s = pred.X - beta_ratio * x_grad
-        X_s = pred.E - beta_ratio * e_grad
+        X_s = pred.X - beta_ratio * x_grad
+        E_s = pred.E - beta_ratio * e_grad

         # update prob.X prob_E with using gradient
diff --git a/graph_dit/models/transformer.py b/graph_dit/models/transformer.py
index 15d77e9..e9b8bfa 100644
--- a/graph_dit/models/transformer.py
+++ b/graph_dit/models/transformer.py
@@ -86,7 +86,7 @@ class Denoiser(nn.Module):
     """

     def forward(self, x, e, node_mask, y, t, unconditioned):
-        print("Denoiser Forward")
+        # print("Denoiser Forward")
         # print(x.shape, e.shape, y.shape, t.shape, unconditioned)
         force_drop_id = torch.zeros_like(y.sum(-1))
         # drop the nan values
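
The diffusion_model.py hunks above implement a gradient-guidance step during sampling: @torch.no_grad() is removed so autograd can track the sampling path, an MSE loss is taken between the sampled architectures' scores and a fixed target of 2000.0, and the denoiser predictions are then nudged by X_s = pred.X - beta_ratio * x_grad. Below is a minimal, self-contained sketch of that mechanism under stated assumptions, not the repository's code: pred_X and pred_E stand in for the denoiser output, and surrogate_score is a hypothetical differentiable placeholder for get_nasbench201_nodes_score.

import torch

batch_size, n_nodes, n_node_types, n_edge_types = 100, 8, 8, 2

# Stand-ins for the denoiser output pred.X / pred.E: leaf tensors with
# requires_grad=True so that loss.backward() populates their .grad fields.
pred_X = torch.randn(batch_size, n_nodes, n_node_types, requires_grad=True)
pred_E = torch.randn(batch_size, n_nodes, n_nodes, n_edge_types, requires_grad=True)

def surrogate_score(x, e):
    # Hypothetical differentiable stand-in for the NAS score; guidance only
    # steers the predictions if the score is a differentiable function of them.
    return x.mean(dim=(1, 2)) + e.mean(dim=(1, 2, 3))

score = surrogate_score(pred_X, pred_E)        # shape: (batch_size,)
target_score = torch.full_like(score, 2000.0)  # fixed target, as in the diff

loss = torch.nn.MSELoss()(score, target_score)
loss.backward()                                # fills pred_X.grad and pred_E.grad

beta_ratio = 0.5                               # step size, as in the diff
with torch.no_grad():
    X_s = pred_X - beta_ratio * pred_X.grad    # guided node predictions
    E_s = pred_E - beta_ratio * pred_E.grad    # guided edge predictions

Because the loss gradient points away from the target, subtracting beta_ratio times the gradient moves the predictions toward higher-scoring architectures; the corrected assignments in the diff (X_s from pred.X, E_s from pred.E) apply the same sign convention.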
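
One autograd detail worth noting, since the diff adds several requires_grad prints around the score tensors: torch.tensor(score, requires_grad=True), as used in get_score, builds a fresh leaf tensor from a Python list, so its autograd history starts at that point and gradients from the MSE loss cannot flow further back. A small demonstration with made-up values:

import torch

raw_scores = [1.0, -1.0, 3.5]  # e.g. per-architecture scores collected in a list
score = torch.tensor(raw_scores, requires_grad=True)  # fresh leaf, no upstream graph

loss = torch.nn.MSELoss()(score, torch.full_like(score, 2000.0))
loss.backward()

print(score.grad is not None)  # True: the leaf itself receives a gradient
print(score.grad_fn)           # None: nothing upstream of the leaf to update

In this setup loss.requires_grad reports True (hence the print the diff adds), but the gradient stops at the list-built tensor rather than reaching the model's predictions.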