some onehot issue

update a small problem
update EdgeMetricsCE class
2024-06-30 21:09:16 +02:00 · 2024-06-30 19:41:31 +02:00 · 2024-06-30 17:37:18 +02:00 · 2024-06-30 16:43:08 +02:00 · 2024-06-30 16:39:42 +02:00 · 2024-06-29 17:16:08 +02:00
8 changed files with 635 additions and 133 deletions
--- a/graph_dit/datasets/abstract_dataset.py
+++ b/graph_dit/datasets/abstract_dataset.py
@@ -116,7 +116,7 @@ class AbstractDatasetInfos:
    def compute_input_output_dims(self, datamodule):
        example_batch = datamodule.example_batch()
        example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=118).float()[:, self.active_index]
-        example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=10).float()
+        example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=2).float()

        self.input_dims = {'X': example_batch_x.size(1), 
                           'E': example_batch_edge_attr.size(1), 
@@ -127,4 +127,19 @@ class AbstractDatasetInfos:
        print('input dims')
        print(self.input_dims)
        print('output dims')
+        print(self.output_dims)
+    def compute_graph_input_output_dims(self, datamodule):
+        """Set identical ``input_dims``/``output_dims`` from one example batch and print them."""
+        batch = datamodule.example_batch()
+        node_onehot = torch.nn.functional.one_hot(batch.x, num_classes=8).float()[:, self.active_index]
+        edge_onehot = torch.nn.functional.one_hot(batch.edge_attr, num_classes=2).float()
+
+        dims = {'X': node_onehot.size(1),
+                'E': edge_onehot.size(1),
+                'y': batch['y'].size(1)}
+        self.input_dims = dict(dims)
+        self.output_dims = dict(dims)
+        print('input dims')
+        print(self.input_dims)
+        print('output dims')
         print(self.output_dims)
--- a/graph_dit/datasets/dataset.py
+++ b/graph_dit/datasets/dataset.py
@@ -39,6 +39,16 @@ op_to_atom = {
    'none': 'S',           # Sulfur for no operation
    'output': 'He'         # Helium for output
 }
+
+op_type = {  # integer node label for each operation name
+    'nor_conv_1x1': 1,
+    'nor_conv_3x3': 2,
+    'avg_pool_3x3': 3,
+    'skip_connect': 4,
+    'output': 5,
+    'none': 6,
+    'input': 7
+}  # ids start at 1; new_graphs_to_json puts the '*' placeholder at index 0
 class DataModule(AbstractDataModule):
    def __init__(self, cfg):
        self.datadir = cfg.dataset.datadir
@@ -50,12 +60,12 @@ class DataModule(AbstractDataModule):

    def prepare_data(self) -> None:
        target = getattr(self.cfg.dataset, 'guidance_target', None)
-        print("target", target)
+        print("target", target) # nasbench-201
        # try:
        #     base_path = pathlib.Path(os.path.realpath(__file__)).parents[2]
        # except NameError:
        # base_path = pathlib.Path(os.getcwd()).parent[2]
-        base_path = '/home/stud/hanzhang/Graph-Dit'
+        base_path = '/home/stud/hanzhang/nasbenchDiT'
        root_path = os.path.join(base_path, self.datadir)
        self.root_path = root_path

@@ -68,13 +78,16 @@ class DataModule(AbstractDataModule):
        # Dataset has target property, root path, and transform
        source = './NAS-Bench-201-v1_1-096897.pth'
        dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None)
+        self.dataset = dataset
+        # self.api = dataset.api

        # if len(self.task.split('-')) == 2:
        #     train_index, val_index, test_index, unlabeled_index = self.fixed_split(dataset)
        # else:
        train_index, val_index, test_index, unlabeled_index = self.random_data_split(dataset)

-        self.train_index, self.val_index, self.test_index, self.unlabeled_index = train_index, val_index, test_index, unlabeled_index
+        self.train_index, self.val_index, self.test_index, self.unlabeled_index = (
+            train_index, val_index, test_index, unlabeled_index)
        train_index, val_index, test_index, unlabeled_index = torch.LongTensor(train_index), torch.LongTensor(val_index), torch.LongTensor(test_index), torch.LongTensor(unlabeled_index)
        if len(unlabeled_index) > 0:
            train_index = torch.cat([train_index, unlabeled_index], dim=0)
@@ -175,6 +188,27 @@ class DataModule(AbstractDataModule):
        smiles = Chem.MolToSmiles(mol)
        return smiles

+    def get_train_graphs(self):
+        """Return the train and test splits as two lists of graphs.
+
+        Both ``self.train_dataset`` and ``self.test_dataset`` are iterated in full.
+        """
+        train_graphs = list(self.train_dataset)
+        test_graphs = list(self.test_dataset)
+        return train_graphs, test_graphs
+
+
+    # def get_train_smiles(self):
+    #     filename = f'{self.task}.csv.gz'
+    #     df = pd.read_csv(f'{self.root_path}/raw/{filename}')
+    #     df_test = df.iloc[self.test_index]
+    #     df = df.iloc[self.train_index]
+    #     smiles_list = df['smiles'].tolist()
+    #     smiles_list_test = df_test['smiles'].tolist()
+    #     smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list]
+    #     smiles_list_test = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list_test]
+    #     return smiles_list, smiles_list_test
+
    def get_train_smiles(self):
        train_smiles = []   
        test_smiles = []
@@ -319,6 +353,121 @@ class DataModule_original(AbstractDataModule):
    def test_dataloader(self):
        return self.test_loader

+def new_graphs_to_json(
+        graphs, filename,
+        output_path='/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-meta.json'):
+    """Compute dataset statistics for op-labelled graphs and dump them as JSON.
+
+    Args:
+        graphs: non-empty sequence of ``(adj, ops)`` pairs. ``adj[i][j] != 0``
+            marks an edge between nodes ``i`` and ``j``; ``ops[i]`` is the
+            integer ``op_type`` id of node ``i``.
+        filename: label that is only printed in the progress message.
+        output_path: destination of the JSON file; the default is the
+            project's fixed meta-file location.
+
+    Returns:
+        The dict that was written: node/edge type distributions, node-count
+        histogram, active node names and the normalised ``transition_E``.
+
+    Raises:
+        ValueError: if ``graphs`` is empty (no statistics can be derived).
+    """
+    num_graph = len(graphs)
+    if num_graph == 0:
+        raise ValueError('new_graphs_to_json requires at least one graph')
+
+    source_name = "nasbench-201"
+    # Index 0 is the '*' placeholder; the op_type names follow in dict order.
+    node_name_list = ['*', *op_type]
+    num_type = len(node_name_list)
+    node_count_list = np.zeros(num_type)
+    edge_count_list = np.zeros(2)  # [non-edge, edge]
+    transition_E = np.zeros((num_type, num_type, 2))
+
+    n_node_list = []
+    n_edge_list = []
+
+    for graph in graphs:
+        adj, ops = graph[0], graph[1]
+        n_node = len(ops)
+
+        # Per-graph and global node-type counts.
+        cur_node_count_arr = np.zeros(num_type)
+        for node in ops:
+            node_count_list[node] += 1
+            cur_node_count_arr[node] += 1
+
+        # Every non-zero off-diagonal entry is one edge; it is booked in both
+        # directions (weight 2 each) so the type-pair table stays symmetric.
+        n_edge = 0
+        edge_pairs = np.zeros((num_type, num_type))
+        for i in range(n_node):
+            for j in range(n_node):
+                if i == j or adj[i][j] == 0:
+                    continue
+                n_edge += 1
+                start_index, end_index = ops[i], ops[j]
+                edge_pairs[start_index, end_index] += 2
+                edge_pairs[end_index, start_index] += 2
+
+        n_node_list.append(n_node)
+        n_edge_list.append(n_edge)
+        edge_count_list[1] += 2 * n_edge
+        edge_count_list[0] += n_node * (n_node - 1) - 2 * n_edge
+
+        # Ordered node pairs per type pair (self-pairs excluded), doubled to
+        # match the edge weighting; what is not an edge counts as "no edge".
+        cur_tot_edge = 2 * (
+            np.outer(cur_node_count_arr, cur_node_count_arr)
+            - np.diag(cur_node_count_arr)
+        )
+        transition_E[:, :, 1] += edge_pairs
+        transition_E[:, :, 0] += cur_tot_edge - edge_pairs
+
+    # Histogram over node counts: minlength keeps at least num_graph bins and
+    # bincount adds more whenever a graph has that many nodes or more.
+    n_nodes_per_graph = np.bincount(n_node_list, minlength=num_graph)
+    n_nodes_per_graph = (n_nodes_per_graph / n_nodes_per_graph.sum()).tolist()[:51]
+
+    node_count_list = node_count_list / np.sum(node_count_list)
+    print('processed meta info: ------', filename, '------')
+    print('len node_count_list', len(node_count_list))
+    print('len node_name_list', len(node_name_list))
+    active_nodes = np.array(node_name_list)[node_count_list > 0].tolist()
+    node_count_list = node_count_list.tolist()
+    # Each node contributes one unit, so this equals the node-type distribution.
+    valencies = list(node_count_list)
+
+    edge_count_list = (edge_count_list / np.sum(edge_count_list)).tolist()
+
+    # Type pairs that never occur get all mass on "no edge" so the division
+    # below is well defined.
+    transition_E[np.sum(transition_E, axis=-1) == 0, 0] = 1
+    transition_E = transition_E / np.sum(transition_E, axis=-1, keepdims=True)
+
+    meta_dict = {
+        'source': source_name,
+        'num_graph': num_graph,
+        'n_nodes_per_graph': n_nodes_per_graph,
+        'max_n_nodes': max(n_node_list),
+        'max_n_edges': max(n_edge_list),
+        'node_type_list': node_count_list,
+        'edge_type_list': edge_count_list,
+        'valencies': valencies,
+        'active_nodes': active_nodes,
+        'num_active_nodes': len(active_nodes),
+        'transition_E': transition_E.tolist(),
+    }
+
+    with open(output_path, 'w') as f:
+        json.dump(meta_dict, f)
+
+    return meta_dict
+
+
+
+
 def graphs_to_json(graphs, filename):
    bonds = {
        'nor_conv_1x1': 1,
@@ -466,7 +615,7 @@ def graphs_to_json(graphs, filename):
        'atom_type_dist': atom_count_list,
        'bond_type_dist': bond_count_list,
        'valencies': valencies,
-        'active_atoms': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0],
+        'active_nodes': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0],
        'num_atom_type': len([atom_name_list[i] for i in range(118) if atom_count_list[i] > 0]),
        'transition_E': transition_E.tolist(),
    }
@@ -477,14 +626,17 @@ def graphs_to_json(graphs, filename):
 class Dataset(InMemoryDataset):
    def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None):
        self.target_prop = target_prop
-        source = '/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
+        source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
        self.source = source
-        self.api = API(source)  # Initialize NAS-Bench-201 API
-        print('API loaded')
+        # self.api = API(source)  # Initialize NAS-Bench-201 API
+        # print('API loaded')
        super().__init__(root, transform, pre_transform, pre_filter)
-        print('Dataset initialized')
-        print(self.processed_paths[0])
+        print(self.processed_paths[0]) #/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth.pt
        self.data, self.slices = torch.load(self.processed_paths[0])
+        print('Dataset initialized')
+        self.data.edge_attr = self.data.edge_attr.squeeze()
+        self.data.idx = torch.arange(len(self.data.y))
+        print(f"self.data={self.data}, self.slices={self.slices}")

    @property
    def raw_file_names(self):
@@ -495,82 +647,172 @@ class Dataset(InMemoryDataset):
        return [f'{self.source}.pt']

    def process(self):
-        def parse_architecture_string(arch_str):
-            stages = arch_str.split('+')
-            nodes = ['input']
-            edges = []
-            
-            for stage in stages:
-                operations = stage.strip('|').split('|')
-                for op in operations:
-                    operation, idx = op.split('~')
-                    idx = int(idx)
-                    edges.append((idx, len(nodes)))  # Add edge from idx to the new node
-                    nodes.append(operation)
-            nodes.append('output')  # Add the output node
-            return nodes, edges
+        source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
+        self.api = API(source)

-        def create_graph(nodes, edges):
-            G = nx.DiGraph()
-            for i, node in enumerate(nodes):
-                G.add_node(i, label=node)
-            G.add_edges_from(edges)
-            return G
-
-        def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
-            nodes, edges = parse_architecture_string(arch_str)
-
-            node_labels = [bonds[node] for node in nodes]  # Replace with appropriate encoding if necessary
-            assert 0 not in node_labels, f'Invalid node label: {node_labels}'
-            x = torch.LongTensor(node_labels)
-            print(f'in initialize Dataset, arch_to_Graph x={x}')
-
-            edges_list = [(start, end) for start, end in edges]
-            edge_type = [bonds[nodes[end]] for start, end in edges]  # Example: using end node type as edge type
-            edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
-            edge_type = torch.tensor(edge_type, dtype=torch.long)
-            edge_attr = edge_type.view(-1, 1)
-
-            if target3 is not None:
-                y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
-            elif target2 is not None:
-                y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
-            else:
-                y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
-
-            print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
-            data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
-            return data, nodes
-
-        bonds = {
-            'nor_conv_1x1': 1,
-            'nor_conv_3x3': 2,
-            'avg_pool_3x3': 3,
-            'skip_connect': 4,
-            'output': 5,
-            'none': 6,
-            'input': 7
-        }
-
-        # Prepare to process NAS-Bench-201 data
        data_list = []
-        len_data = len(self.api)  # Number of architectures
-        with tqdm(total=len_data) as pbar:
-            for arch_index in range(len_data):
-                arch_info = self.api.query_meta_info_by_index(arch_index)
-                arch_str = arch_info.arch_str
-                sa = np.random.rand()  # Placeholder for synthetic accessibility
-                sc = np.random.rand()  # Placeholder for substructure count
-                target = np.random.rand()  # Placeholder for target value
-                target2 = np.random.rand()  # Placeholder for second target value
-                target3 = np.random.rand()  # Placeholder for third target value
+        len_data = len(self.api)

-                data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3)
+        def graph_to_graph_data(graph):  # build one ``Data`` object from an (adj, ops) pair
+            ops = graph[1]
+            adj = graph[0]
+            nodes = []
+            for op in ops:
+                nodes.append(op_type[op])  # operation name -> integer node label
+            x = torch.LongTensor(nodes)
+
+            edges_list = []
+            edge_type = []
+            for start in range(len(ops)):
+                for end in range(len(ops)):
+                    if adj[start][end] == 1:
+                        edges_list.append((start, end))  # each adjacency entry is stored in both directions
+                        edge_type.append(1)
+                        edges_list.append((end, start))
+                        edge_type.append(1)
+            
+            edge_index = torch.tensor(edges_list, dtype=torch.long).t()
+            edge_type = torch.tensor(edge_type, dtype=torch.long)
+            edge_attr = edge_type
+            y = torch.tensor([0], dtype=torch.float).view(1, -1)  # target is a constant 0 of shape (1, 1)
+            data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, idx=i)  # NOTE(review): `i` is not a parameter; it resolves to the loop variable of the enclosing process() at call time
+            return data
+        graph_list = []
+
+        with tqdm(total = len_data) as pbar:
+            active_nodes = set()
+            for i in range(len_data):
+                arch_info = self.api.query_meta_info_by_index(i)
+                results = self.api.query_by_index(i, 'cifar100')
+                nodes, edges = parse_architecture_string(arch_info.arch_str)
+                adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
+                for op in ops:
+                    if op not in active_nodes:
+                        active_nodes.add(op)
+                
+                graph_list.append({
+                    "adj_matrix": adj_matrix,
+                    "ops": ops,
+                    "idx": i,
+                    "train": [{
+                        "iepoch": result.get_train()['iepoch'],
+                        "loss": result.get_train()['loss'],
+                        "accuracy": result.get_train()['accuracy'],
+                        "cur_time": result.get_train()['cur_time'],
+                        "all_time": result.get_train()['all_time'],
+                        "seed": seed,
+                    }for seed, result in results.items()],
+                    "valid": [{
+                        "iepoch": result.get_eval('x-valid')['iepoch'],
+                        "loss": result.get_eval('x-valid')['loss'],
+                        "accuracy": result.get_eval('x-valid')['accuracy'],
+                        "cur_time": result.get_eval('x-valid')['cur_time'],
+                        "all_time": result.get_eval('x-valid')['all_time'],
+                        "seed": seed,
+                    }for seed, result in results.items()],
+                    "test": [{
+                        "iepoch": result.get_eval('x-test')['iepoch'],
+                        "loss": result.get_eval('x-test')['loss'],
+                        "accuracy": result.get_eval('x-test')['accuracy'],
+                        "cur_time": result.get_eval('x-test')['cur_time'],
+                        "all_time": result.get_eval('x-test')['all_time'],
+                        "seed": seed,
+                    }for seed, result in results.items()]
+                })
+                data = graph_to_graph_data((adj_matrix, ops)) 
                data_list.append(data)
                pbar.update(1)
-
+        
+        for graph in graph_list:
+            adj_matrix = graph['adj_matrix']
+            if isinstance(adj_matrix, np.ndarray):
+                adj_matrix = adj_matrix.tolist()
+                graph['adj_matrix'] = adj_matrix
+            ops = graph['ops']
+            if isinstance(ops, np.ndarray):
+                ops = ops.tolist()
+                graph['ops'] = ops
+        with open(f'nasbench-201-graph.json', 'w') as f:
+            json.dump(graph_list, f)
+            
        torch.save(self.collate(data_list), self.processed_paths[0])

+        # def parse_architecture_string(arch_str):
+        #     stages = arch_str.split('+')
+        #     nodes = ['input']
+        #     edges = []
+            
+        #     for stage in stages:
+        #         operations = stage.strip('|').split('|')
+        #         for op in operations:
+        #             operation, idx = op.split('~')
+        #             idx = int(idx)
+        #             edges.append((idx, len(nodes)))  # Add edge from idx to the new node
+        #             nodes.append(operation)
+        #     nodes.append('output')  # Add the output node
+        #     return nodes, edges
+
+        # def create_graph(nodes, edges):
+        #     G = nx.DiGraph()
+        #     for i, node in enumerate(nodes):
+        #         G.add_node(i, label=node)
+        #     G.add_edges_from(edges)
+        #     return G
+
+        # def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
+        #     nodes, edges = parse_architecture_string(arch_str)
+
+        #     node_labels = [bonds[node] for node in nodes]  # Replace with appropriate encoding if necessary
+        #     assert 0 not in node_labels, f'Invalid node label: {node_labels}'
+        #     x = torch.LongTensor(node_labels)
+        #     print(f'in initialize Dataset, arch_to_Graph x={x}')
+
+        #     edges_list = [(start, end) for start, end in edges]
+        #     edge_type = [bonds[nodes[end]] for start, end in edges]  # Example: using end node type as edge type
+        #     edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
+        #     edge_type = torch.tensor(edge_type, dtype=torch.long)
+        #     edge_attr = edge_type.view(-1, 1)
+
+        #     if target3 is not None:
+        #         y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
+        #     elif target2 is not None:
+        #         y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
+        #     else:
+        #         y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
+
+        #     print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
+        #     data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
+        #     return data, nodes
+
+        # bonds = {
+        #     'nor_conv_1x1': 1,
+        #     'nor_conv_3x3': 2,
+        #     'avg_pool_3x3': 3,
+        #     'skip_connect': 4,
+        #     'output': 5,
+        #     'none': 6,
+        #     'input': 7
+        # }
+
+        # # Prepare to process NAS-Bench-201 data
+        # data_list = []
+        # len_data = len(self.api)  # Number of architectures
+        # with tqdm(total=len_data) as pbar:
+        #     for arch_index in range(len_data):
+        #         arch_info = self.api.query_meta_info_by_index(arch_index)
+        #         arch_str = arch_info.arch_str
+        #         sa = np.random.rand()  # Placeholder for synthetic accessibility
+        #         sc = np.random.rand()  # Placeholder for substructure count
+        #         target = np.random.rand()  # Placeholder for target value
+        #         target2 = np.random.rand()  # Placeholder for second target value
+        #         target3 = np.random.rand()  # Placeholder for third target value
+
+        #         data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3)
+        #         data_list.append(data)
+        #         pbar.update(1)
+
+        # torch.save(self.collate(data_list), self.processed_paths[0])
+
 class Dataset_origin(InMemoryDataset):
    def __init__(self, source, root, target_prop=None,
                 transform=None, pre_transform=None, pre_filter=None):
@@ -676,7 +918,7 @@ def create_adj_matrix_and_ops(nodes, edges):
        adj_matrix[src][dst] = 1
    return adj_matrix, nodes
 class DataInfos(AbstractDatasetInfos):
-    def __init__(self, datamodule, cfg):
+    def __init__(self, datamodule, cfg, dataset):
        tasktype_dict = {
            'hiv_b': 'classification',
            'bace_b': 'classification',
@@ -689,6 +931,7 @@ class DataInfos(AbstractDatasetInfos):
        self.task = task_name
        self.task_type = tasktype_dict.get(task_name, "regression")
        self.ensure_connected = cfg.model.ensure_connected
+        # self.api = dataset.api

        datadir = cfg.dataset.datadir

@@ -699,36 +942,55 @@ class DataInfos(AbstractDatasetInfos):
        length = 15625
        ops_type = {}
        len_ops = set()
-        api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth')
-        for i in range(length):
-            arch_info = api.query_meta_info_by_index(i)
-            nodes, edges = parse_architecture_string(arch_info.arch_str)
-            adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)    
-            if i < 5:
-                print("Adjacency Matrix:")
-                print(adj_matrix)
-                print("Operations List:")
-                print(ops)
-            for op in ops:
-                if op not in ops_type:
-                    ops_type[op] = len(ops_type)
-            len_ops.add(len(ops))
-            graphs.append((adj_matrix, ops))
+        # api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth')

-        meta_dict = graphs_to_json(graphs, 'nasbench-201')

+        def read_adj_ops_from_json(filename):
+            """Load (adjacency matrix, op-id list) pairs from a graph JSON file.
+
+            Each record's 'adj_matrix' becomes a NumPy array and each name in
+            'ops' is translated to its integer id through ``op_type``.
+            """
+            with open(filename, 'r') as handle:
+                records = json.load(handle)
+            return [
+                (np.array(rec['adj_matrix']), [op_type[name] for name in rec['ops']])
+                for rec in records
+            ]
+        # for i in range(length):
+        #     arch_info = self.api.query_meta_info_by_index(i)
+        #     nodes, edges = parse_architecture_string(arch_info.arch_str)
+        #     adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)    
+            # if i < 5:
+            #     print("Adjacency Matrix:")
+            #     print(adj_matrix)
+            #     print("Operations List:")
+            #     print(ops)
+            # for op in ops:
+            #     if op not in ops_type:
+            #         ops_type[op] = len(ops_type)
+            # len_ops.add(len(ops))
+            # graphs.append((adj_matrix, ops))
+        graphs = read_adj_ops_from_json(f'/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json')
+
+        # check first five graphs
+        for i in range(5):
+            print(f'graph {i} : {graphs[i]}')
+        print(f'ops_type: {ops_type}')
+
+        meta_dict = new_graphs_to_json(graphs, 'nasbench-201')
        self.base_path = base_path
-        self.active_atoms = meta_dict['active_atoms']
-        self.max_n_nodes = meta_dict['max_node']
-        self.original_max_n_nodes = meta_dict['max_node']
-        self.n_nodes = torch.Tensor(meta_dict['n_atoms_per_mol_dist'])
-        self.edge_types = torch.Tensor(meta_dict['bond_type_dist'])
+        self.active_nodes = meta_dict['active_nodes']
+        self.max_n_nodes = meta_dict['max_n_nodes']
+        self.original_max_n_nodes = meta_dict['max_n_nodes']
+        self.n_nodes = torch.Tensor(meta_dict['n_nodes_per_graph'])
+        self.edge_types = torch.Tensor(meta_dict['edge_type_list'])
        self.transition_E = torch.Tensor(meta_dict['transition_E'])

-        self.atom_decoder = meta_dict['active_atoms']
-        node_types = torch.Tensor(meta_dict['atom_type_dist'])
+        self.node_decoder = meta_dict['active_nodes']
+        node_types = torch.Tensor(meta_dict['node_type_list'])
        active_index = (node_types > 0).nonzero().squeeze()
-        self.node_types = torch.Tensor(meta_dict['atom_type_dist'])[active_index]
+        self.node_types = torch.Tensor(meta_dict['node_type_list'])[active_index]
        self.nodes_dist = DistributionNodes(self.n_nodes)
        self.active_index = active_index

@@ -923,11 +1185,11 @@ def compute_meta(root, source_name, train_index, test_index):
        'transition_E': tansition_E.tolist(),
        }

-    with open(f'{root}/{source_name}.meta.json', "w") as f:
+    with open(f'/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench201.meta.json', "w") as f:
        json.dump(meta_dict, f)
    
    return meta_dict


 if __name__ == "__main__":
-    pass
+    dataset = Dataset(source='nasbench', root='/home/stud/hanzhang/nasbenchDiT/graph-dit', target_prop='Class', transform=None)
--- a/graph_dit/diffusion_model.py
+++ b/graph_dit/diffusion_model.py
@@ -13,11 +13,11 @@ from metrics.abstract_metrics import SumExceptBatchMetric, SumExceptBatchKL, NLL
 import utils

 class Graph_DiT(pl.LightningModule):
-    # def __init__(self, cfg, dataset_infos, train_metrics, sampling_metrics, visualization_tools):
-    def __init__(self, cfg, dataset_infos, visualization_tools):
+    def __init__(self, cfg, dataset_infos, train_metrics, sampling_metrics, visualization_tools):
+    # def __init__(self, cfg, dataset_infos, visualization_tools):

        super().__init__()
-        # self.save_hyperparameters(ignore=['train_metrics', 'sampling_metrics'])
+        self.save_hyperparameters(ignore=['train_metrics', 'sampling_metrics'])
        self.test_only = cfg.general.test_only
        self.guidance_target = getattr(cfg.dataset, 'guidance_target', None)

@@ -57,8 +57,8 @@ class Graph_DiT(pl.LightningModule):
        self.test_E_logp = SumExceptBatchMetric()
        self.test_y_collection = []

-        # self.train_metrics = train_metrics
-        # self.sampling_metrics = sampling_metrics
+        self.train_metrics = train_metrics
+        self.sampling_metrics = sampling_metrics

        self.visualization_tools = visualization_tools
        self.max_n_nodes = dataset_infos.max_n_nodes
@@ -179,9 +179,9 @@ class Graph_DiT(pl.LightningModule):
    @torch.no_grad()
    def validation_step(self, data, i):
        data_x = F.one_hot(data.x, num_classes=118).float()[:, self.active_index]
-        data_edge_attr = F.one_hot(data.edge_attr, num_classes=5).float()
+        data_edge_attr = F.one_hot(data.edge_attr, num_classes=10).float()
        dense_data, node_mask = utils.to_dense(data_x, data.edge_index, data_edge_attr, data.batch, self.max_n_nodes)
-        dense_data = dense_data.mask(node_mask)
+        dense_data = dense_data.mask(node_mask, collapse=True)
        noisy_data = self.apply_noise(dense_data.X, dense_data.E, data.y, node_mask)
        pred = self.forward(noisy_data)
        nll = self.compute_val_loss(pred, noisy_data, dense_data.X, dense_data.E, data.y, node_mask, test=False)
--- a/graph_dit/main.py
+++ b/graph_dit/main.py
@@ -78,16 +78,20 @@ def main(cfg: DictConfig):

    datamodule = dataset.DataModule(cfg)
    datamodule.prepare_data()
-    dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg)
+    dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
    # train_smiles, reference_smiles = datamodule.get_train_smiles()
+    train_graphs, reference_graphs = datamodule.get_train_graphs()

    # get input output dimensions
    dataset_infos.compute_input_output_dims(datamodule=datamodule)
-    # train_metrics = TrainMolecularMetricsDiscrete(dataset_infos)
+    train_metrics = TrainMolecularMetricsDiscrete(dataset_infos)

    # sampling_metrics = SamplingMolecularMetrics(
    #     dataset_infos, train_smiles, reference_smiles
    # )
+    sampling_metrics = SamplingGraphMetrics(
+        dataset_infos, train_graphs, reference_graphs
+    )
    visualization_tools = MolecularVisualization(dataset_infos)

    model_kwargs = {
@@ -135,5 +139,16 @@ def main(cfg: DictConfig):
    else:
        trainer.test(model, datamodule=datamodule, ckpt_path=cfg.general.test_only)

+@hydra.main(
+    version_base="1.1", config_path="../configs", config_name="config"
+)
+def test(cfg: DictConfig):  # runs only the data-preparation steps; no model is built or trained here
+    datamodule = dataset.DataModule(cfg)
+    datamodule.prepare_data()
+    dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
+    train_graphs, reference_graphs = datamodule.get_train_graphs()  # both results are unused in this function
+
+    dataset_infos.compute_input_output_dims(datamodule=datamodule)
+
 if __name__ == "__main__":
-    main()
+    test()  # NOTE(review): the script entry point now calls test() instead of main(), so main() is no longer launched from here — confirm this is intentional
--- a/graph_dit/metrics/molecular_metrics_sampling.py
+++ b/graph_dit/metrics/molecular_metrics_sampling.py
@@ -23,7 +23,104 @@ def result_to_csv(path, dict_data):
            writer.writeheader()
        writer.writerow(dict_data)

+class SamplingGraphMetrics(nn.Module):
+    """Score sampled graphs, dump them to text files and append the metrics to output.csv."""
+    def __init__(
+            self,
+            dataset_infos,
+            train_graphs,
+            reference_graphs,  # accepted but currently unused by this class
+            n_jobs=1,
+            device="cpu",
+            batch_size=512,
+    ):
+        super().__init__()
+        self.task_name = dataset_infos.task
+        self.dataset_infos = dataset_infos
+        self.active_nodes = dataset_infos.active_nodes
+        self.train_graphs = train_graphs

+        self.stat_ref = None  # no reference statistics are computed here
+
+        self.compute_config = {
+            "n_jobs": n_jobs,
+            "device": device,
+            "batch_size": batch_size,
+        }
+
+        self.task_evaluator = {
+            'meta_taskname': dataset_infos.task,
+            'sas': None,
+            'scs': None
+        }
+
+        for cur_task in dataset_infos.task.split("-")[:]:  # one TaskModel per "-"-separated task name
+            model_path = os.path.join(
+                dataset_infos.base_path, "data/evaluator", f"{cur_task}.joblib"
+            )
+            os.makedirs(os.path.dirname(model_path), exist_ok=True)
+            evaluator = TaskModel(model_path, cur_task)
+            self.task_evaluator[cur_task] = evaluator
+
+    def forward(self, graphs, targets, name, current_epoch, val_counter, test=False):
+        """Evaluate ``graphs``, write the result files and return the evaluated graph list."""
+        if isinstance(targets, list):
+            targets_cat = torch.cat(targets, dim=0)
+            targets_np = targets_cat.detach().cpu().numpy()
+        else:
+            targets_np = targets.detach().cpu().numpy()
+
+        unique_graphs, all_graphs, all_metrics, targets_log = compute_molecular_metrics(
+            graphs,
+            targets_np,
+            self.train_graphs,
+            self.stat_ref,
+            self.dataset_infos,
+            self.task_evaluator,
+            self.compute_config,
+        )
+
+        if test:
+            file_name = "final_graphs.txt"
+            with open(file_name, "w") as fp:
+                all_tasks_name = list(self.task_evaluator.keys())
+                if 'meta_taskname' in all_tasks_name:
+                    all_tasks_name.remove('meta_taskname')
+
+                all_tasks_str = "graph, " + ", ".join([f"input_{task}" for task in all_tasks_name] + [f"output_{task}" for task in all_tasks_name])
+                fp.write(all_tasks_str + "\n")
+                for i, graph in enumerate(all_graphs):
+                    if targets_log is not None:
+                        all_result_str = f"{graph}, " + ", ".join([f"{targets_log['input_'+task][i]}" for task in all_tasks_name] + [f"{targets_log['output_'+task][i]}" for task in all_tasks_name])
+                        fp.write(all_result_str + "\n")
+                    else:
+                        fp.write("%s\n" % graph)
+                print("All graphs saved")
+        else:
+            result_path = os.path.join(os.getcwd(), f"graphs/{name}")
+            os.makedirs(result_path, exist_ok=True)
+            text_path = os.path.join(
+                result_path,
+                f"valid_unique_graphs_e{current_epoch}_b{val_counter}.txt",
+            )
+            with open(text_path, "w") as textfile:
+                for graph in unique_graphs:
+                    textfile.write(graph + "\n")
+
+        all_logs = all_metrics  # the dict row for output.csv; kept separate from the all_graphs list
+        if test:
+            all_logs["log_name"] = "test"
+        else:
+            all_logs["log_name"] = (
+                "epoch" + str(current_epoch) + "_batch" + str(val_counter)
+            )
+
+        result_to_csv("output.csv", all_logs)
+        return all_graphs
+
+    def reset(self):
+        pass
+            
 class SamplingMolecularMetrics(nn.Module):
    def __init__(
        self,
@@ -40,21 +137,21 @@ class SamplingMolecularMetrics(nn.Module):
        self.active_atoms = dataset_infos.active_atoms
        self.train_smiles = train_smiles

-        if reference_smiles is not None:
-            print(
-                f"--- Computing intermediate statistics for training for #{len(reference_smiles)} smiles ---"
-            )
-            start_time = time.time()
-            self.stat_ref = compute_intermediate_statistics(
-                reference_smiles, n_jobs=n_jobs, device=device, batch_size=batch_size
-            )
-            end_time = time.time()
-            elapsed_time = end_time - start_time
-            print(
-                f"--- End computing intermediate statistics: using {elapsed_time:.2f}s ---"
-            )
-        else:
-            self.stat_ref = None
+        # if reference_smiles is not None:
+        #     print(
+        #         f"--- Computing intermediate statistics for training for #{len(reference_smiles)} smiles ---"
+        #     )
+        #     start_time = time.time()
+        #     self.stat_ref = compute_intermediate_statistics(
+        #         reference_smiles, n_jobs=n_jobs, device=device, batch_size=batch_size
+        #     )
+        #     end_time = time.time()
+        #     elapsed_time = end_time - start_time
+        #     print(
+        #         f"--- End computing intermediate statistics: using {elapsed_time:.2f}s ---"
+        #     )
+        # else:
+        self.stat_ref = None
    
        self.comput_config = {
            "n_jobs": n_jobs,
--- a/graph_dit/metrics/molecular_metrics_train.py
+++ b/graph_dit/metrics/molecular_metrics_train.py
@@ -35,7 +35,13 @@ class CEPerClass(Metric):

    def compute(self):
        return self.total_ce / self.total_samples
+class NodeCE(CEPerClass):  # CEPerClass specialisation used for node types; adds no behaviour of its own
+    def __init__(self, i):
+        super().__init__(i)

+class EdgeCE(CEPerClass):  # edge counterpart of NodeCE; NOTE(review): EdgeMetricsCE does not use it
+    def __init__(self, i):
+        super().__init__(i)

 class AtomCE(CEPerClass):
    def __init__(self, i):
@@ -65,6 +71,21 @@ class AromaticCE(CEPerClass):
    def __init__(self, i):
        super().__init__(i)

+class NodeMetricsCE(MetricCollection):  # collection holding one NodeCE-derived metric per entry of active_nodes
+    def __init__(self, active_nodes):
+        metrics_list = []
+
+        for i, node_type in enumerate(active_nodes) :
+            metrics_list.append(type(f'{node_type}_CE', (NodeCE,), {})(i))  # dynamic subclass named "<node_type>_CE", built with index i
+        super().__init__(metrics_list)
+
+class EdgeMetricsCE(MetricCollection):  # NOTE(review): uses four class indices (0-3) while the dataset infos one-hot edge_attr with num_classes=2; confirm
+    def __init__(self):
+        ce_no_bond = NoBondCE(0)
+        ce_SI = SingleCE(1)
+        ce_DO = DoubleCE(2)
+        ce_TR = TripleCE(3)
+        super().__init__([ce_no_bond, ce_SI, ce_DO, ce_TR])

 class AtomMetricsCE(MetricCollection):
    def __init__(self, active_atoms):
@@ -84,7 +105,47 @@ class BondMetricsCE(MetricCollection):
        ce_TR = TripleCE(3)
        super().__init__([ce_no_bond, ce_SI, ce_DO, ce_TR])

-# 
+#
+
+class TrainGraphMetricsDiscrete(nn.Module):  # tracks the node and edge CE metric collections during training
+    def __init__(self, dataset_infos):
+        super().__init__()
+        active_nodes = dataset_infos.active_nodes
+        self.train_node_metrics = NodeMetricsCE(active_nodes=active_nodes)
+        self.train_edge_metrics = EdgeMetricsCE()
+
+    def forward(self, masked_pred_X, masked_pred_E, true_X, true_E, log: bool):  # updates both collections with the batch
+        self.train_node_metrics(masked_pred_X, true_X)
+        self.train_edge_metrics(masked_pred_E, true_E)
+        if log:
+            to_log = {}  # NOTE(review): filled below but never emitted or returned
+            for key, val in self.train_node_metrics.compute().items():
+                to_log['train/' + key] = val.item()
+            for key, val in self.train_edge_metrics.compute().items():
+                to_log['train/' + key] = val.item()
+
+    def reset(self):  # resets both metric collections
+        for metric in [self.train_node_metrics, self.train_edge_metrics]:
+            metric.reset()
+
+    def log_epoch_metrics(self, current_epoch, log=True):  # prints the rounded epoch metrics when log is true
+        epoch_node_metrics = self.train_node_metrics.compute()
+        epoch_edge_metrics = self.train_edge_metrics.compute()
+
+        to_log = {}  # NOTE(review): built but unused; only the rounded dicts below are printed
+        for key, val in epoch_node_metrics.items():
+            to_log['train_epoch/' + key] = val.item()
+        for key, val in epoch_edge_metrics.items():
+            to_log['train_epoch/' + key] = val.item()
+
+        for key, val in epoch_node_metrics.items():
+            epoch_node_metrics[key] = round(val.item(),4)  # values are overwritten in place with 4-decimal floats
+        for key, val in epoch_edge_metrics.items():
+            epoch_edge_metrics[key] = round(val.item(),4)
+
+        if log:
+            print(f"Epoch {current_epoch}: {epoch_node_metrics} -- {epoch_edge_metrics}")
+
 class TrainMolecularMetricsDiscrete(nn.Module):
    def __init__(self, dataset_infos):
        super().__init__()
--- a/graph_dit/metrics/property_metric.py
+++ b/graph_dit/metrics/property_metric.py
@@ -15,6 +15,17 @@ from rdkit.Chem import AllChem
 from rdkit import DataStructs
 from rdkit.Chem import rdMolDescriptors
 rdBase.DisableLog('rdApp.error')
+import json
+
+op_type = {  # integer code per operation name; TaskModel.train uses it to encode each graph's `ops`
+    'nor_conv_1x1': 1,
+    'nor_conv_3x3': 2,
+    'avg_pool_3x3': 3,
+    'skip_connect': 4,
+    'output': 5,
+    'none': 6,
+    'input': 7
+}

 task_to_colname = {
    'hiv_b': 'HIV_active',
@@ -32,8 +43,10 @@ tasktype_name = {
    'O2': 'regression',
    'N2': 'regression',
    'CO2': 'regression',
+    'nasbench201': 'regression',
 }

+
 class TaskModel():
    """Scores based on an ECFP classifier."""
    def __init__(self, model_path, task_name):
@@ -55,8 +68,47 @@ class TaskModel():
            perfermance = self.train()
            dump(self.model, model_path)
            print('Oracle peformance: ', perfermance)
-
    def train(self):  # fits self.model on graph features and returns self.metric_func on those same samples
+        def read_adj_ops_from_json(filename):  # loads (adj_matrix, ops, accuracy) triples from a JSON list
+            with open(filename, 'r') as json_file:
+                data = json.load(json_file)
+
+            adj_ops_pairs = []
+            for item in data:
+                adj_matrix = np.array(item['adj_matrix'])
+                ops = item['ops']
+                acc = item['train'][0]['accuracy']  # label: accuracy of the first entry under 'train'
+                adj_ops_pairs.append((adj_matrix, ops, acc))
+            
+            return adj_ops_pairs
+        def feature_from_adj_and_ops(adj, ops):  # feature vector: flattened adjacency followed by the op codes
+            return np.concatenate([adj.flatten(), ops])
+        filename = '/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json'  # NOTE(review): absolute, user-specific path
+        graphs = read_adj_ops_from_json(filename)
+        adjs = []  # NOTE(review): adjs and opss are filled below but not read afterwards
+        opss = []
+        accs = []
+        features = []
+        for graph in graphs:
+            adj, ops, acc=graph
+            op_code = [op_type[op] for op in ops]  # map operation names to their integer codes
+            adjs.append(adj)
+            opss.append(op_code)
+            accs.append(acc)
+            features.append(feature_from_adj_and_ops(adj, op_code))
+        features = np.array(features)
+        labels = np.array(accs)
+
+        mask = ~np.isnan(labels)  # drop samples whose label is NaN
+        labels = labels[mask]
+        features = features[mask]
+        self.model.fit(features, labels)
+        y_pred = self.model.predict(features)  # NOTE(review): predictions are made on the data the model was fit on
+        perf = self.metric_func(labels, y_pred)
+        print(f'{self.task_name} performance: {perf}')
+        return perf
+
+    def train__(self):
        data_path = os.path.dirname(self.model_path)
        data_path = os.path.join(os.path.dirname(self.model_path), '..', f'raw/{self.task_name}.csv.gz')
        df = pd.read_csv(data_path)
--- a/graph_dit/workingdoc.md
+++ b/graph_dit/workingdoc.md
Author	SHA1	Message	Date
mhz	f5911be781	some onehot issue	2024-06-30 21:09:16 +02:00
mhz	be8bb16f61	update a small problem	2024-06-30 19:41:31 +02:00
mhz	0fc6f6e686	update EdgeMetricsCE class	2024-06-30 17:37:18 +02:00
mhz	d57575586d	make the metrics code back	2024-06-30 16:43:08 +02:00
mhz	7274b3f606	update the taskmodel	2024-06-30 16:39:42 +02:00
mhz	66fe70028e	no need to read the api again and again	2024-06-29 17:16:08 +02:00
mhz	df26eef77c	update the new graph to json function	2024-06-28 16:29:43 +02:00
mhz	222470a43c	rewrite to graph metrics	2024-06-27 20:44:04 +02:00
mhz	a7f7010da7	write graph code for the absctract dataset	2024-06-26 23:42:01 +02:00
mhz	14186fa97f	write test code	2024-06-26 23:41:37 +02:00
mhz	a222c514d9	add get_train_graphs	2024-06-26 22:42:06 +02:00
mhz	062a27b83f	try update the api in DataInfo	2024-06-26 22:10:07 +02:00
mhz	0c7c525680	try update the api in DataInfo	2024-06-26 22:09:46 +02:00