From 66fe70028efe399c2ee2aa1106d40b697513ab99 Mon Sep 17 00:00:00 2001 From: mhz Date: Sat, 29 Jun 2024 17:16:08 +0200 Subject: [PATCH] no need to read the api again and again --- graph_dit/datasets/dataset.py | 250 ++++++++++++++++++++++------------ 1 file changed, 164 insertions(+), 86 deletions(-) diff --git a/graph_dit/datasets/dataset.py b/graph_dit/datasets/dataset.py index 4d017ac..dbd64f2 100644 --- a/graph_dit/datasets/dataset.py +++ b/graph_dit/datasets/dataset.py @@ -79,7 +79,7 @@ class DataModule(AbstractDataModule): source = './NAS-Bench-201-v1_1-096897.pth' dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None) self.dataset = dataset - self.api = dataset.api + # self.api = dataset.api # if len(self.task.split('-')) == 2: # train_index, val_index, test_index, unlabeled_index = self.fixed_split(dataset) @@ -628,12 +628,12 @@ class Dataset(InMemoryDataset): self.target_prop = target_prop source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth' self.source = source - self.api = API(source) # Initialize NAS-Bench-201 API - print('API loaded') + # self.api = API(source) # Initialize NAS-Bench-201 API + # print('API loaded') super().__init__(root, transform, pre_transform, pre_filter) print(self.processed_paths[0]) #/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth.pt - print('Dataset initialized') self.data, self.slices = torch.load(self.processed_paths[0]) + print('Dataset initialized') self.data.edge_attr = self.data.edge_attr.squeeze() self.data.idx = torch.arange(len(self.data.y)) print(f"self.data={self.data}, self.slices={self.slices}") @@ -647,82 +647,146 @@ class Dataset(InMemoryDataset): return [f'{self.source}.pt'] def process(self): - def parse_architecture_string(arch_str): - stages = arch_str.split('+') - nodes = ['input'] - edges = [] - - for stage in stages: - operations = stage.strip('|').split('|') - for op in operations: - operation, idx = op.split('~') - idx = int(idx) - edges.append((idx, len(nodes))) # Add edge from idx to the new node - nodes.append(operation) - nodes.append('output') # Add the output node - return nodes, edges + source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth' + self.api = API(source) - def create_graph(nodes, edges): - G = nx.DiGraph() - for i, node in enumerate(nodes): - G.add_node(i, label=node) - G.add_edges_from(edges) - return G - - def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None): - nodes, edges = parse_architecture_string(arch_str) - - node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary - assert 0 not in node_labels, f'Invalid node label: {node_labels}' - x = torch.LongTensor(node_labels) - print(f'in initialize Dataset, arch_to_Graph x={x}') - - edges_list = [(start, end) for start, end in edges] - edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type - edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous() - edge_type = torch.tensor(edge_type, dtype=torch.long) - edge_attr = edge_type.view(-1, 1) - - if target3 is not None: - y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1) - elif target2 is not None: - y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1) - else: - y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1) - - print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}') - data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y) - return data, nodes - - bonds = { - 'nor_conv_1x1': 1, - 'nor_conv_3x3': 2, - 'avg_pool_3x3': 3, - 'skip_connect': 4, - 'output': 5, - 'none': 6, - 'input': 7 - } - - # Prepare to process NAS-Bench-201 data data_list = [] - len_data = len(self.api) # Number of architectures - with tqdm(total=len_data) as pbar: - for arch_index in range(len_data): - arch_info = self.api.query_meta_info_by_index(arch_index) - arch_str = arch_info.arch_str - sa = np.random.rand() # Placeholder for synthetic accessibility - sc = np.random.rand() # Placeholder for substructure count - target = np.random.rand() # Placeholder for target value - target2 = np.random.rand() # Placeholder for second target value - target3 = np.random.rand() # Placeholder for third target value + len_data = len(self.api) - data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3) + def graph_to_graph_data(graph): + ops = graph[1] + adj = graph[0] + nodes = [] + for op in ops: + nodes.append(op_type[op]) + x = torch.LongTensor(nodes) + + edges_list = [] + edge_type = [] + for start in range(len(ops)): + for end in range(len(ops)): + if adj[start][end] == 1: + edges_list.append((start, end)) + edge_type.append(1) + edges_list.append((end, start)) + edge_type.append(1) + + edge_index = torch.tensor(edges_list, dtype=torch.long).t() + edge_type = torch.tensor(edge_type, dtype=torch.long) + edge_attr = edge_type + y = torch.tensor([0], dtype=torch.float).view(1, -1) + data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, idx=i) + return data + graph_list = [] + + with tqdm(total = len_data) as pbar: + active_nodes = set() + for i in range(len_data): + arch_info = self.api.query_meta_info_by_index(i) + nodes, edges = parse_architecture_string(arch_info.arch_str) + adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges) + for op in ops: + if op not in active_nodes: + active_nodes.add(op) + graph_list.append({ + "adj_matrix": adj_matrix, + "ops": ops, + "idx": i + }) + data = graph_to_graph_data((adj_matrix, ops)) data_list.append(data) pbar.update(1) - + + for graph in graph_list: + adj_matrix = graph['adj_matrix'] + if isinstance(adj_matrix, np.ndarray): + adj_matrix = adj_matrix.tolist() + graph['adj_matrix'] = adj_matrix + ops = graph['ops'] + if isinstance(ops, np.ndarray): + ops = ops.tolist() + graph['ops'] = ops + with open(f'nasbench-201-graph.json', 'w') as f: + json.dump(graph_list, f) + torch.save(self.collate(data_list), self.processed_paths[0]) + # def parse_architecture_string(arch_str): + # stages = arch_str.split('+') + # nodes = ['input'] + # edges = [] + + # for stage in stages: + # operations = stage.strip('|').split('|') + # for op in operations: + # operation, idx = op.split('~') + # idx = int(idx) + # edges.append((idx, len(nodes))) # Add edge from idx to the new node + # nodes.append(operation) + # nodes.append('output') # Add the output node + # return nodes, edges + + # def create_graph(nodes, edges): + # G = nx.DiGraph() + # for i, node in enumerate(nodes): + # G.add_node(i, label=node) + # G.add_edges_from(edges) + # return G + + # def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None): + # nodes, edges = parse_architecture_string(arch_str) + + # node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary + # assert 0 not in node_labels, f'Invalid node label: {node_labels}' + # x = torch.LongTensor(node_labels) + # print(f'in initialize Dataset, arch_to_Graph x={x}') + + # edges_list = [(start, end) for start, end in edges] + # edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type + # edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous() + # edge_type = torch.tensor(edge_type, dtype=torch.long) + # edge_attr = edge_type.view(-1, 1) + + # if target3 is not None: + # y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1) + # elif target2 is not None: + # y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1) + # else: + # y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1) + + # print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}') + # data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y) + # return data, nodes + + # bonds = { + # 'nor_conv_1x1': 1, + # 'nor_conv_3x3': 2, + # 'avg_pool_3x3': 3, + # 'skip_connect': 4, + # 'output': 5, + # 'none': 6, + # 'input': 7 + # } + + # # Prepare to process NAS-Bench-201 data + # data_list = [] + # len_data = len(self.api) # Number of architectures + # with tqdm(total=len_data) as pbar: + # for arch_index in range(len_data): + # arch_info = self.api.query_meta_info_by_index(arch_index) + # arch_str = arch_info.arch_str + # sa = np.random.rand() # Placeholder for synthetic accessibility + # sc = np.random.rand() # Placeholder for substructure count + # target = np.random.rand() # Placeholder for target value + # target2 = np.random.rand() # Placeholder for second target value + # target3 = np.random.rand() # Placeholder for third target value + + # data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3) + # data_list.append(data) + # pbar.update(1) + + # torch.save(self.collate(data_list), self.processed_paths[0]) + class Dataset_origin(InMemoryDataset): def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None): @@ -841,7 +905,7 @@ class DataInfos(AbstractDatasetInfos): self.task = task_name self.task_type = tasktype_dict.get(task_name, "regression") self.ensure_connected = cfg.model.ensure_connected - self.api = dataset.api + # self.api = dataset.api datadir = cfg.dataset.datadir @@ -853,20 +917,34 @@ class DataInfos(AbstractDatasetInfos): ops_type = {} len_ops = set() # api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth') - for i in range(length): - arch_info = self.api.query_meta_info_by_index(i) - nodes, edges = parse_architecture_string(arch_info.arch_str) - adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges) + + + def read_adj_ops_from_json(filename): + with open(filename, 'r') as json_file: + data = json.load(json_file) + + adj_ops_pairs = [] + for item in data: + adj_matrix = np.array(item['adjacency_matrix']) + ops = item['operations'] + adj_ops_pairs.append((adj_matrix, ops)) + + return adj_ops_pairs + # for i in range(length): + # arch_info = self.api.query_meta_info_by_index(i) + # nodes, edges = parse_architecture_string(arch_info.arch_str) + # adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges) # if i < 5: # print("Adjacency Matrix:") # print(adj_matrix) # print("Operations List:") # print(ops) - for op in ops: - if op not in ops_type: - ops_type[op] = len(ops_type) - len_ops.add(len(ops)) - graphs.append((adj_matrix, ops)) + # for op in ops: + # if op not in ops_type: + # ops_type[op] = len(ops_type) + # len_ops.add(len(ops)) + # graphs.append((adj_matrix, ops)) + graphs = read_adj_ops_from_json(f'nasbench-201.meta.json') # check first five graphs for i in range(5): @@ -879,13 +957,13 @@ class DataInfos(AbstractDatasetInfos): self.max_n_nodes = meta_dict['max_n_nodes'] self.original_max_n_nodes = meta_dict['max_n_nodes'] self.n_nodes = torch.Tensor(meta_dict['n_nodes_per_graph']) - self.edge_types = torch.Tensor(meta_dict['edge_type_dist']) + self.edge_types = torch.Tensor(meta_dict['edge_type_list']) self.transition_E = torch.Tensor(meta_dict['transition_E']) self.node_decoder = meta_dict['active_nodes'] - node_types = torch.Tensor(meta_dict['node_type_dist']) + node_types = torch.Tensor(meta_dict['node_type_list']) active_index = (node_types > 0).nonzero().squeeze() - self.node_types = torch.Tensor(meta_dict['node_type_dist'])[active_index] + self.node_types = torch.Tensor(meta_dict['node_type_list'])[active_index] self.nodes_dist = DistributionNodes(self.n_nodes) self.active_index = active_index