22 Commits

Author SHA1 Message Date
mhz
d44900c8ba update print and output json statements 2024-07-03 15:26:12 +02:00
mhz
73324083ce update the gpu id 2024-07-03 15:25:46 +02:00
mhz
0c3cfb195a process function needs 2 dimension for y 2024-07-01 10:06:51 +02:00
mhz
4d1dea1179 comment some output statements and record dimension infos 2024-07-01 10:05:45 +02:00
mhz
7147679c42 EdgeMetricsCE only has 0 and 1 2024-07-01 10:04:51 +02:00
mhz
817ef04c58 comment some output statements 2024-07-01 10:04:07 +02:00
mhz
dd31fda8d5 comment some output statements 2024-07-01 10:03:40 +02:00
mhz
572f030677 working diary 2024-07-01 10:03:05 +02:00
mhz
ba008ae54c update the main function 2024-07-01 10:02:51 +02:00
mhz
f5911be781 some onehot issue 2024-06-30 21:09:16 +02:00
mhz
be8bb16f61 update a small problem 2024-06-30 19:41:31 +02:00
mhz
0fc6f6e686 update EdgeMetricsCE class 2024-06-30 17:37:18 +02:00
mhz
d57575586d make the metrics code back 2024-06-30 16:43:08 +02:00
mhz
7274b3f606 update the taskmodel 2024-06-30 16:39:42 +02:00
mhz
66fe70028e no need to read the api again and again 2024-06-29 17:16:08 +02:00
mhz
df26eef77c update the new graph to json function 2024-06-28 16:29:43 +02:00
mhz
222470a43c rewrite to graph metrics 2024-06-27 20:44:04 +02:00
mhz
a7f7010da7 write graph code for the abstract dataset 2024-06-26 23:42:01 +02:00
mhz
14186fa97f write test code 2024-06-26 23:41:37 +02:00
mhz
a222c514d9 add get_train_graphs 2024-06-26 22:42:06 +02:00
mhz
062a27b83f try update the api in DataInfo 2024-06-26 22:10:07 +02:00
mhz
0c7c525680 try update the api in DataInfo 2024-06-26 22:09:46 +02:00
13 changed files with 1182 additions and 162 deletions

View File

@@ -2,6 +2,7 @@ general:
name: 'graph_dit' name: 'graph_dit'
wandb: 'disabled' wandb: 'disabled'
gpus: 1 gpus: 1
gpu_number: 3
resume: null resume: null
test_only: null test_only: null
sample_every_val: 2500 sample_every_val: 2500
@@ -10,7 +11,7 @@ general:
chains_to_save: 1 chains_to_save: 1
log_every_steps: 50 log_every_steps: 50
number_chain_steps: 8 number_chain_steps: 8
final_model_samples_to_generate: 10000 final_model_samples_to_generate: 100
final_model_samples_to_save: 20 final_model_samples_to_save: 20
final_model_chains_to_save: 1 final_model_chains_to_save: 1
enable_progress_bar: False enable_progress_bar: False
@@ -30,7 +31,7 @@ model:
lambda_train: [1, 10] # node and edge training weight lambda_train: [1, 10] # node and edge training weight
ensure_connected: True ensure_connected: True
train: train:
n_epochs: 10000 n_epochs: 5000
batch_size: 1200 batch_size: 1200
lr: 0.0002 lr: 0.0002
clip_grad: null clip_grad: null
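A note on the `gpu_number` addition: it names a single CUDA device index, which the Lightning `Trainer` later consumes via `devices=[cfg.general.gpu_number]` (see the `main.py` diff below). A minimal sketch of that selection logic, assuming only `general.gpus` and `general.gpu_number` from this config:

```python
import torch

def select_devices(gpus: int, gpu_number: int):
    # Mirrors the Trainer arguments used later in main.py (sketch, not repo code).
    if torch.cuda.is_available() and gpus > 0:
        return "gpu", [gpu_number]   # e.g. gpu_number: 3 -> train on cuda:3
    return "cpu", None

accelerator, devices = select_devices(gpus=1, gpu_number=3)
print(accelerator, devices)
```

The reduced `final_model_samples_to_generate` and `n_epochs` values simply shorten the sampling and training runs.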

View File

@@ -116,7 +116,7 @@ class AbstractDatasetInfos:
def compute_input_output_dims(self, datamodule): def compute_input_output_dims(self, datamodule):
example_batch = datamodule.example_batch() example_batch = datamodule.example_batch()
example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=118).float()[:, self.active_index] example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=118).float()[:, self.active_index]
example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=10).float() example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=2).float()
self.input_dims = {'X': example_batch_x.size(1), self.input_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1), 'E': example_batch_edge_attr.size(1),
@@ -127,4 +127,19 @@ class AbstractDatasetInfos:
print('input dims') print('input dims')
print(self.input_dims) print(self.input_dims)
print('output dims') print('output dims')
print(self.output_dims)
def compute_graph_input_output_dims(self, datamodule):
example_batch = datamodule.example_batch()
example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=8).float()[:, self.active_index]
example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=2).float()
self.input_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1),
'y': example_batch['y'].size(1)}
self.output_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1),
'y': example_batch['y'].size(1)}
print('input dims')
print(self.input_dims)
print('output dims')
print(self.output_dims) print(self.output_dims)
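The new `compute_graph_input_output_dims` one-hot encodes node labels over 8 classes (the seven NAS-Bench-201 operation codes plus index 0) and edge labels over 2 classes (no edge / edge), so `E` is always 2 while `X` is at most 8 before the `active_index` slice. A self-contained sketch of the shape arithmetic, using a hypothetical two-node example batch:

```python
import torch
import torch.nn.functional as F

# Hypothetical example batch: an 'input' node (7) and an 'output' node (5),
# one edge of type 1, and a two-dimensional y vector.
x = torch.tensor([7, 5])                   # node labels in [0, 7]
edge_attr = torch.tensor([1])              # edge labels in {0, 1}
y = torch.zeros(1, 2)

x_onehot = F.one_hot(x, num_classes=8).float()          # (2, 8)
e_onehot = F.one_hot(edge_attr, num_classes=2).float()  # (1, 2)

input_dims = {'X': x_onehot.size(1), 'E': e_onehot.size(1), 'y': y.size(1)}
print(input_dims)   # {'X': 8, 'E': 2, 'y': 2}
```

In the actual method the node one-hot is further indexed by `self.active_index`, so the reported `X` dimension equals the number of active node types.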

View File

@@ -39,6 +39,16 @@ op_to_atom = {
'none': 'S', # Sulfur for no operation 'none': 'S', # Sulfur for no operation
'output': 'He' # Helium for output 'output': 'He' # Helium for output
} }
op_type = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
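The `op_type` table turns each NAS-Bench-201 cell operation into the integer label used as the node feature. A quick illustration with a made-up operation sequence:

```python
op_type = {'nor_conv_1x1': 1, 'nor_conv_3x3': 2, 'avg_pool_3x3': 3,
           'skip_connect': 4, 'output': 5, 'none': 6, 'input': 7}

# Illustrative operation list for one cell.
ops = ['input', 'nor_conv_3x3', 'skip_connect', 'none',
       'avg_pool_3x3', 'nor_conv_1x1', 'output']
print([op_type[op] for op in ops])   # [7, 2, 4, 6, 3, 1, 5]
```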
class DataModule(AbstractDataModule): class DataModule(AbstractDataModule):
def __init__(self, cfg): def __init__(self, cfg):
self.datadir = cfg.dataset.datadir self.datadir = cfg.dataset.datadir
@@ -50,12 +60,12 @@ class DataModule(AbstractDataModule):
def prepare_data(self) -> None: def prepare_data(self) -> None:
target = getattr(self.cfg.dataset, 'guidance_target', None) target = getattr(self.cfg.dataset, 'guidance_target', None)
print("target", target) print("target", target) # nasbench-201
# try: # try:
# base_path = pathlib.Path(os.path.realpath(__file__)).parents[2] # base_path = pathlib.Path(os.path.realpath(__file__)).parents[2]
# except NameError: # except NameError:
# base_path = pathlib.Path(os.getcwd()).parent[2] # base_path = pathlib.Path(os.getcwd()).parent[2]
base_path = '/home/stud/hanzhang/Graph-Dit' base_path = '/home/stud/hanzhang/nasbenchDiT'
root_path = os.path.join(base_path, self.datadir) root_path = os.path.join(base_path, self.datadir)
self.root_path = root_path self.root_path = root_path
@@ -68,13 +78,16 @@ class DataModule(AbstractDataModule):
# Dataset has target property, root path, and transform # Dataset has target property, root path, and transform
source = './NAS-Bench-201-v1_1-096897.pth' source = './NAS-Bench-201-v1_1-096897.pth'
dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None) dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None)
self.dataset = dataset
# self.api = dataset.api
# if len(self.task.split('-')) == 2: # if len(self.task.split('-')) == 2:
# train_index, val_index, test_index, unlabeled_index = self.fixed_split(dataset) # train_index, val_index, test_index, unlabeled_index = self.fixed_split(dataset)
# else: # else:
train_index, val_index, test_index, unlabeled_index = self.random_data_split(dataset) train_index, val_index, test_index, unlabeled_index = self.random_data_split(dataset)
self.train_index, self.val_index, self.test_index, self.unlabeled_index = train_index, val_index, test_index, unlabeled_index self.train_index, self.val_index, self.test_index, self.unlabeled_index = (
train_index, val_index, test_index, unlabeled_index)
train_index, val_index, test_index, unlabeled_index = torch.LongTensor(train_index), torch.LongTensor(val_index), torch.LongTensor(test_index), torch.LongTensor(unlabeled_index) train_index, val_index, test_index, unlabeled_index = torch.LongTensor(train_index), torch.LongTensor(val_index), torch.LongTensor(test_index), torch.LongTensor(unlabeled_index)
if len(unlabeled_index) > 0: if len(unlabeled_index) > 0:
train_index = torch.cat([train_index, unlabeled_index], dim=0) train_index = torch.cat([train_index, unlabeled_index], dim=0)
@@ -175,6 +188,27 @@ class DataModule(AbstractDataModule):
smiles = Chem.MolToSmiles(mol) smiles = Chem.MolToSmiles(mol)
return smiles return smiles
def get_train_graphs(self):
train_graphs = []
test_graphs = []
for graph in self.train_dataset:
train_graphs.append(graph)
for graph in self.test_dataset:
test_graphs.append(graph)
return train_graphs, test_graphs
# def get_train_smiles(self):
# filename = f'{self.task}.csv.gz'
# df = pd.read_csv(f'{self.root_path}/raw/{filename}')
# df_test = df.iloc[self.test_index]
# df = df.iloc[self.train_index]
# smiles_list = df['smiles'].tolist()
# smiles_list_test = df_test['smiles'].tolist()
# smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list]
# smiles_list_test = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list_test]
# return smiles_list, smiles_list_test
def get_train_smiles(self): def get_train_smiles(self):
train_smiles = [] train_smiles = []
test_smiles = [] test_smiles = []
@@ -319,6 +353,121 @@ class DataModule_original(AbstractDataModule):
def test_dataloader(self): def test_dataloader(self):
return self.test_loader return self.test_loader
def new_graphs_to_json(graphs, filename):
source_name = "nasbench-201"
num_graph = len(graphs)
node_name_list = []
node_count_list = []
node_name_list.append('*')
for op_name in op_type:
node_name_list.append(op_name)
node_count_list.append(0)
node_count_list.append(0)
n_nodes_per_graph = [0] * num_graph
edge_count_list = [0, 0]
valencies = [0] * (len(op_type) + 1)
transition_E = np.zeros((len(op_type) + 1, len(op_type) + 1, 2))
n_node_list = []
n_edge_list = []
for graph in graphs:
ops = graph[1]
adj = graph[0]
n_node = len(ops)
n_edge = len(ops)
n_node_list.append(n_node)
n_edge_list.append(n_edge)
n_nodes_per_graph[n_node] += 1
cur_node_count_arr = np.zeros(len(op_type) + 1)
for op in ops:
node = op
# if node == '*':
# node_count_list[-1] += 1
# cur_node_count_arr[-1] += 1
# else:
node_count_list[node] += 1
cur_node_count_arr[node] += 1
try:
valencies[node] += 1
except:
print('int(op_type[node])', int(node))
transition_E_temp = np.zeros((len(op_type) + 1, len(op_type) + 1, 2))
for i in range(n_node):
for j in range(n_node):
if i == j or adj[i][j] == 0:
continue
start_node, end_node = i, j
start_index = ops[start_node]
end_index = ops[end_node]
bond_index = 1
edge_count_list[bond_index] += 2
transition_E[start_index, end_index, bond_index] += 2
transition_E[end_index, start_index, bond_index] += 2
transition_E_temp[start_index, end_index, bond_index] += 2
transition_E_temp[end_index, start_index, bond_index] += 2
edge_count_list[0] += n_node * (n_node - 1) - n_edge * 2
cur_tot_edge = cur_node_count_arr.reshape(-1,1) * cur_node_count_arr.reshape(1,-1) * 2
# print(f"cur_tot_edge={cur_tot_edge}, shape: {cur_tot_edge.shape}")
cur_tot_edge = cur_tot_edge - np.diag(cur_node_count_arr) * 2
transition_E[:, :, 0] += cur_tot_edge - transition_E_temp.sum(axis=-1)
assert (cur_tot_edge > transition_E_temp.sum(axis=-1)).sum() >= 0
n_nodes_per_graph = np.array(n_nodes_per_graph) / np.sum(n_nodes_per_graph)
n_nodes_per_graph = n_nodes_per_graph.tolist()[:51]
node_count_list = np.array(node_count_list) / np.sum(node_count_list)
print('processed meta info: ------', filename, '------')
print('len node_count_list', len(node_count_list))
print('len node_name_list', len(node_name_list))
active_nodes = np.array(node_name_list)[node_count_list > 0]
active_nodes = active_nodes.tolist()
node_count_list = node_count_list.tolist()
edge_count_list = np.array(edge_count_list) / np.sum(edge_count_list)
edge_count_list = edge_count_list.tolist()
valencies = np.array(valencies) / np.sum(valencies)
valencies = valencies.tolist()
no_edge = np.sum(transition_E, axis=-1) == 0
first_elt = transition_E[:, :, 0]
first_elt[no_edge] = 1
transition_E[:, :, 0] = first_elt
transition_E = transition_E / np.sum(transition_E, axis=-1, keepdims=True)
meta_dict = {
'source': source_name,
'num_graph': num_graph,
'n_nodes_per_graph': n_nodes_per_graph,
'max_n_nodes': max(n_node_list),
'max_n_edges': max(n_edge_list),
'node_type_list': node_count_list,
'edge_type_list': edge_count_list,
'valencies': valencies,
'active_nodes': active_nodes,
'num_active_nodes': len(active_nodes),
'transition_E': transition_E.tolist(),
}
with open(f'/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-meta.json', 'w') as f:
json.dump(meta_dict, f)
return meta_dict
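`new_graphs_to_json` accumulates node-type counts, edge counts, and a node-type x node-type x edge-type tensor `transition_E` over all graphs, then normalizes each into a distribution before dumping the meta JSON. Node-type pairs that never co-occur default to the no-edge slot so the normalization stays well defined. A minimal sketch of that final normalization step, assuming a tiny hand-made count tensor:

```python
import numpy as np

# Hypothetical counts: 3 node types, 2 edge types (0 = no edge, 1 = edge).
transition_E = np.zeros((3, 3, 2))
transition_E[0, 1, 1] = transition_E[1, 0, 1] = 4.0   # observed edges
transition_E[0, 1, 0] = transition_E[1, 0, 0] = 2.0   # observed non-edges

# Unseen pairs get probability 1 on "no edge" so every row sums to 1.
no_edge = transition_E.sum(axis=-1) == 0
transition_E[:, :, 0][no_edge] = 1.0
transition_E = transition_E / transition_E.sum(axis=-1, keepdims=True)

assert np.allclose(transition_E.sum(axis=-1), 1.0)
```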
def graphs_to_json(graphs, filename): def graphs_to_json(graphs, filename):
bonds = { bonds = {
'nor_conv_1x1': 1, 'nor_conv_1x1': 1,
@@ -466,7 +615,7 @@ def graphs_to_json(graphs, filename):
'atom_type_dist': atom_count_list, 'atom_type_dist': atom_count_list,
'bond_type_dist': bond_count_list, 'bond_type_dist': bond_count_list,
'valencies': valencies, 'valencies': valencies,
'active_atoms': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0], 'active_nodes': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0],
'num_atom_type': len([atom_name_list[i] for i in range(118) if atom_count_list[i] > 0]), 'num_atom_type': len([atom_name_list[i] for i in range(118) if atom_count_list[i] > 0]),
'transition_E': transition_E.tolist(), 'transition_E': transition_E.tolist(),
} }
@@ -477,14 +626,17 @@ def graphs_to_json(graphs, filename):
class Dataset(InMemoryDataset): class Dataset(InMemoryDataset):
def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None): def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None):
self.target_prop = target_prop self.target_prop = target_prop
source = '/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth' source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
self.source = source self.source = source
self.api = API(source) # Initialize NAS-Bench-201 API # self.api = API(source) # Initialize NAS-Bench-201 API
print('API loaded') # print('API loaded')
super().__init__(root, transform, pre_transform, pre_filter) super().__init__(root, transform, pre_transform, pre_filter)
print('Dataset initialized') print(self.processed_paths[0]) #/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth.pt
print(self.processed_paths[0])
self.data, self.slices = torch.load(self.processed_paths[0]) self.data, self.slices = torch.load(self.processed_paths[0])
print('Dataset initialized')
self.data.edge_attr = self.data.edge_attr.squeeze()
self.data.idx = torch.arange(len(self.data.y))
print(f"self.data={self.data}, self.slices={self.slices}")
@property @property
def raw_file_names(self): def raw_file_names(self):
@@ -495,82 +647,172 @@ class Dataset(InMemoryDataset):
return [f'{self.source}.pt'] return [f'{self.source}.pt']
def process(self): def process(self):
def parse_architecture_string(arch_str): source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
stages = arch_str.split('+') self.api = API(source)
nodes = ['input']
edges = []
for stage in stages:
operations = stage.strip('|').split('|')
for op in operations:
operation, idx = op.split('~')
idx = int(idx)
edges.append((idx, len(nodes))) # Add edge from idx to the new node
nodes.append(operation)
nodes.append('output') # Add the output node
return nodes, edges
def create_graph(nodes, edges):
G = nx.DiGraph()
for i, node in enumerate(nodes):
G.add_node(i, label=node)
G.add_edges_from(edges)
return G
def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
nodes, edges = parse_architecture_string(arch_str)
node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary
assert 0 not in node_labels, f'Invalid node label: {node_labels}'
x = torch.LongTensor(node_labels)
print(f'in initialize Dataset, arch_to_Graph x={x}')
edges_list = [(start, end) for start, end in edges]
edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type
edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
edge_type = torch.tensor(edge_type, dtype=torch.long)
edge_attr = edge_type.view(-1, 1)
if target3 is not None:
y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
elif target2 is not None:
y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
else:
y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
return data, nodes
bonds = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
# Prepare to process NAS-Bench-201 data
data_list = [] data_list = []
len_data = len(self.api) # Number of architectures len_data = len(self.api)
with tqdm(total=len_data) as pbar:
for arch_index in range(len_data):
arch_info = self.api.query_meta_info_by_index(arch_index)
arch_str = arch_info.arch_str
sa = np.random.rand() # Placeholder for synthetic accessibility
sc = np.random.rand() # Placeholder for substructure count
target = np.random.rand() # Placeholder for target value
target2 = np.random.rand() # Placeholder for second target value
target3 = np.random.rand() # Placeholder for third target value
data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3) def graph_to_graph_data(graph):
ops = graph[1]
adj = graph[0]
nodes = []
for op in ops:
nodes.append(op_type[op])
x = torch.LongTensor(nodes)
edges_list = []
edge_type = []
for start in range(len(ops)):
for end in range(len(ops)):
if adj[start][end] == 1:
edges_list.append((start, end))
edge_type.append(1)
edges_list.append((end, start))
edge_type.append(1)
edge_index = torch.tensor(edges_list, dtype=torch.long).t()
edge_type = torch.tensor(edge_type, dtype=torch.long)
edge_attr = edge_type
y = torch.tensor([0, 0], dtype=torch.float).view(1, -1)
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, idx=i)
return data
graph_list = []
with tqdm(total = len_data) as pbar:
active_nodes = set()
for i in range(len_data):
arch_info = self.api.query_meta_info_by_index(i)
results = self.api.query_by_index(i, 'cifar100')
nodes, edges = parse_architecture_string(arch_info.arch_str)
adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
for op in ops:
if op not in active_nodes:
active_nodes.add(op)
graph_list.append({
"adj_matrix": adj_matrix,
"ops": ops,
"idx": i,
"train": [{
"iepoch": result.get_train()['iepoch'],
"loss": result.get_train()['loss'],
"accuracy": result.get_train()['accuracy'],
"cur_time": result.get_train()['cur_time'],
"all_time": result.get_train()['all_time'],
"seed": seed,
}for seed, result in results.items()],
"valid": [{
"iepoch": result.get_eval('x-valid')['iepoch'],
"loss": result.get_eval('x-valid')['loss'],
"accuracy": result.get_eval('x-valid')['accuracy'],
"cur_time": result.get_eval('x-valid')['cur_time'],
"all_time": result.get_eval('x-valid')['all_time'],
"seed": seed,
}for seed, result in results.items()],
"test": [{
"iepoch": result.get_eval('x-test')['iepoch'],
"loss": result.get_eval('x-test')['loss'],
"accuracy": result.get_eval('x-test')['accuracy'],
"cur_time": result.get_eval('x-test')['cur_time'],
"all_time": result.get_eval('x-test')['all_time'],
"seed": seed,
}for seed, result in results.items()]
})
data = graph_to_graph_data((adj_matrix, ops))
data_list.append(data) data_list.append(data)
pbar.update(1) pbar.update(1)
for graph in graph_list:
adj_matrix = graph['adj_matrix']
if isinstance(adj_matrix, np.ndarray):
adj_matrix = adj_matrix.tolist()
graph['adj_matrix'] = adj_matrix
ops = graph['ops']
if isinstance(ops, np.ndarray):
ops = ops.tolist()
graph['ops'] = ops
with open(f'nasbench-201-graph.json', 'w') as f:
json.dump(graph_list, f)
torch.save(self.collate(data_list), self.processed_paths[0]) torch.save(self.collate(data_list), self.processed_paths[0])
# def parse_architecture_string(arch_str):
# stages = arch_str.split('+')
# nodes = ['input']
# edges = []
# for stage in stages:
# operations = stage.strip('|').split('|')
# for op in operations:
# operation, idx = op.split('~')
# idx = int(idx)
# edges.append((idx, len(nodes))) # Add edge from idx to the new node
# nodes.append(operation)
# nodes.append('output') # Add the output node
# return nodes, edges
# def create_graph(nodes, edges):
# G = nx.DiGraph()
# for i, node in enumerate(nodes):
# G.add_node(i, label=node)
# G.add_edges_from(edges)
# return G
# def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
# nodes, edges = parse_architecture_string(arch_str)
# node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary
# assert 0 not in node_labels, f'Invalid node label: {node_labels}'
# x = torch.LongTensor(node_labels)
# print(f'in initialize Dataset, arch_to_Graph x={x}')
# edges_list = [(start, end) for start, end in edges]
# edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type
# edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
# edge_type = torch.tensor(edge_type, dtype=torch.long)
# edge_attr = edge_type.view(-1, 1)
# if target3 is not None:
# y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
# elif target2 is not None:
# y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
# else:
# y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
# print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
# data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
# return data, nodes
# bonds = {
# 'nor_conv_1x1': 1,
# 'nor_conv_3x3': 2,
# 'avg_pool_3x3': 3,
# 'skip_connect': 4,
# 'output': 5,
# 'none': 6,
# 'input': 7
# }
# # Prepare to process NAS-Bench-201 data
# data_list = []
# len_data = len(self.api) # Number of architectures
# with tqdm(total=len_data) as pbar:
# for arch_index in range(len_data):
# arch_info = self.api.query_meta_info_by_index(arch_index)
# arch_str = arch_info.arch_str
# sa = np.random.rand() # Placeholder for synthetic accessibility
# sc = np.random.rand() # Placeholder for substructure count
# target = np.random.rand() # Placeholder for target value
# target2 = np.random.rand() # Placeholder for second target value
# target3 = np.random.rand() # Placeholder for third target value
# data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3)
# data_list.append(data)
# pbar.update(1)
# torch.save(self.collate(data_list), self.processed_paths[0])
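For reference, the new `process` pipeline queries the NAS-Bench-201 API once, dumps every architecture with its train/valid/test results to `nasbench-201-graph.json`, and converts each `(adj_matrix, ops)` pair into a PyG `Data` object via `graph_to_graph_data`, which stores every directed edge in both directions with edge type 1. A self-contained sketch of that conversion on a hypothetical two-node cell (requires `torch` and `torch_geometric`):

```python
import torch
from torch_geometric.data import Data

op_type = {'input': 7, 'output': 5}      # subset of the full table, for illustration

adj = [[0, 1],
       [0, 0]]                            # one directed edge: input -> output
ops = ['input', 'output']

x = torch.LongTensor([op_type[op] for op in ops])          # tensor([7, 5])
edges, types = [], []
for s in range(len(ops)):
    for e in range(len(ops)):
        if adj[s][e] == 1:
            edges += [(s, e), (e, s)]                       # add both directions
            types += [1, 1]
edge_index = torch.tensor(edges, dtype=torch.long).t()      # shape (2, 2)
data = Data(x=x, edge_index=edge_index,
            edge_attr=torch.tensor(types, dtype=torch.long),
            y=torch.zeros(1, 2))
print(data)   # Data(x=[2], edge_index=[2, 2], edge_attr=[2], y=[1, 2])
```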
class Dataset_origin(InMemoryDataset): class Dataset_origin(InMemoryDataset):
def __init__(self, source, root, target_prop=None, def __init__(self, source, root, target_prop=None,
transform=None, pre_transform=None, pre_filter=None): transform=None, pre_transform=None, pre_filter=None):
@@ -656,7 +898,7 @@ class Dataset_origin(InMemoryDataset):
torch.save(self.collate(data_list), self.processed_paths[0]) torch.save(self.collate(data_list), self.processed_paths[0])
def parse_architecture_string(arch_str): def parse_architecture_string(arch_str):
print(arch_str) # print(arch_str)
steps = arch_str.split('+') steps = arch_str.split('+')
nodes = ['input'] # Start with input node nodes = ['input'] # Start with input node
edges = [] edges = []
@@ -676,7 +918,7 @@ def create_adj_matrix_and_ops(nodes, edges):
adj_matrix[src][dst] = 1 adj_matrix[src][dst] = 1
return adj_matrix, nodes return adj_matrix, nodes
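`parse_architecture_string` and `create_adj_matrix_and_ops` turn a NAS-Bench-201 architecture string into an operation list plus adjacency matrix; the full parser also appears in the commented-out block earlier in this file. A condensed, self-contained sketch of the same parsing, using an illustrative `|op~idx|` string:

```python
import numpy as np

def parse_architecture_string(arch_str):
    """Split '|op~idx|...+|...' into node labels and (source, target) edges."""
    nodes, edges = ['input'], []
    for stage in arch_str.split('+'):
        for op in stage.strip('|').split('|'):
            operation, idx = op.split('~')
            edges.append((int(idx), len(nodes)))   # edge from node idx to the new node
            nodes.append(operation)
    nodes.append('output')
    return nodes, edges

def create_adj_matrix_and_ops(nodes, edges):
    adj = np.zeros((len(nodes), len(nodes)), dtype=int)
    for src, dst in edges:
        adj[src][dst] = 1
    return adj, nodes

arch = '|nor_conv_3x3~0|+|skip_connect~0|nor_conv_1x1~1|+|none~0|avg_pool_3x3~1|skip_connect~2|'
adj, ops = create_adj_matrix_and_ops(*parse_architecture_string(arch))
print(ops)         # ['input', 'nor_conv_3x3', ..., 'output']  (8 nodes)
print(adj.shape)   # (8, 8)
```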
class DataInfos(AbstractDatasetInfos): class DataInfos(AbstractDatasetInfos):
def __init__(self, datamodule, cfg): def __init__(self, datamodule, cfg, dataset):
tasktype_dict = { tasktype_dict = {
'hiv_b': 'classification', 'hiv_b': 'classification',
'bace_b': 'classification', 'bace_b': 'classification',
@@ -689,6 +931,7 @@ class DataInfos(AbstractDatasetInfos):
self.task = task_name self.task = task_name
self.task_type = tasktype_dict.get(task_name, "regression") self.task_type = tasktype_dict.get(task_name, "regression")
self.ensure_connected = cfg.model.ensure_connected self.ensure_connected = cfg.model.ensure_connected
# self.api = dataset.api
datadir = cfg.dataset.datadir datadir = cfg.dataset.datadir
@@ -699,35 +942,55 @@ class DataInfos(AbstractDatasetInfos):
length = 15625 length = 15625
ops_type = {} ops_type = {}
len_ops = set() len_ops = set()
api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth') # api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth')
for i in range(length):
arch_info = api.query_meta_info_by_index(i)
nodes, edges = parse_architecture_string(arch_info.arch_str)
adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
if i < 5:
print("Adjacency Matrix:")
print(adj_matrix)
print("Operations List:")
print(ops)
for op in ops:
if op not in ops_type:
ops_type[op] = len(ops_type)
len_ops.add(len(ops))
graphs.append((adj_matrix, ops))
meta_dict = graphs_to_json(graphs, 'nasbench-201')
def read_adj_ops_from_json(filename):
with open(filename, 'r') as json_file:
data = json.load(json_file)
adj_ops_pairs = []
for item in data:
adj_matrix = np.array(item['adj_matrix'])
ops = item['ops']
ops = [op_type[op] for op in ops]
adj_ops_pairs.append((adj_matrix, ops))
return adj_ops_pairs
# for i in range(length):
# arch_info = self.api.query_meta_info_by_index(i)
# nodes, edges = parse_architecture_string(arch_info.arch_str)
# adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
# if i < 5:
# print("Adjacency Matrix:")
# print(adj_matrix)
# print("Operations List:")
# print(ops)
# for op in ops:
# if op not in ops_type:
# ops_type[op] = len(ops_type)
# len_ops.add(len(ops))
# graphs.append((adj_matrix, ops))
graphs = read_adj_ops_from_json(f'/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json')
# check first five graphs
for i in range(5):
print(f'graph {i} : {graphs[i]}')
print(f'ops_type: {ops_type}')
meta_dict = new_graphs_to_json(graphs, 'nasbench-201')
self.base_path = base_path self.base_path = base_path
self.active_atoms = meta_dict['active_atoms'] self.active_nodes = meta_dict['active_nodes']
self.max_n_nodes = meta_dict['max_node'] self.max_n_nodes = meta_dict['max_n_nodes']
self.original_max_n_nodes = meta_dict['max_node'] self.original_max_n_nodes = meta_dict['max_n_nodes']
self.n_nodes = torch.Tensor(meta_dict['n_atoms_per_mol_dist']) self.n_nodes = torch.Tensor(meta_dict['n_nodes_per_graph'])
self.edge_types = torch.Tensor(meta_dict['bond_type_dist']) self.edge_types = torch.Tensor(meta_dict['edge_type_list'])
self.transition_E = torch.Tensor(meta_dict['transition_E']) self.transition_E = torch.Tensor(meta_dict['transition_E'])
self.atom_decoder = meta_dict['active_atoms'] self.node_decoder = meta_dict['active_nodes']
node_types = torch.Tensor(meta_dict['atom_type_dist']) node_types = torch.Tensor(meta_dict['node_type_list'])
active_index = (node_types > 0).nonzero().squeeze() active_index = (node_types > 0).nonzero().squeeze()
self.node_types = torch.Tensor(meta_dict['atom_type_dist'])[active_index] self.node_types = torch.Tensor(meta_dict['node_type_list'])[active_index]
self.nodes_dist = DistributionNodes(self.n_nodes) self.nodes_dist = DistributionNodes(self.n_nodes)
self.active_index = active_index self.active_index = active_index
@@ -922,11 +1185,11 @@ def compute_meta(root, source_name, train_index, test_index):
'transition_E': tansition_E.tolist(), 'transition_E': tansition_E.tolist(),
} }
with open(f'{root}/{source_name}.meta.json', "w") as f: with open(f'/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench201.meta.json', "w") as f:
json.dump(meta_dict, f) json.dump(meta_dict, f)
return meta_dict return meta_dict
if __name__ == "__main__": if __name__ == "__main__":
pass dataset = Dataset(source='nasbench', root='/home/stud/hanzhang/nasbenchDiT/graph-dit', target_prop='Class', transform=None)

View File

@@ -65,10 +65,11 @@ def reverse_tensor(x):
def sample_discrete_features(probX, probE, node_mask, step=None, add_nose=True): def sample_discrete_features(probX, probE, node_mask, step=None, add_nose=True):
''' Sample features from multinomial distribution with given probabilities (probX, probE, proby) ''' Sample features from multinomial distribution with given probabilities (probX, probE, proby)
:param probX: bs, n, dx_out node features :param probX: bs, n, dx_out node features 1200 8 7
:param probE: bs, n, n, de_out edge features :param probE: bs, n, n, de_out edge features 1200 8 8 2
:param proby: bs, dy_out global features. :param proby: bs, dy_out global features. 1200 8
''' '''
# print(f"sample_discrete_features in: probX: {probX.shape}, probE: {probE.shape}, node_mask: {node_mask.shape}")
bs, n, _ = probX.shape bs, n, _ = probX.shape
# Noise X # Noise X
@@ -97,8 +98,11 @@ def sample_discrete_features(probX, probE, node_mask, step=None, add_nose=True):
# Sample E # Sample E
E_t = probE.multinomial(1).reshape(bs, n, n) # (bs, n, n) E_t = probE.multinomial(1).reshape(bs, n, n) # (bs, n, n)
# print(f"sample_discrete_features out: X_t: {X_t.shape}, E_t: {E_t.shape}")
E_t = torch.triu(E_t, diagonal=1) E_t = torch.triu(E_t, diagonal=1)
# print(f"sample_discrete_features out: X_t: {X_t.shape}, E_t: {E_t.shape}")
E_t = (E_t + torch.transpose(E_t, 1, 2)) E_t = (E_t + torch.transpose(E_t, 1, 2))
# print(f"sample_discrete_features out: X_t: {X_t.shape}, E_t: {E_t.shape}")
return PlaceHolder(X=X_t, E=E_t, y=torch.zeros(bs, 0).type_as(X_t)) return PlaceHolder(X=X_t, E=E_t, y=torch.zeros(bs, 0).type_as(X_t))
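The shape comments added here record that `probX` is `(bs, n, dx)` and `probE` is `(bs, n, n, de)`; after sampling, the edge matrix is symmetrized by keeping only the strict upper triangle and adding its transpose, so the result has a zero diagonal and `E_t[i, j] == E_t[j, i]`. A short sketch of that symmetrization with hypothetical shapes:

```python
import torch

bs, n, de = 2, 4, 2
probE = torch.rand(bs, n, n, de)
probE = probE / probE.sum(dim=-1, keepdim=True)        # per-pair edge-type distribution

E_t = probE.reshape(bs * n * n, de).multinomial(1).reshape(bs, n, n)
E_t = torch.triu(E_t, diagonal=1)                      # keep the strict upper triangle
E_t = E_t + E_t.transpose(1, 2)                        # mirror into the lower triangle

assert torch.equal(E_t, E_t.transpose(1, 2))           # symmetric with zero diagonal
```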

View File

@@ -103,16 +103,25 @@ class MarginalTransition:
self.e_marginals = e_marginals # Dx, De self.e_marginals = e_marginals # Dx, De
self.xe_conditions = xe_conditions self.xe_conditions = xe_conditions
self.u_x = x_marginals.unsqueeze(0).expand(self.X_classes, -1).unsqueeze(0) # 1, Dx, Dx self.u_x = x_marginals.unsqueeze(0).expand(self.X_classes, -1).unsqueeze(0) # 1, Dx, Dx 1 7 7
self.u_e = e_marginals.unsqueeze(0).expand(self.E_classes, -1).unsqueeze(0) # 1, De, De self.u_e = e_marginals.unsqueeze(0).expand(self.E_classes, -1).unsqueeze(0) # 1, De, De 1 2 2
self.u_xe = xe_conditions.unsqueeze(0) # 1, Dx, De self.u_xe = xe_conditions.unsqueeze(0) # 1, Dx, De 1 7 2
self.u_ex = ex_conditions.unsqueeze(0) # 1, De, Dx self.u_ex = ex_conditions.unsqueeze(0) # 1, De, Dx 1 2 7
self.u = self.get_union_transition(self.u_x, self.u_e, self.u_xe, self.u_ex, n_nodes) # 1, Dx + n*De, Dx + n*De self.u = self.get_union_transition(self.u_x, self.u_e, self.u_xe, self.u_ex, n_nodes) # 1, Dx + n*De, Dx + n*De
# print(f"Shape of u_x: {self.u_x.shape}")
# print(f"Shape of u_e: {self.u_e.shape}")
# print(f"Shape of u_xe: {self.u_xe.shape}")
# print(f"Shape of u_ex: {self.u_ex.shape}")
# print(f"Shape of u: {self.u.shape}")
def get_union_transition(self, u_x, u_e, u_xe, u_ex, n_nodes): def get_union_transition(self, u_x, u_e, u_xe, u_ex, n_nodes):
# print(f"before processing Shape of u_e: {u_e.shape}")
# print(f"before processing Shape of u_ex: {u_ex.shape}")
u_e = u_e.repeat(1, n_nodes, n_nodes) # (1, n*de, n*de) u_e = u_e.repeat(1, n_nodes, n_nodes) # (1, n*de, n*de)
u_xe = u_xe.repeat(1, 1, n_nodes) # (1, dx, n*de) u_xe = u_xe.repeat(1, 1, n_nodes) # (1, dx, n*de)
u_ex = u_ex.repeat(1, n_nodes, 1) # (1, n*de, dx) u_ex = u_ex.repeat(1, n_nodes, 1) # (1, n*de, dx)
# print(f"After processing Shape of u_ex: {u_ex.shape}")
# print(f"After processing Shape of u_e: {u_e.shape}")
u0 = torch.cat([u_x, u_xe], dim=2) # (1, dx, dx + n*de) u0 = torch.cat([u_x, u_xe], dim=2) # (1, dx, dx + n*de)
u1 = torch.cat([u_ex, u_e], dim=2) # (1, n*de, dx + n*de) u1 = torch.cat([u_ex, u_e], dim=2) # (1, n*de, dx + n*de)
u = torch.cat([u0, u1], dim=1) # (1, dx + n*de, dx + n*de) u = torch.cat([u0, u1], dim=1) # (1, dx + n*de, dx + n*de)
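The shape annotations above say `get_union_transition` tiles the node block `(dx, dx)`, the edge block repeated over `n` nodes `(n*de, n*de)`, and the two cross blocks into a single `(dx + n*de, dx + n*de)` matrix; with `dx = 7`, `de = 2`, `n = 8` that is 7 + 16 = 23. A quick check of the tiling arithmetic under those assumed dimensions:

```python
import torch

dx, de, n = 7, 2, 8
u_x, u_e = torch.rand(1, dx, dx), torch.rand(1, de, de)
u_xe, u_ex = torch.rand(1, dx, de), torch.rand(1, de, dx)

u_e  = u_e.repeat(1, n, n)      # (1, n*de, n*de)
u_xe = u_xe.repeat(1, 1, n)     # (1, dx,   n*de)
u_ex = u_ex.repeat(1, n, 1)     # (1, n*de, dx)

u = torch.cat([torch.cat([u_x, u_xe], dim=2),
               torch.cat([u_ex, u_e], dim=2)], dim=1)
print(u.shape)                  # torch.Size([1, 23, 23]) == (1, dx + n*de, dx + n*de)
```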

View File

@@ -13,11 +13,11 @@ from metrics.abstract_metrics import SumExceptBatchMetric, SumExceptBatchKL, NLL
import utils import utils
class Graph_DiT(pl.LightningModule): class Graph_DiT(pl.LightningModule):
# def __init__(self, cfg, dataset_infos, train_metrics, sampling_metrics, visualization_tools): def __init__(self, cfg, dataset_infos, train_metrics, sampling_metrics, visualization_tools):
def __init__(self, cfg, dataset_infos, visualization_tools): # def __init__(self, cfg, dataset_infos, visualization_tools):
super().__init__() super().__init__()
# self.save_hyperparameters(ignore=['train_metrics', 'sampling_metrics']) self.save_hyperparameters(ignore=['train_metrics', 'sampling_metrics'])
self.test_only = cfg.general.test_only self.test_only = cfg.general.test_only
self.guidance_target = getattr(cfg.dataset, 'guidance_target', None) self.guidance_target = getattr(cfg.dataset, 'guidance_target', None)
@@ -57,8 +57,8 @@ class Graph_DiT(pl.LightningModule):
self.test_E_logp = SumExceptBatchMetric() self.test_E_logp = SumExceptBatchMetric()
self.test_y_collection = [] self.test_y_collection = []
# self.train_metrics = train_metrics self.train_metrics = train_metrics
# self.sampling_metrics = sampling_metrics self.sampling_metrics = sampling_metrics
self.visualization_tools = visualization_tools self.visualization_tools = visualization_tools
self.max_n_nodes = dataset_infos.max_n_nodes self.max_n_nodes = dataset_infos.max_n_nodes
@@ -181,7 +181,7 @@ class Graph_DiT(pl.LightningModule):
data_x = F.one_hot(data.x, num_classes=118).float()[:, self.active_index] data_x = F.one_hot(data.x, num_classes=118).float()[:, self.active_index]
data_edge_attr = F.one_hot(data.edge_attr, num_classes=10).float() data_edge_attr = F.one_hot(data.edge_attr, num_classes=10).float()
dense_data, node_mask = utils.to_dense(data_x, data.edge_index, data_edge_attr, data.batch, self.max_n_nodes) dense_data, node_mask = utils.to_dense(data_x, data.edge_index, data_edge_attr, data.batch, self.max_n_nodes)
dense_data = dense_data.mask(node_mask, collapse=False) dense_data = dense_data.mask(node_mask, collapse=True)
noisy_data = self.apply_noise(dense_data.X, dense_data.E, data.y, node_mask) noisy_data = self.apply_noise(dense_data.X, dense_data.E, data.y, node_mask)
pred = self.forward(noisy_data) pred = self.forward(noisy_data)
nll = self.compute_val_loss(pred, noisy_data, dense_data.X, dense_data.E, data.y, node_mask, test=False) nll = self.compute_val_loss(pred, noisy_data, dense_data.X, dense_data.E, data.y, node_mask, test=False)
@@ -444,11 +444,9 @@ class Graph_DiT(pl.LightningModule):
beta_t = self.noise_schedule(t_normalized=t_float) # (bs, 1) beta_t = self.noise_schedule(t_normalized=t_float) # (bs, 1)
alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s_float) # (bs, 1) alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s_float) # (bs, 1)
alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t_float) # (bs, 1) alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t_float) # (bs, 1)
print(f"alpha_t_bar.shape {alpha_t_bar.shape}")
Qtb = self.transition_model.get_Qt_bar(alpha_t_bar, self.device) # (bs, dx_in, dx_out), (bs, de_in, de_out) Qtb = self.transition_model.get_Qt_bar(alpha_t_bar, self.device) # (bs, dx_in, dx_out), (bs, de_in, de_out)
print(f"E.shape {E.shape}")
print(f"X.shape {X.shape}")
bs, n, d = X.shape bs, n, d = X.shape
X_all = torch.cat([X, E.reshape(bs, n, -1)], dim=-1) X_all = torch.cat([X, E.reshape(bs, n, -1)], dim=-1)
prob_all = X_all @ Qtb.X prob_all = X_all @ Qtb.X

View File

@@ -11,9 +11,13 @@ import utils
from datasets import dataset from datasets import dataset
from diffusion_model import Graph_DiT from diffusion_model import Graph_DiT
from metrics.molecular_metrics_train import TrainMolecularMetricsDiscrete from metrics.molecular_metrics_train import TrainMolecularMetricsDiscrete
from metrics.molecular_metrics_train import TrainGraphMetricsDiscrete
from metrics.molecular_metrics_sampling import SamplingMolecularMetrics from metrics.molecular_metrics_sampling import SamplingMolecularMetrics
from metrics.molecular_metrics_sampling import SamplingGraphMetrics
from analysis.visualization import MolecularVisualization from analysis.visualization import MolecularVisualization
from analysis.visualization import GraphVisualization
warnings.filterwarnings("ignore", category=UserWarning) warnings.filterwarnings("ignore", category=UserWarning)
torch.set_float32_matmul_precision("medium") torch.set_float32_matmul_precision("medium")
@@ -78,15 +82,20 @@ def main(cfg: DictConfig):
datamodule = dataset.DataModule(cfg) datamodule = dataset.DataModule(cfg)
datamodule.prepare_data() datamodule.prepare_data()
dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg) dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
# train_smiles, reference_smiles = datamodule.get_train_smiles() train_smiles, reference_smiles = datamodule.get_train_smiles()
# train_graphs, reference_graphs = datamodule.get_train_graphs()
# get input output dimensions # get input output dimensions
dataset_infos.compute_input_output_dims(datamodule=datamodule) dataset_infos.compute_input_output_dims(datamodule=datamodule)
# train_metrics = TrainMolecularMetricsDiscrete(dataset_infos) train_metrics = TrainMolecularMetricsDiscrete(dataset_infos)
# train_metrics = TrainGraphMetricsDiscrete(dataset_infos)
# sampling_metrics = SamplingMolecularMetrics( sampling_metrics = SamplingMolecularMetrics(
# dataset_infos, train_smiles, reference_smiles dataset_infos, train_smiles, reference_smiles
)
# sampling_metrics = SamplingGraphMetrics(
# dataset_infos, train_graphs, reference_graphs
# ) # )
visualization_tools = MolecularVisualization(dataset_infos) visualization_tools = MolecularVisualization(dataset_infos)
@@ -135,5 +144,65 @@ def main(cfg: DictConfig):
else: else:
trainer.test(model, datamodule=datamodule, ckpt_path=cfg.general.test_only) trainer.test(model, datamodule=datamodule, ckpt_path=cfg.general.test_only)
@hydra.main(
version_base="1.1", config_path="../configs", config_name="config"
)
def test(cfg: DictConfig):
datamodule = dataset.DataModule(cfg)
datamodule.prepare_data()
dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
train_graphs, reference_graphs = datamodule.get_train_graphs()
dataset_infos.compute_input_output_dims(datamodule=datamodule)
train_metrics = TrainGraphMetricsDiscrete(dataset_infos)
sampling_metrics = SamplingGraphMetrics(
dataset_infos, train_graphs, reference_graphs
)
visulization_tools = GraphVisualization(dataset_infos)
model_kwargs = {
"dataset_infos": dataset_infos,
"train_metrics": train_metrics,
"sampling_metrics": sampling_metrics,
"visualization_tools": visulization_tools,
}
if cfg.general.test_only:
cfg, _ = get_resume(cfg, model_kwargs)
os.chdir(cfg.general.test_only.split("checkpoints")[0])
elif cfg.general.resume is not None:
cfg, _ = get_resume_adaptive(cfg, model_kwargs)
os.chdir(cfg.general.resume.split("checkpoints")[0])
# os.environ["CUDA_VISIBLE_DEVICES"] = cfg.general.gpu_number
model = Graph_DiT(cfg=cfg, **model_kwargs)
trainer = Trainer(
gradient_clip_val=cfg.train.clip_grad,
# accelerator="cpu",
accelerator="gpu"
if torch.cuda.is_available() and cfg.general.gpus > 0
else "cpu",
devices=[cfg.general.gpu_number]
if torch.cuda.is_available() and cfg.general.gpus > 0
else None,
max_epochs=cfg.train.n_epochs,
enable_checkpointing=False,
check_val_every_n_epoch=cfg.train.check_val_every_n_epoch,
val_check_interval=cfg.train.val_check_interval,
strategy="ddp" if cfg.general.gpus > 1 else "auto",
enable_progress_bar=cfg.general.enable_progress_bar,
callbacks=[],
reload_dataloaders_every_n_epochs=0,
logger=[],
)
if not cfg.general.test_only:
print("start testing fit method")
trainer.fit(model, datamodule=datamodule, ckpt_path=cfg.general.resume)
if cfg.general.save_model:
trainer.save_checkpoint(f"checkpoints/{cfg.general.name}/last.ckpt")
trainer.test(model, datamodule=datamodule)
if __name__ == "__main__": if __name__ == "__main__":
main() test()

View File

@@ -1,5 +1,6 @@
### packages for visualization ### packages for visualization
from analysis.rdkit_functions import compute_molecular_metrics from analysis.rdkit_functions import compute_molecular_metrics
from analysis.rdkit_functions import compute_graph_metrics
from mini_moses.metrics.metrics import compute_intermediate_statistics from mini_moses.metrics.metrics import compute_intermediate_statistics
from metrics.property_metric import TaskModel from metrics.property_metric import TaskModel
@@ -23,7 +24,121 @@ def result_to_csv(path, dict_data):
writer.writeheader() writer.writeheader()
writer.writerow(dict_data) writer.writerow(dict_data)
class SamplingGraphMetrics(nn.Module):
def __init__(
self,
dataset_infos,
train_graphs,
reference_graphs,
n_jobs=1,
device="cpu",
batch_size=512,
):
super().__init__()
self.task_name = dataset_infos.task
self.dataset_infos = dataset_infos
self.active_nodes = dataset_infos.active_nodes
self.train_graphs = train_graphs
self.stat_ref = None
self.compute_config = {
"n_jobs": n_jobs,
"device": device,
"batch_size": batch_size,
}
self.task_evaluator = {
'meta_taskname': dataset_infos.task,
# 'sas': None,
# 'scs': None
}
for cur_task in dataset_infos.task.split("-")[:]:
model_path = os.path.join(
dataset_infos.base_path, "data/evaluator", f"{cur_task}.joblib"
)
os.makedirs(os.path.dirname(model_path), exist_ok=True)
evaluator = TaskModel(model_path, cur_task)
self.task_evaluator[cur_task] = evaluator
def forward(self, graphs, targets, name, current_epoch, val_counter, test=False):
test = True
if isinstance(targets, list):
targets_cat = torch.cat(targets, dim=0)
targets_np = targets_cat.detach().cpu().numpy()
else:
targets_np = targets.detach().cpu().numpy()
unique_graphs, all_graphs, all_metrics, targets_log = compute_graph_metrics(
graphs,
targets_np,
self.train_graphs,
self.stat_ref,
self.dataset_infos,
self.task_evaluator,
self.compute_config,
)
print(f"all graphs: {all_graphs}")
print(f"all graphs[0]: {all_graphs[0]}")
tmp_graphs = all_graphs.copy()
str_graphs = []
for graph in tmp_graphs:
node_types = graph[0]
edge_types = graph[1]
node_str = " ".join([str(node) for node in node_types])
edge_str_list = []
for i in range(len(node_types)):
for j in range(len(node_types)):
edge_str_list.append(str(edge_types[i][j]))
edge_str_list.append("/n")
edge_str = " ".join(edge_str_list)
str_graphs.append(f"nodes: {node_str} /n edges: /n{edge_str}")
if test:
file_name = "final_graphs.txt"
with open(file_name, "w") as fp:
all_tasks_name = list(self.task_evaluator.keys())
all_tasks_name = all_tasks_name.copy()
if 'meta_taskname' in all_tasks_name:
all_tasks_name.remove('meta_taskname')
all_tasks_str = "graph, " + ", ".join([f"input_{task}" for task in all_tasks_name] + [f"output_{task}" for task in all_tasks_name])
fp.write(all_tasks_str + "\n")
for i, graph in enumerate(str_graphs):
if targets_log is not None:
all_result_str = f"{graph}, " + ", ".join([f"{targets_log['input_'+task][i]}" for task in all_tasks_name] + [f"{targets_log['output_'+task][i]}" for task in all_tasks_name])
fp.write(all_result_str + "\n")
else:
fp.write("%s\n" % graph)
print("All graphs saved")
else:
result_path = os.path.join(os.getcwd(), f"graphs/{name}")
os.makedirs(result_path, exist_ok=True)
text_path = os.path.join(
result_path,
f"valid_unique_graphs_e{current_epoch}_b{val_counter}.txt",
)
textfile = open(text_path, "w")
for graph in unique_graphs:
textfile.write(graph + "\n")
textfile.close()
all_logs = all_metrics
if test:
all_logs["log_name"] = "test"
else:
all_logs["log_name"] = (
"epoch" + str(current_epoch) + "_batch" + str(val_counter)
)
result_to_csv("output.csv", all_logs)
return str_graphs
def reset(self):
pass
class SamplingMolecularMetrics(nn.Module): class SamplingMolecularMetrics(nn.Module):
def __init__( def __init__(
self, self,
@@ -40,21 +155,21 @@ class SamplingMolecularMetrics(nn.Module):
self.active_atoms = dataset_infos.active_atoms self.active_atoms = dataset_infos.active_atoms
self.train_smiles = train_smiles self.train_smiles = train_smiles
if reference_smiles is not None: # if reference_smiles is not None:
print( # print(
f"--- Computing intermediate statistics for training for #{len(reference_smiles)} smiles ---" # f"--- Computing intermediate statistics for training for #{len(reference_smiles)} smiles ---"
) # )
start_time = time.time() # start_time = time.time()
self.stat_ref = compute_intermediate_statistics( # self.stat_ref = compute_intermediate_statistics(
reference_smiles, n_jobs=n_jobs, device=device, batch_size=batch_size # reference_smiles, n_jobs=n_jobs, device=device, batch_size=batch_size
) # )
end_time = time.time() # end_time = time.time()
elapsed_time = end_time - start_time # elapsed_time = end_time - start_time
print( # print(
f"--- End computing intermediate statistics: using {elapsed_time:.2f}s ---" # f"--- End computing intermediate statistics: using {elapsed_time:.2f}s ---"
) # )
else: # else:
self.stat_ref = None self.stat_ref = None
self.comput_config = { self.comput_config = {
"n_jobs": n_jobs, "n_jobs": n_jobs,

View File

@@ -35,7 +35,13 @@ class CEPerClass(Metric):
def compute(self): def compute(self):
return self.total_ce / self.total_samples return self.total_ce / self.total_samples
class NodeCE(CEPerClass):
def __init__(self, i):
super().__init__(i)
class EdgeCE(CEPerClass):
def __init__(self, i):
super().__init__(i)
class AtomCE(CEPerClass): class AtomCE(CEPerClass):
def __init__(self, i): def __init__(self, i):
@@ -65,6 +71,21 @@ class AromaticCE(CEPerClass):
def __init__(self, i): def __init__(self, i):
super().__init__(i) super().__init__(i)
class NodeMetricsCE(MetricCollection):
def __init__(self, active_nodes):
metrics_list = []
for i, node_type in enumerate(active_nodes) :
metrics_list.append(type(f'{node_type}_CE', (NodeCE,), {})(i))
super().__init__(metrics_list)
class EdgeMetricsCE(MetricCollection):
def __init__(self):
ce_no_bond = NoBondCE(0)
ce_SI = SingleCE(1)
ce_DO = DoubleCE(2)
ce_TR = TripleCE(3)
super().__init__([ce_no_bond, ce_SI])
class AtomMetricsCE(MetricCollection): class AtomMetricsCE(MetricCollection):
def __init__(self, active_atoms): def __init__(self, active_atoms):
@@ -84,7 +105,47 @@ class BondMetricsCE(MetricCollection):
ce_TR = TripleCE(3) ce_TR = TripleCE(3)
super().__init__([ce_no_bond, ce_SI, ce_DO, ce_TR]) super().__init__([ce_no_bond, ce_SI, ce_DO, ce_TR])
# #
class TrainGraphMetricsDiscrete(nn.Module):
def __init__(self, dataset_infos):
super().__init__()
active_nodes = dataset_infos.active_nodes
self.train_node_metrics = NodeMetricsCE(active_nodes=active_nodes)
self.train_edge_metrics = EdgeMetricsCE()
def forward(self, masked_pred_X, masked_pred_E, true_X, true_E, log: bool):
self.train_node_metrics(masked_pred_X, true_X)
self.train_edge_metrics(masked_pred_E, true_E)
if log:
to_log = {}
for key, val in self.train_node_metrics.compute().items():
to_log['train/' + key] = val.item()
for key, val in self.train_edge_metrics.compute().items():
to_log['train/' + key] = val.item()
def reset(self):
for metric in [self.train_node_metrics, self.train_edge_metrics]:
metric.reset()
def log_epoch_metrics(self, current_epoch, log=True):
epoch_node_metrics = self.train_node_metrics.compute()
epoch_edge_metrics = self.train_edge_metrics.compute()
to_log = {}
for key, val in epoch_node_metrics.items():
to_log['train_epoch/' + key] = val.item()
for key, val in epoch_edge_metrics.items():
to_log['train_epoch/' + key] = val.item()
for key, val in epoch_node_metrics.items():
epoch_node_metrics[key] = round(val.item(),4)
for key, val in epoch_edge_metrics.items():
epoch_edge_metrics[key] = round(val.item(),4)
if log:
print(f"Epoch {current_epoch}: {epoch_node_metrics} -- {epoch_edge_metrics}")
class TrainMolecularMetricsDiscrete(nn.Module): class TrainMolecularMetricsDiscrete(nn.Module):
def __init__(self, dataset_infos): def __init__(self, dataset_infos):
super().__init__() super().__init__()

View File

@@ -15,6 +15,17 @@ from rdkit.Chem import AllChem
from rdkit import DataStructs from rdkit import DataStructs
from rdkit.Chem import rdMolDescriptors from rdkit.Chem import rdMolDescriptors
rdBase.DisableLog('rdApp.error') rdBase.DisableLog('rdApp.error')
import json
op_type = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
task_to_colname = { task_to_colname = {
'hiv_b': 'HIV_active', 'hiv_b': 'HIV_active',
@@ -32,8 +43,10 @@ tasktype_name = {
'O2': 'regression', 'O2': 'regression',
'N2': 'regression', 'N2': 'regression',
'CO2': 'regression', 'CO2': 'regression',
'nasbench201': 'regression',
} }
class TaskModel(): class TaskModel():
"""Scores based on an ECFP classifier.""" """Scores based on an ECFP classifier."""
def __init__(self, model_path, task_name): def __init__(self, model_path, task_name):
@@ -55,8 +68,48 @@ class TaskModel():
perfermance = self.train() perfermance = self.train()
dump(self.model, model_path) dump(self.model, model_path)
print('Oracle peformance: ', perfermance) print('Oracle peformance: ', perfermance)
def train(self): def train(self):
def read_adj_ops_from_json(filename):
with open(filename, 'r') as json_file:
data = json.load(json_file)
adj_ops_pairs = []
for item in data:
adj_matrix = np.array(item['adj_matrix'])
ops = item['ops']
acc = item['train'][0]['accuracy']
adj_ops_pairs.append((adj_matrix, ops, acc))
return adj_ops_pairs
def feature_from_adj_and_ops(adj, ops):
return np.concatenate([adj.flatten(), ops])
filename = '/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json'
graphs = read_adj_ops_from_json(filename)
adjs = []
opss = []
accs = []
features = []
for graph in graphs:
adj, ops, acc=graph
op_code = [op_type[op] for op in ops]
adjs.append(adj)
opss.append(op_code)
accs.append(acc)
features.append(feature_from_adj_and_ops(adj, op_code))
features = np.array(features)
labels = np.array(accs)
mask = ~np.isnan(labels)
labels = labels[mask]
features = features[mask]
# features = str(features)
self.model.fit(features, labels)
y_pred = self.model.predict(features)
perf = self.metric_func(labels, y_pred)
print(f'{self.task_name} performance: {perf}')
return perf
def train__(self):
data_path = os.path.dirname(self.model_path) data_path = os.path.dirname(self.model_path)
data_path = os.path.join(os.path.dirname(self.model_path), '..', f'raw/{self.task_name}.csv.gz') data_path = os.path.join(os.path.dirname(self.model_path), '..', f'raw/{self.task_name}.csv.gz')
df = pd.read_csv(data_path) df = pd.read_csv(data_path)
@@ -84,7 +137,7 @@ class TaskModel():
print(f'{self.task_name} performance: {perf}') print(f'{self.task_name} performance: {perf}')
return perf return perf
def __call__(self, smiles_list): def __call(self, smiles_list):
fps = [] fps = []
mask = [] mask = []
for i,smiles in enumerate(smiles_list): for i,smiles in enumerate(smiles_list):
@@ -101,6 +154,54 @@ class TaskModel():
scores = scores * np.array(mask) scores = scores * np.array(mask)
return np.float32(scores) return np.float32(scores)
def __call__(self, graph_list):
# def read_adj_ops_from_json(filename):
# with open(filename, 'r') as json_file:
# data = json.load(json_file)
# adj_ops_pairs = []
# for item in data:
# adj_matrix = np.array(item['adj_matrix'])
# ops = item['ops']
# acc = item['train'][0]['accuracy']
# adj_ops_pairs.append((adj_matrix, ops, acc))
# return adj_ops_pairs
def feature_from_adj_and_ops(ops, adj):
return np.concatenate([adj.flatten(), ops])
# filename = '/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json'
# graphs = read_adj_ops_from_json(filename)
# adjs = []
# opss = []
# accs = []
# features = []
# for graph in graphs:
# adj, ops, acc=graph
# op_code = [op_type[op] for op in ops]
# adjs.append(adj)
# opss.append(op_code)
# accs.append(acc)
features = []
print(f"graphlist: {graph_list[0]}")
print(f"len graphlist: {len(graph_list)}")
for op_code, adj in graph_list:
features.append(feature_from_adj_and_ops(op_code, adj))
print(f"len features: {len(features)}")
# print(f"features: {features[0].shape}")
features = np.stack(features)
features = features.astype(np.float32)
print(f"features shape: {features.shape}")
fps = features
if 'classification' in self.task_type:
scores = self.model.predict_proba(fps)[:, 1]
else:
scores = self.model.predict(fps)
# scores = scores * np.array(mask)
return np.float32(scores)
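The rewritten surrogate here is fit on a flat feature built by concatenating the flattened adjacency matrix with the integer op codes, and the new `__call__` scores a list of `(ops, adj)` pairs the same way. A small sketch of the feature construction on hypothetical inputs (the model itself is whatever estimator `TaskModel` loads):

```python
import numpy as np

def feature_from_adj_and_ops(ops, adj):
    # Flattened adjacency entries followed by the operation codes.
    return np.concatenate([adj.flatten(), ops])

# Hypothetical batch of two 8-node cells.
rng = np.random.default_rng(0)
graph_list = [(rng.integers(1, 8, size=8), rng.integers(0, 2, size=(8, 8)))
              for _ in range(2)]

features = np.stack([feature_from_adj_and_ops(ops, adj) for ops, adj in graph_list])
features = features.astype(np.float32)
print(features.shape)   # (2, 72): 64 adjacency entries + 8 op codes per graph
```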
@classmethod @classmethod
def fingerprints_from_mol(cls, mol): # use ECFP4 def fingerprints_from_mol(cls, mol): # use ECFP4
features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

View File

@@ -87,7 +87,7 @@ class Denoiser(nn.Module):
def forward(self, x, e, node_mask, y, t, unconditioned): def forward(self, x, e, node_mask, y, t, unconditioned):
print("Denoiser Forward") print("Denoiser Forward")
print(x.shape, e.shape, y.shape, t.shape, unconditioned) # print(x.shape, e.shape, y.shape, t.shape, unconditioned)
force_drop_id = torch.zeros_like(y.sum(-1)) force_drop_id = torch.zeros_like(y.sum(-1))
# drop the nan values # drop the nan values
force_drop_id[torch.isnan(y.sum(-1))] = 1 force_drop_id[torch.isnan(y.sum(-1))] = 1
@@ -98,32 +98,32 @@ class Denoiser(nn.Module):
# bs = batch size, n = number of nodes # bs = batch size, n = number of nodes
bs, n, _ = x.size() bs, n, _ = x.size()
x = torch.cat([x, e.reshape(bs, n, -1)], dim=-1) x = torch.cat([x, e.reshape(bs, n, -1)], dim=-1)
print("X after concat with E") # print("X after concat with E")
print(x.shape) # print(x.shape)
# self.x_embedder = nn.Linear(Xdim + max_n_nodes * Edim, hidden_size, bias=False) # self.x_embedder = nn.Linear(Xdim + max_n_nodes * Edim, hidden_size, bias=False)
x = self.x_embedder(x) x = self.x_embedder(x)
print("X after x_embedder") # print("X after x_embedder")
print(x.shape) # print(x.shape)
# self.t_embedder = TimestepEmbedder(hidden_size) # self.t_embedder = TimestepEmbedder(hidden_size)
c1 = self.t_embedder(t) c1 = self.t_embedder(t)
print("C1 after t_embedder") # print("C1 after t_embedder")
print(c1.shape) # print(c1.shape)
for i in range(1, self.ydim): for i in range(1, self.ydim):
if i == 1: if i == 1:
c2 = self.y_embedding_list[i-1](y[:, :2], self.training, force_drop_id, t) c2 = self.y_embedding_list[i-1](y[:, :2], self.training, force_drop_id, t)
else: else:
c2 = c2 + self.y_embedding_list[i-1](y[:, i:i+1], self.training, force_drop_id, t) c2 = c2 + self.y_embedding_list[i-1](y[:, i:i+1], self.training, force_drop_id, t)
print("C2 after y_embedding_list") # print("C2 after y_embedding_list")
print(c2.shape) # print(c2.shape)
print("C1 + C2") # print("C1 + C2")
c = c1 + c2 c = c1 + c2
print(c.shape) # print(c.shape)
for i, block in enumerate(self.encoders): for i, block in enumerate(self.encoders):
x = block(x, c, node_mask) x = block(x, c, node_mask)
print("X after block") # print("X after block")
print(x.shape) # print(x.shape)
# X: B * N * dx, E: B * N * N * de # X: B * N * dx, E: B * N * N * de
X, E, y = self.out_layer(x, x_in, e_in, c, t, node_mask) X, E, y = self.out_layer(x, x_in, e_in, c, t, node_mask)

View File

@@ -46,13 +46,17 @@ def unnormalize(X, E, y, norm_values, norm_biases, node_mask, collapse=False):
def to_dense(x, edge_index, edge_attr, batch, max_num_nodes=None): def to_dense(x, edge_index, edge_attr, batch, max_num_nodes=None):
# print(f"to dense X: {x.shape}, edge_index: {edge_index.shape}, edge_attr: {edge_attr.shape}, batch: {batch}, max_num_nodes: {max_num_nodes}")
X, node_mask = to_dense_batch(x=x, batch=batch, max_num_nodes=max_num_nodes) X, node_mask = to_dense_batch(x=x, batch=batch, max_num_nodes=max_num_nodes)
# node_mask = node_mask.float() # node_mask = node_mask.float()
edge_index, edge_attr = torch_geometric.utils.remove_self_loops(edge_index, edge_attr) edge_index, edge_attr = torch_geometric.utils.remove_self_loops(edge_index, edge_attr)
if max_num_nodes is None: if max_num_nodes is None:
max_num_nodes = X.size(1) max_num_nodes = X.size(1)
# print(f"to dense X: {X.shape}, edge_index: {edge_index.shape}, edge_attr: {edge_attr.shape}, batch: {batch}, max_num_nodes: {max_num_nodes}")
E = to_dense_adj(edge_index=edge_index, batch=batch, edge_attr=edge_attr, max_num_nodes=max_num_nodes) E = to_dense_adj(edge_index=edge_index, batch=batch, edge_attr=edge_attr, max_num_nodes=max_num_nodes)
E = encode_no_edge(E) E = encode_no_edge(E)
# print(f"to dense X: {X.shape}, edge_index: {edge_index.shape}, edge_attr: {edge_attr.shape}, batch: {batch}, max_num_nodes: {max_num_nodes}")
# print(f"to dense X: {X.shape}, E: {E.shape}, batch: {batch}, lenE: {len(E)}")
return PlaceHolder(X=X, E=E, y=None), node_mask return PlaceHolder(X=X, E=E, y=None), node_mask
@@ -119,6 +123,7 @@ class PlaceHolder:
x_mask = node_mask.unsqueeze(-1) # bs, n, 1 x_mask = node_mask.unsqueeze(-1) # bs, n, 1
e_mask1 = x_mask.unsqueeze(2) # bs, n, 1, 1 e_mask1 = x_mask.unsqueeze(2) # bs, n, 1, 1
e_mask2 = x_mask.unsqueeze(1) # bs, 1, n, 1 e_mask2 = x_mask.unsqueeze(1) # bs, 1, n, 1
# print(f"mask X: {self.X.shape}, E: {self.E.shape}, node_mask: {node_mask.shape}, x_mask: {x_mask.shape}, e_mask1: {e_mask1.shape}, e_mask2: {e_mask2.shape}")
if collapse: if collapse:
self.X = torch.argmax(self.X, dim=-1) self.X = torch.argmax(self.X, dim=-1)
@@ -127,8 +132,13 @@ class PlaceHolder:
self.X[node_mask == 0] = - 1 self.X[node_mask == 0] = - 1
self.E[(e_mask1 * e_mask2).squeeze(-1) == 0] = - 1 self.E[(e_mask1 * e_mask2).squeeze(-1) == 0] = - 1
else: else:
# print(f"X: {self.X.shape}, E: {self.E.shape}")
# print(f"X: {self.X}, E: {self.E}")
# print(f"x_mask: {x_mask}, e_mask1: {e_mask1}, e_mask2: {e_mask2}")
self.X = self.X * x_mask self.X = self.X * x_mask
self.E = self.E * e_mask1 * e_mask2 self.E = self.E * e_mask1 * e_mask2
# print(f"X: {self.X.shape}, E: {self.E.shape}")
# print(f"X: {self.X}, E: {self.E}")
assert torch.allclose(self.E, torch.transpose(self.E, 1, 2)) assert torch.allclose(self.E, torch.transpose(self.E, 1, 2))
return self return self

374
graph_dit/workingdoc.md Normal file
View File

@@ -0,0 +1,374 @@
20240626
**dataset.py**
## class DataModule:
### def prepare_data(self) -> None:
```python
dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None)
```
## class Dataset:
`Dataset` is a subclass of `InMemoryDataset` from the `torch_geometric.data` module:
```python
super().__init__(root, transform, pre_transform)
```
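As background for the printouts shown under `__init__` below: `InMemoryDataset` stores all graphs concatenated inside `self.data`, and `self.slices[key]` holds the cumulative boundaries used to cut graph `i` back out. A simplified sketch of that lookup (the real `get()` in PyTorch Geometric handles more cases):
```python
# Simplified view of how one graph is recovered from the collated storage.
def get_graph(data, slices, i):
    x = data.x[slices['x'][i]:slices['x'][i + 1]]
    edge_index = data.edge_index[:, slices['edge_index'][i]:slices['edge_index'][i + 1]]
    edge_attr = data.edge_attr[slices['edge_attr'][i]:slices['edge_attr'][i + 1]]
    y = data.y[slices['y'][i]:slices['y'][i + 1]]
    return x, edge_index, edge_attr, y
```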
### def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None):
Initializes the NAS-Bench API and loads the dataset.
`self.data` and `self.slices` are built from the NAS-Bench API.
Printing `self.data` and `self.slices` gives:
```text
self.data=Data(x=[125000], edge_index=[2, 93750], edge_attr=[93750, 1], y=[15625, 5]),
self.slices=defaultdict(<class 'dict'>, {'x': tensor([ 0, 8, 16, ..., 124984, 124992, 125000]),
'edge_index': tensor([ 0, 6, 12, ..., 93738, 93744, 93750]),
'edge_attr': tensor([ 0, 6, 12, ..., 93738, 93744, 93750]),
'y': tensor([ 0, 1, 2, ..., 15623, 15624, 15625])})
```
```
The original DiT dataset, for comparison, gives:
<details>
<summary>click to expand</summary>
```text
self.data: Data(x=[16599], edge_index=[2, 36132], edge_attr=[36132], y=[553, 5], idx=[553]),
self.slices: defaultdict(<class 'dict'>, {'x': tensor([ 0, 9, 32, 56, 78, 113, 138, 157, 173, 203,
219, 262, 299, 326, 350, 391, 432, 466, 502, 528,
547, 583, 589, 599, 609, 622, 637, 652, 664, 675,
687, 703, 715, 719, 729, 735, 742, 749, 754, 770,
799, 836, 840, 846, 878, 917, 954, 991, 1022, 1065,
1096, 1127, 1158, 1197, 1228, 1259, 1298, 1343, 1381, 1412,
1457, 1500, 1539, 1583, 1601, 1640, 1679, 1688, 1733, 1768,
1797, 1830, 1870, 1901, 1926, 1950, 1971, 2000, 2043, 2090,
2137, 2176, 2220, 2261, 2301, 2337, 2378, 2420, 2460, 2508,
2549, 2580, 2615, 2644, 2673, 2717, 2748, 2780, 2809, 2833,
2865, 2875, 2884, 2923, 2966, 3006, 3017, 3028, 3048, 3079,
3090, 3101, 3110, 3115, 3129, 3155, 3173, 3194, 3224, 3256,
3299, 3336, 3379, 3426, 3470, 3514, 3542, 3566, 3587, 3615,
3636, 3664, 3684, 3698, 3728, 3754, 3780, 3796, 3808, 3825,
3853, 3881, 3906, 3924, 3949, 3982, 4007, 4044, 4092, 4128,
4174, 4209, 4244, 4292, 4339, 4372, 4419, 4468, 4502, 4545,
4588, 4609, 4638, 4688, 4717, 4767, 4814, 4861, 4894, 4935,
4985, 5023, 5057, 5080, 5114, 5148, 5156, 5202, 5251, 5299,
5348, 5367, 5376, 5386, 5400, 5413, 5429, 5448, 5474, 5485,
5497, 5513, 5519, 5558, 5592, 5631, 5656, 5679, 5705, 5734,
5763, 5788, 5813, 5838, 5863, 5887, 5911, 5935, 5959, 5982,
6027, 6072, 6097, 6133, 6179, 6211, 6250, 6297, 6338, 6370,
6411, 6458, 6493, 6528, 6576, 6615, 6654, 6694, 6733, 6760,
6784, 6816, 6861, 6907, 6944, 6982, 7027, 7073, 7117, 7160,
7203, 7243, 7283, 7323, 7360, 7407, 7421, 7457, 7502, 7543,
7563, 7585, 7605, 7624, 7665, 7706, 7717, 7754, 7791, 7815,
7839, 7863, 7908, 7952, 7984, 8010, 8035, 8073, 8117, 8143,
8189, 8223, 8271, 8306, 8353, 8378, 8415, 8449, 8483, 8520,
8558, 8605, 8636, 8680, 8727, 8762, 8809, 8823, 8854, 8879,
8915, 8954, 8997, 9039, 9078, 9120, 9153, 9195, 9237, 9272,
9314, 9356, 9389, 9400, 9449, 9497, 9536, 9571, 9621, 9662,
9698, 9740, 9783, 9825, 9871, 9915, 9965, 10006, 10050, 10097,
10138, 10188, 10232, 10256, 10267, 10287, 10301, 10344, 10358, 10373,
10387, 10429, 10441, 10472, 10480, 10500, 10510, 10521, 10533, 10552,
10583, 10604, 10621, 10635, 10647, 10660, 10676, 10697, 10728, 10758,
10789, 10799, 10813, 10821, 10833, 10850, 10858, 10869, 10917, 10926,
10936, 10948, 10972, 10992, 11011, 11024, 11040, 11059, 11065, 11072,
11082, 11099, 11107, 11115, 11124, 11135, 11147, 11167, 11178, 11184,
11198, 11209, 11220, 11226, 11242, 11248, 11261, 11269, 11288, 11308,
11335, 11363, 11392, 11415, 11435, 11449, 11475, 11518, 11537, 11550,
11565, 11578, 11599, 11614, 11631, 11674, 11717, 11738, 11771, 11811,
11831, 11846, 11855, 11869, 11877, 11884, 11889, 11895, 11917, 11956,
11968, 11979, 12018, 12029, 12036, 12041, 12051, 12061, 12067, 12086,
12109, 12134, 12165, 12191, 12232, 12273, 12307, 12354, 12385, 12433,
12472, 12518, 12565, 12611, 12657, 12703, 12742, 12788, 12836, 12882,
12928, 12967, 13013, 13052, 13091, 13130, 13173, 13214, 13264, 13314,
13364, 13403, 13450, 13497, 13544, 13589, 13634, 13677, 13717, 13752,
13793, 13828, 13855, 13890, 13925, 13965, 14011, 14057, 14105, 14151,
14190, 14238, 14285, 14313, 14343, 14362, 14405, 14443, 14475, 14509,
14537, 14553, 14597, 14636, 14647, 14658, 14705, 14736, 14786, 14810,
14830, 14849, 14869, 14910, 14950, 14956, 14994, 15042, 15077, 15125,
15172, 15196, 15204, 15236, 15268, 15311, 15350, 15393, 15433, 15466,
15499, 15530, 15565, 15600, 15639, 15675, 15711, 15750, 15782, 15814,
15849, 15882, 15921, 15956, 15999, 16038, 16078, 16118, 16155, 16182,
16209, 16235, 16261, 16290, 16319, 16345, 16370, 16415, 16460, 16503,
16530, 16539, 16585, 16599]),
'edge_index': tensor([ 0, 18, 70, 124, 172, 250, 308, 350, 384, 454,
488, 590, 674, 736, 790, 882, 976, 1050, 1134, 1194,
1236, 1318, 1328, 1348, 1368, 1394, 1428, 1462, 1488, 1510,
1534, 1570, 1594, 1600, 1620, 1630, 1642, 1654, 1662, 1694,
1762, 1850, 1856, 1866, 1938, 2026, 2110, 2194, 2264, 2362,
2432, 2502, 2572, 2662, 2732, 2802, 2890, 2992, 3078, 3148,
3250, 3348, 3436, 3536, 3574, 3662, 3750, 3768, 3870, 3946,
4008, 4080, 4168, 4238, 4294, 4346, 4392, 4456, 4556, 4664,
4768, 4856, 4956, 5050, 5140, 5220, 5312, 5406, 5496, 5604,
5698, 5766, 5844, 5908, 5972, 6072, 6140, 6210, 6274, 6326,
6396, 6414, 6432, 6520, 6616, 6704, 6726, 6748, 6790, 6850,
6872, 6894, 6910, 6918, 6946, 6998, 7034, 7076, 7144, 7212,
7308, 7386, 7476, 7580, 7676, 7772, 7832, 7882, 7926, 7984,
8028, 8086, 8128, 8156, 8220, 8276, 8330, 8360, 8382, 8416,
8474, 8532, 8584, 8620, 8672, 8742, 8794, 8878, 8984, 9062,
9164, 9240, 9320, 9426, 9532, 9606, 9710, 9818, 9892, 9986,
10080, 10124, 10186, 10298, 10360, 10472, 10578, 10684, 10756, 10846,
10956, 11040, 11114, 11162, 11236, 11310, 11324, 11424, 11532, 11638,
11744, 11784, 11800, 11818, 11844, 11868, 11898, 11934, 11988, 12008,
12030, 12060, 12070, 12156, 12230, 12316, 12370, 12416, 12474, 12538,
12602, 12658, 12712, 12766, 12820, 12872, 12924, 12976, 13028, 13078,
13180, 13282, 13338, 13418, 13518, 13588, 13674, 13776, 13866, 13936,
14028, 14136, 14214, 14292, 14400, 14488, 14576, 14666, 14754, 14814,
14866, 14940, 15038, 15140, 15224, 15310, 15410, 15512, 15610, 15708,
15802, 15890, 15978, 16066, 16144, 16248, 16276, 16354, 16454, 16548,
16590, 16636, 16678, 16718, 16808, 16898, 16920, 17000, 17080, 17132,
17184, 17236, 17336, 17434, 17504, 17560, 17612, 17694, 17788, 17844,
17948, 18022, 18128, 18204, 18306, 18358, 18436, 18508, 18580, 18660,
18742, 18844, 18908, 19002, 19106, 19182, 19286, 19314, 19382, 19436,
19514, 19600, 19696, 19790, 19876, 19968, 20042, 20134, 20226, 20304,
20396, 20488, 20562, 20584, 20696, 20802, 20890, 20968, 21078, 21170,
21248, 21342, 21438, 21536, 21644, 21748, 21860, 21954, 22054, 22162,
22254, 22366, 22464, 22516, 22538, 22580, 22608, 22704, 22732, 22762,
22788, 22882, 22906, 22976, 22990, 23032, 23050, 23070, 23092, 23130,
23192, 23232, 23264, 23290, 23312, 23336, 23366, 23408, 23472, 23534,
23598, 23616, 23642, 23656, 23678, 23712, 23726, 23746, 23854, 23870,
23888, 23912, 23960, 24000, 24038, 24062, 24092, 24128, 24138, 24150,
24168, 24202, 24218, 24232, 24248, 24270, 24294, 24332, 24354, 24364,
24392, 24412, 24434, 24444, 24476, 24486, 24512, 24526, 24564, 24606,
24666, 24728, 24794, 24844, 24886, 24914, 24970, 25062, 25104, 25130,
25164, 25192, 25236, 25266, 25300, 25388, 25476, 25520, 25594, 25684,
25724, 25754, 25770, 25798, 25812, 25824, 25832, 25842, 25888, 25976,
26000, 26022, 26110, 26132, 26144, 26152, 26170, 26188, 26198, 26236,
26284, 26338, 26406, 26462, 26552, 26644, 26718, 26822, 26886, 26992,
27078, 27182, 27288, 27390, 27492, 27594, 27680, 27782, 27890, 27992,
28094, 28180, 28282, 28368, 28454, 28542, 28638, 28730, 28840, 28950,
29060, 29146, 29250, 29354, 29458, 29558, 29658, 29752, 29838, 29912,
30000, 30074, 30130, 30204, 30278, 30364, 30468, 30570, 30676, 30778,
30864, 30972, 31076, 31136, 31194, 31232, 31326, 31408, 31476, 31550,
31610, 31640, 31736, 31824, 31846, 31870, 31974, 32042, 32148, 32204,
32248, 32290, 32334, 32424, 32512, 32522, 32608, 32714, 32790, 32900,
33008, 33058, 33072, 33142, 33212, 33312, 33394, 33490, 33578, 33648,
33718, 33784, 33858, 33932, 34020, 34100, 34180, 34262, 34330, 34398,
34472, 34542, 34624, 34698, 34794, 34882, 34970, 35058, 35140, 35200,
35260, 35318, 35376, 35440, 35504, 35562, 35618, 35722, 35826, 35926,
35982, 36000, 36104, 36132]),
'edge_attr': tensor([ 0, 18, 70, 124, 172, 250, 308, 350, 384, 454,
488, 590, 674, 736, 790, 882, 976, 1050, 1134, 1194,
1236, 1318, 1328, 1348, 1368, 1394, 1428, 1462, 1488, 1510,
1534, 1570, 1594, 1600, 1620, 1630, 1642, 1654, 1662, 1694,
1762, 1850, 1856, 1866, 1938, 2026, 2110, 2194, 2264, 2362,
2432, 2502, 2572, 2662, 2732, 2802, 2890, 2992, 3078, 3148,
3250, 3348, 3436, 3536, 3574, 3662, 3750, 3768, 3870, 3946,
4008, 4080, 4168, 4238, 4294, 4346, 4392, 4456, 4556, 4664,
4768, 4856, 4956, 5050, 5140, 5220, 5312, 5406, 5496, 5604,
5698, 5766, 5844, 5908, 5972, 6072, 6140, 6210, 6274, 6326,
6396, 6414, 6432, 6520, 6616, 6704, 6726, 6748, 6790, 6850,
6872, 6894, 6910, 6918, 6946, 6998, 7034, 7076, 7144, 7212,
7308, 7386, 7476, 7580, 7676, 7772, 7832, 7882, 7926, 7984,
8028, 8086, 8128, 8156, 8220, 8276, 8330, 8360, 8382, 8416,
8474, 8532, 8584, 8620, 8672, 8742, 8794, 8878, 8984, 9062,
9164, 9240, 9320, 9426, 9532, 9606, 9710, 9818, 9892, 9986,
10080, 10124, 10186, 10298, 10360, 10472, 10578, 10684, 10756, 10846,
10956, 11040, 11114, 11162, 11236, 11310, 11324, 11424, 11532, 11638,
11744, 11784, 11800, 11818, 11844, 11868, 11898, 11934, 11988, 12008,
12030, 12060, 12070, 12156, 12230, 12316, 12370, 12416, 12474, 12538,
12602, 12658, 12712, 12766, 12820, 12872, 12924, 12976, 13028, 13078,
13180, 13282, 13338, 13418, 13518, 13588, 13674, 13776, 13866, 13936,
14028, 14136, 14214, 14292, 14400, 14488, 14576, 14666, 14754, 14814,
14866, 14940, 15038, 15140, 15224, 15310, 15410, 15512, 15610, 15708,
15802, 15890, 15978, 16066, 16144, 16248, 16276, 16354, 16454, 16548,
16590, 16636, 16678, 16718, 16808, 16898, 16920, 17000, 17080, 17132,
17184, 17236, 17336, 17434, 17504, 17560, 17612, 17694, 17788, 17844,
17948, 18022, 18128, 18204, 18306, 18358, 18436, 18508, 18580, 18660,
18742, 18844, 18908, 19002, 19106, 19182, 19286, 19314, 19382, 19436,
19514, 19600, 19696, 19790, 19876, 19968, 20042, 20134, 20226, 20304,
20396, 20488, 20562, 20584, 20696, 20802, 20890, 20968, 21078, 21170,
21248, 21342, 21438, 21536, 21644, 21748, 21860, 21954, 22054, 22162,
22254, 22366, 22464, 22516, 22538, 22580, 22608, 22704, 22732, 22762,
22788, 22882, 22906, 22976, 22990, 23032, 23050, 23070, 23092, 23130,
23192, 23232, 23264, 23290, 23312, 23336, 23366, 23408, 23472, 23534,
23598, 23616, 23642, 23656, 23678, 23712, 23726, 23746, 23854, 23870,
23888, 23912, 23960, 24000, 24038, 24062, 24092, 24128, 24138, 24150,
24168, 24202, 24218, 24232, 24248, 24270, 24294, 24332, 24354, 24364,
24392, 24412, 24434, 24444, 24476, 24486, 24512, 24526, 24564, 24606,
24666, 24728, 24794, 24844, 24886, 24914, 24970, 25062, 25104, 25130,
25164, 25192, 25236, 25266, 25300, 25388, 25476, 25520, 25594, 25684,
25724, 25754, 25770, 25798, 25812, 25824, 25832, 25842, 25888, 25976,
26000, 26022, 26110, 26132, 26144, 26152, 26170, 26188, 26198, 26236,
26284, 26338, 26406, 26462, 26552, 26644, 26718, 26822, 26886, 26992,
27078, 27182, 27288, 27390, 27492, 27594, 27680, 27782, 27890, 27992,
28094, 28180, 28282, 28368, 28454, 28542, 28638, 28730, 28840, 28950,
29060, 29146, 29250, 29354, 29458, 29558, 29658, 29752, 29838, 29912,
30000, 30074, 30130, 30204, 30278, 30364, 30468, 30570, 30676, 30778,
30864, 30972, 31076, 31136, 31194, 31232, 31326, 31408, 31476, 31550,
31610, 31640, 31736, 31824, 31846, 31870, 31974, 32042, 32148, 32204,
32248, 32290, 32334, 32424, 32512, 32522, 32608, 32714, 32790, 32900,
33008, 33058, 33072, 33142, 33212, 33312, 33394, 33490, 33578, 33648,
33718, 33784, 33858, 33932, 34020, 34100, 34180, 34262, 34330, 34398,
34472, 34542, 34624, 34698, 34794, 34882, 34970, 35058, 35140, 35200,
35260, 35318, 35376, 35440, 35504, 35562, 35618, 35722, 35826, 35926,
35982, 36000, 36104, 36132]),
'y': tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265,
266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293,
294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,
308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321,
322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335,
336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405,
406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447,
448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461,
462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475,
476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503,
504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517,
518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
546, 547, 548, 549, 550, 551, 552, 553]),
'idx': tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265,
266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279,
280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293,
294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,
308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321,
322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335,
336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349,
350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391,
392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405,
406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447,
448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461,
462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475,
476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489,
490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503,
504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517,
518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
546, 547, 548, 549, 550, 551, 552, 553])})
```
</details>
The keys differ between the two datasets:
- `self.data`
  - `edge_attr`: in nasbenchDiT it has shape (num_edges, 1), while in DiT it has shape (num_edges,).
  - `idx`: this key does not exist in nasbenchDiT.
- `self.slices`: same differences as `self.data`.
So `self.data` and `self.slices` are modified to be compatible with the DiT dataset, as sketched below.
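A minimal sketch of that compatibility fix (the helper name `make_compatible` is hypothetical; it assumes `data` is the collated `Data` object and `slices` the dictionary shown above):
```python
import torch

def make_compatible(data, slices, num_graphs):
    # edge_attr: (num_edges, 1) in nasbenchDiT -> (num_edges,) as expected by DiT.
    if data.edge_attr.dim() == 2 and data.edge_attr.size(-1) == 1:
        data.edge_attr = data.edge_attr.squeeze(-1)
    # idx: one integer per graph, with trivial slice boundaries.
    data.idx = torch.arange(num_graphs)
    slices['idx'] = torch.arange(num_graphs + 1)
    return data, slices
```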
## DataInfos
`DataInfos` now needs a dataset instance at initialization, because the NAS-Bench API is attached to the dataset:
```python
self.api = dataset.api
```
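A sketch of what the constructor might look like; only the `self.api = dataset.api` line is taken from the code above, the rest of the signature is assumed:
```python
class DataInfos(AbstractDatasetInfos):
    def __init__(self, datamodule, cfg, dataset):
        # Keep a handle to the NAS-Bench API so it does not have to be re-created later.
        self.api = dataset.api
        self.datamodule = datamodule
        self.cfg = cfg
```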
## get_train_smiles() vs. get_train_graphs()
Since molecules are being removed from the pipeline, `get_train_smiles()` is rewritten as `get_train_graphs()`, which returns graphs instead of SMILES strings. A sketch follows.
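A minimal sketch of `get_train_graphs()`, assuming the datamodule exposes the usual train/test dataloaders of PyG batches (the repo's version may collect the splits differently):
```python
def get_train_graphs(self):
    # Return lists of torch_geometric.data.Data objects instead of SMILES strings.
    train_graphs, test_graphs = [], []
    for batch in self.train_dataloader():
        train_graphs.extend(batch.to_data_list())
    for batch in self.test_dataloader():
        test_graphs.extend(batch.to_data_list())
    return train_graphs, test_graphs
```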
20240628
## Make the dataset ready for DiT
Changed the `process` function in the `Dataset` class to be compatible with the DiT pipeline (code below).
<details>
<summary>click to expand</summary>
```python
def process(self):
    data_list = []
    len_data = len(self.api)

    # Convert one (adjacency matrix, ops list) pair into a PyG Data object.
    # Note: `i` is captured from the enclosing loop below and stored as the graph index.
    def graph_to_graph_data(graph):
        ops = graph[1]
        adj = graph[0]
        nodes = []
        for op in ops:
            nodes.append(op_type[op])
        x = torch.LongTensor(nodes)
        edges_list = []
        edge_type = []
        # Add both directions for every adjacency entry so the stored graph is symmetric.
        for start in range(len(ops)):
            for end in range(len(ops)):
                if adj[start][end] == 1:
                    edges_list.append((start, end))
                    edge_type.append(1)
                    edges_list.append((end, start))
                    edge_type.append(1)
        edge_index = torch.tensor(edges_list, dtype=torch.long).t()
        edge_type = torch.tensor(edge_type, dtype=torch.long)
        edge_attr = edge_type
        y = torch.tensor([0], dtype=torch.float).view(1, -1)
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, idx=i)
        return data

    with tqdm(total=len_data) as pbar:
        active_nodes = set()
        for i in range(len_data):
            arch_info = self.api.query_meta_info_by_index(i)
            nodes, edges = parse_architecture_string(arch_info.arch_str)
            adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
            for op in ops:
                if op not in active_nodes:
                    active_nodes.add(op)
            data = graph_to_graph_data((adj_matrix, ops))
            data_list.append(data)
            pbar.update(1)

    torch.save(self.collate(data_list), self.processed_paths[0])
```
</details>
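Once `process()` has written the collated file, later runs can skip the NAS-Bench API entirely; in the standard `InMemoryDataset` pattern the constructor simply loads the processed `.pt` file (a sketch, assuming the default processed path):
```python
import torch
from torch_geometric.data import InMemoryDataset

class Dataset(InMemoryDataset):
    def __init__(self, source, root, target_prop=None,
                 transform=None, pre_transform=None, pre_filter=None):
        self.source = source
        self.target_prop = target_prop
        super().__init__(root, transform, pre_transform)
        # If processed_paths[0] already exists, process() is not called again;
        # the collated tensors and slice dictionary are read straight from disk.
        self.data, self.slices = torch.load(self.processed_paths[0])
```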
20240629
## Stop loading NAS-Bench-201 every time; read the data from the processed .pt file instead