9 Commits

Author SHA1 Message Date
mhz
7274b3f606 update the taskmodel 2024-06-30 16:39:42 +02:00
mhz
66fe70028e no need to read the api again and again 2024-06-29 17:16:08 +02:00
mhz
df26eef77c update the new graph to json function 2024-06-28 16:29:43 +02:00
mhz
222470a43c rewrite to graph metrics 2024-06-27 20:44:04 +02:00
mhz
a7f7010da7 write graph code for the abstract dataset 2024-06-26 23:42:01 +02:00
mhz
14186fa97f write test code 2024-06-26 23:41:37 +02:00
mhz
a222c514d9 add get_train_graphs 2024-06-26 22:42:06 +02:00
mhz
062a27b83f try update the api in DataInfo 2024-06-26 22:10:07 +02:00
mhz
0c7c525680 try update the api in DataInfo 2024-06-26 22:09:46 +02:00
6 changed files with 443 additions and 109 deletions

View File

@@ -118,6 +118,21 @@ class AbstractDatasetInfos:
example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=118).float()[:, self.active_index]
example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=10).float()
self.input_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1),
'y': example_batch['y'].size(1)}
self.output_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1),
'y': example_batch['y'].size(1)}
print('input dims')
print(self.input_dims)
print('output dims')
print(self.output_dims)
def compute_graph_input_output_dims(self, datamodule):
example_batch = datamodule.example_batch()
example_batch_x = torch.nn.functional.one_hot(example_batch.x, num_classes=8).float()[:, self.active_index]
example_batch_edge_attr = torch.nn.functional.one_hot(example_batch.edge_attr, num_classes=2).float()
self.input_dims = {'X': example_batch_x.size(1),
'E': example_batch_edge_attr.size(1),
'y': example_batch['y'].size(1)}

View File

@@ -39,6 +39,16 @@ op_to_atom = {
'none': 'S', # Sulfur for no operation
'output': 'He' # Helium for output
}
# Integer code assigned to each operation name that can label a graph node.
# Codes start at 1; no operation in this table is assigned code 0.
op_type = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
class DataModule(AbstractDataModule):
def __init__(self, cfg):
self.datadir = cfg.dataset.datadir
@@ -50,12 +60,12 @@ class DataModule(AbstractDataModule):
def prepare_data(self) -> None:
target = getattr(self.cfg.dataset, 'guidance_target', None)
print("target", target)
print("target", target) # nasbench-201
# try:
# base_path = pathlib.Path(os.path.realpath(__file__)).parents[2]
# except NameError:
# base_path = pathlib.Path(os.getcwd()).parent[2]
base_path = '/home/stud/hanzhang/Graph-Dit'
base_path = '/home/stud/hanzhang/nasbenchDiT'
root_path = os.path.join(base_path, self.datadir)
self.root_path = root_path
@@ -68,13 +78,16 @@ class DataModule(AbstractDataModule):
# Dataset has target property, root path, and transform
source = './NAS-Bench-201-v1_1-096897.pth'
dataset = Dataset(source=source, root=root_path, target_prop=target, transform=None)
self.dataset = dataset
# self.api = dataset.api
# if len(self.task.split('-')) == 2:
# train_index, val_index, test_index, unlabeled_index = self.fixed_split(dataset)
# else:
train_index, val_index, test_index, unlabeled_index = self.random_data_split(dataset)
self.train_index, self.val_index, self.test_index, self.unlabeled_index = train_index, val_index, test_index, unlabeled_index
self.train_index, self.val_index, self.test_index, self.unlabeled_index = (
train_index, val_index, test_index, unlabeled_index)
train_index, val_index, test_index, unlabeled_index = torch.LongTensor(train_index), torch.LongTensor(val_index), torch.LongTensor(test_index), torch.LongTensor(unlabeled_index)
if len(unlabeled_index) > 0:
train_index = torch.cat([train_index, unlabeled_index], dim=0)
@@ -175,6 +188,27 @@ class DataModule(AbstractDataModule):
smiles = Chem.MolToSmiles(mol)
return smiles
def get_train_graphs(self):
    """Return the training and test graphs as two plain lists.

    Returns:
        tuple[list, list]: ``(train_graphs, test_graphs)``, built by
        iterating ``self.train_dataset`` and ``self.test_dataset`` once each,
        in iteration order.
    """
    # list() materialises each dataset directly instead of appending
    # element by element in a hand-written loop.
    return list(self.train_dataset), list(self.test_dataset)
# def get_train_smiles(self):
# filename = f'{self.task}.csv.gz'
# df = pd.read_csv(f'{self.root_path}/raw/{filename}')
# df_test = df.iloc[self.test_index]
# df = df.iloc[self.train_index]
# smiles_list = df['smiles'].tolist()
# smiles_list_test = df_test['smiles'].tolist()
# smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list]
# smiles_list_test = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiles_list_test]
# return smiles_list, smiles_list_test
def get_train_smiles(self):
train_smiles = []
test_smiles = []
@@ -319,6 +353,121 @@ class DataModule_original(AbstractDataModule):
def test_dataloader(self):
return self.test_loader
def new_graphs_to_json(graphs, filename, op_codes=None):
    """Compute dataset-level graph statistics and write them to JSON.

    Every per-type statistic is indexed by the integer operation code, so
    position ``k`` of ``node_type_list`` / ``valencies`` / ``transition_E``
    refers to the operation whose code is ``k``. Slot 0 is not used by any
    operation code and is reserved for the wildcard node ``'*'``.

    Args:
        graphs: Sequence of graphs; ``graph[0]`` is the adjacency matrix
            (a non-zero ``adj[i][j]`` with ``i != j`` is an edge) and
            ``graph[1]`` is the list of operation names, one per node.
        filename: Output prefix; the result is written to
            ``f'{filename}.meta.json'``.
        op_codes: Optional mapping from operation name to a positive integer
            code. Defaults to the module-level ``op_type`` table.

    Returns:
        dict: The metadata dictionary that was written to disk.

    Raises:
        ValueError: If ``graphs`` is empty, or if a graph contains more
            edges between two node types than there are node pairs of
            those types (e.g. an adjacency matrix that stores every edge in
            both directions).
    """
    if op_codes is None:
        op_codes = op_type
    num_graph = len(graphs)
    if num_graph == 0:
        raise ValueError('graphs must contain at least one graph')

    source_name = "nasbench-201"
    num_types = max(op_codes.values(), default=0) + 1

    # Name table aligned with the codes: node_name_list[code] is the name.
    node_name_list = ['*'] * num_types
    for name, code in op_codes.items():
        node_name_list[code] = name

    node_count = np.zeros(num_types)
    valencies = np.zeros(num_types)
    edge_count_list = np.zeros(10)
    transition_E = np.zeros((num_types, num_types, 2))
    n_node_list = []
    n_edge_list = []
    bond_index = 1  # the only "edge present" type; index 0 means "no edge"

    for graph in graphs:
        adj = np.asarray(graph[0])
        ops = graph[1]
        n_node = len(ops)
        codes = [0 if op == '*' else op_codes[op] for op in ops]

        cur_node_count_arr = np.zeros(num_types)
        for op, code in zip(ops, codes):
            node_count[code] += 1
            cur_node_count_arr[code] += 1
            if op != '*':
                valencies[code] += 1

        transition_E_temp = np.zeros((num_types, num_types, 2))
        n_edge = 0
        for i in range(n_node):
            for j in range(n_node):
                if i == j or adj[i][j] == 0:
                    continue
                n_edge += 1
                start_index, end_index = codes[i], codes[j]
                # Each stored edge is counted for both directions.
                edge_count_list[bond_index] += 2
                transition_E_temp[start_index, end_index, bond_index] += 2
                transition_E_temp[end_index, start_index, bond_index] += 2
        transition_E += transition_E_temp

        n_node_list.append(n_node)
        n_edge_list.append(n_edge)
        edge_count_list[0] += n_node * (n_node - 1) - n_edge * 2

        # Number of ordered node pairs for every pair of node types
        # (scaled by 2 to match the edge increments above).
        cur_tot_edge = np.outer(cur_node_count_arr, cur_node_count_arr) * 2
        cur_tot_edge = cur_tot_edge - np.diag(cur_node_count_arr) * 2
        used_edge = transition_E_temp.sum(axis=-1)
        if (used_edge > cur_tot_edge).any():
            raise ValueError(
                'graph has more edges between two node types than node pairs; '
                'each edge must appear only once in the adjacency matrix')
        transition_E[:, :, 0] += cur_tot_edge - used_edge

    # Histogram over node counts; at least 51 bins, longer if a graph is bigger.
    n_nodes_per_graph = np.bincount(n_node_list, minlength=51)
    n_nodes_per_graph = (n_nodes_per_graph / np.sum(n_nodes_per_graph)).tolist()

    node_count_list = node_count / np.sum(node_count)

    print('processed meta info: ------', filename, '------')
    print('len node_count_list', len(node_count_list))
    print('len node_name_list', len(node_name_list))

    active_nodes = [name for name, freq in zip(node_name_list, node_count_list) if freq > 0]
    node_count_list = node_count_list.tolist()

    edge_count_list = (edge_count_list / np.sum(edge_count_list)).tolist()
    valencies = (valencies / np.sum(valencies)).tolist()

    # Type pairs that never co-occur get all their mass on "no edge" so the
    # normalisation below never divides by zero.
    no_edge = np.sum(transition_E, axis=-1) == 0
    transition_E[no_edge, 0] = 1
    transition_E = transition_E / np.sum(transition_E, axis=-1, keepdims=True)

    meta_dict = {
        'source': source_name,
        'num_graph': num_graph,
        'n_nodes_per_graph': n_nodes_per_graph,
        'max_n_nodes': max(n_node_list),
        'max_n_edges': max(n_edge_list),
        'node_type_list': node_count_list,
        'edge_type_list': edge_count_list,
        'valencies': valencies,
        'active_nodes': active_nodes,
        'num_active_nodes': len(active_nodes),
        'transition_E': transition_E.tolist(),
    }

    with open(f'{filename}.meta.json', 'w') as f:
        json.dump(meta_dict, f)

    return meta_dict
def graphs_to_json(graphs, filename):
bonds = {
'nor_conv_1x1': 1,
@@ -466,7 +615,7 @@ def graphs_to_json(graphs, filename):
'atom_type_dist': atom_count_list,
'bond_type_dist': bond_count_list,
'valencies': valencies,
'active_atoms': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0],
'active_nodes': [atom_name_list[i] for i in range(118) if atom_count_list[i] > 0],
'num_atom_type': len([atom_name_list[i] for i in range(118) if atom_count_list[i] > 0]),
'transition_E': transition_E.tolist(),
}
@@ -477,14 +626,17 @@ def graphs_to_json(graphs, filename):
class Dataset(InMemoryDataset):
def __init__(self, source, root, target_prop=None, transform=None, pre_transform=None, pre_filter=None):
self.target_prop = target_prop
source = '/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
self.source = source
self.api = API(source) # Initialize NAS-Bench-201 API
print('API loaded')
# self.api = API(source) # Initialize NAS-Bench-201 API
# print('API loaded')
super().__init__(root, transform, pre_transform, pre_filter)
print('Dataset initialized')
print(self.processed_paths[0])
print(self.processed_paths[0]) #/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth.pt
self.data, self.slices = torch.load(self.processed_paths[0])
print('Dataset initialized')
self.data.edge_attr = self.data.edge_attr.squeeze()
self.data.idx = torch.arange(len(self.data.y))
print(f"self.data={self.data}, self.slices={self.slices}")
@property
def raw_file_names(self):
@@ -495,82 +647,146 @@ class Dataset(InMemoryDataset):
return [f'{self.source}.pt']
def process(self):
def parse_architecture_string(arch_str):
stages = arch_str.split('+')
nodes = ['input']
edges = []
source = '/home/stud/hanzhang/nasbenchDiT/graph_dit/NAS-Bench-201-v1_1-096897.pth'
self.api = API(source)
for stage in stages:
operations = stage.strip('|').split('|')
for op in operations:
operation, idx = op.split('~')
idx = int(idx)
edges.append((idx, len(nodes))) # Add edge from idx to the new node
nodes.append(operation)
nodes.append('output') # Add the output node
return nodes, edges
def create_graph(nodes, edges):
G = nx.DiGraph()
for i, node in enumerate(nodes):
G.add_node(i, label=node)
G.add_edges_from(edges)
return G
def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
nodes, edges = parse_architecture_string(arch_str)
node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary
assert 0 not in node_labels, f'Invalid node label: {node_labels}'
x = torch.LongTensor(node_labels)
print(f'in initialize Dataset, arch_to_Graph x={x}')
edges_list = [(start, end) for start, end in edges]
edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type
edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
edge_type = torch.tensor(edge_type, dtype=torch.long)
edge_attr = edge_type.view(-1, 1)
if target3 is not None:
y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
elif target2 is not None:
y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
else:
y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
return data, nodes
bonds = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
# Prepare to process NAS-Bench-201 data
data_list = []
len_data = len(self.api) # Number of architectures
with tqdm(total=len_data) as pbar:
for arch_index in range(len_data):
arch_info = self.api.query_meta_info_by_index(arch_index)
arch_str = arch_info.arch_str
sa = np.random.rand() # Placeholder for synthetic accessibility
sc = np.random.rand() # Placeholder for substructure count
target = np.random.rand() # Placeholder for target value
target2 = np.random.rand() # Placeholder for second target value
target3 = np.random.rand() # Placeholder for third target value
len_data = len(self.api)
data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3)
def graph_to_graph_data(graph):
ops = graph[1]
adj = graph[0]
nodes = []
for op in ops:
nodes.append(op_type[op])
x = torch.LongTensor(nodes)
edges_list = []
edge_type = []
for start in range(len(ops)):
for end in range(len(ops)):
if adj[start][end] == 1:
edges_list.append((start, end))
edge_type.append(1)
edges_list.append((end, start))
edge_type.append(1)
edge_index = torch.tensor(edges_list, dtype=torch.long).t()
edge_type = torch.tensor(edge_type, dtype=torch.long)
edge_attr = edge_type
y = torch.tensor([0], dtype=torch.float).view(1, -1)
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, idx=i)
return data
graph_list = []
with tqdm(total = len_data) as pbar:
active_nodes = set()
for i in range(len_data):
arch_info = self.api.query_meta_info_by_index(i)
nodes, edges = parse_architecture_string(arch_info.arch_str)
adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
for op in ops:
if op not in active_nodes:
active_nodes.add(op)
graph_list.append({
"adj_matrix": adj_matrix,
"ops": ops,
"idx": i
})
data = graph_to_graph_data((adj_matrix, ops))
data_list.append(data)
pbar.update(1)
for graph in graph_list:
adj_matrix = graph['adj_matrix']
if isinstance(adj_matrix, np.ndarray):
adj_matrix = adj_matrix.tolist()
graph['adj_matrix'] = adj_matrix
ops = graph['ops']
if isinstance(ops, np.ndarray):
ops = ops.tolist()
graph['ops'] = ops
with open(f'nasbench-201-graph.json', 'w') as f:
json.dump(graph_list, f)
torch.save(self.collate(data_list), self.processed_paths[0])
# def parse_architecture_string(arch_str):
# stages = arch_str.split('+')
# nodes = ['input']
# edges = []
# for stage in stages:
# operations = stage.strip('|').split('|')
# for op in operations:
# operation, idx = op.split('~')
# idx = int(idx)
# edges.append((idx, len(nodes))) # Add edge from idx to the new node
# nodes.append(operation)
# nodes.append('output') # Add the output node
# return nodes, edges
# def create_graph(nodes, edges):
# G = nx.DiGraph()
# for i, node in enumerate(nodes):
# G.add_node(i, label=node)
# G.add_edges_from(edges)
# return G
# def arch_to_graph(arch_str, sa, sc, target, target2=None, target3=None):
# nodes, edges = parse_architecture_string(arch_str)
# node_labels = [bonds[node] for node in nodes] # Replace with appropriate encoding if necessary
# assert 0 not in node_labels, f'Invalid node label: {node_labels}'
# x = torch.LongTensor(node_labels)
# print(f'in initialize Dataset, arch_to_Graph x={x}')
# edges_list = [(start, end) for start, end in edges]
# edge_type = [bonds[nodes[end]] for start, end in edges] # Example: using end node type as edge type
# edge_index = torch.tensor(edges_list, dtype=torch.long).t().contiguous()
# edge_type = torch.tensor(edge_type, dtype=torch.long)
# edge_attr = edge_type.view(-1, 1)
# if target3 is not None:
# y = torch.tensor([sa, sc, target, target2, target3], dtype=torch.float).view(1, -1)
# elif target2 is not None:
# y = torch.tensor([sa, sc, target, target2], dtype=torch.float).view(1, -1)
# else:
# y = torch.tensor([sa, sc, target], dtype=torch.float).view(1, -1)
# print(f'in initialize Dataset, Data_init, x={x}, y={y}, edge_index={edge_index}, edge_attr={edge_attr}')
# data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
# return data, nodes
# bonds = {
# 'nor_conv_1x1': 1,
# 'nor_conv_3x3': 2,
# 'avg_pool_3x3': 3,
# 'skip_connect': 4,
# 'output': 5,
# 'none': 6,
# 'input': 7
# }
# # Prepare to process NAS-Bench-201 data
# data_list = []
# len_data = len(self.api) # Number of architectures
# with tqdm(total=len_data) as pbar:
# for arch_index in range(len_data):
# arch_info = self.api.query_meta_info_by_index(arch_index)
# arch_str = arch_info.arch_str
# sa = np.random.rand() # Placeholder for synthetic accessibility
# sc = np.random.rand() # Placeholder for substructure count
# target = np.random.rand() # Placeholder for target value
# target2 = np.random.rand() # Placeholder for second target value
# target3 = np.random.rand() # Placeholder for third target value
# data, active_nodes = arch_to_graph(arch_str, sa, sc, target, target2, target3)
# data_list.append(data)
# pbar.update(1)
# torch.save(self.collate(data_list), self.processed_paths[0])
class Dataset_origin(InMemoryDataset):
def __init__(self, source, root, target_prop=None,
transform=None, pre_transform=None, pre_filter=None):
@@ -676,7 +892,7 @@ def create_adj_matrix_and_ops(nodes, edges):
adj_matrix[src][dst] = 1
return adj_matrix, nodes
class DataInfos(AbstractDatasetInfos):
def __init__(self, datamodule, cfg):
def __init__(self, datamodule, cfg, dataset):
tasktype_dict = {
'hiv_b': 'classification',
'bace_b': 'classification',
@@ -689,6 +905,7 @@ class DataInfos(AbstractDatasetInfos):
self.task = task_name
self.task_type = tasktype_dict.get(task_name, "regression")
self.ensure_connected = cfg.model.ensure_connected
# self.api = dataset.api
datadir = cfg.dataset.datadir
@@ -699,36 +916,54 @@ class DataInfos(AbstractDatasetInfos):
length = 15625
ops_type = {}
len_ops = set()
api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth')
for i in range(length):
arch_info = api.query_meta_info_by_index(i)
nodes, edges = parse_architecture_string(arch_info.arch_str)
adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
if i < 5:
print("Adjacency Matrix:")
print(adj_matrix)
print("Operations List:")
print(ops)
for op in ops:
if op not in ops_type:
ops_type[op] = len(ops_type)
len_ops.add(len(ops))
graphs.append((adj_matrix, ops))
# api = API('/home/stud/hanzhang/Graph-DiT/graph_dit/NAS-Bench-201-v1_1-096897.pth')
meta_dict = graphs_to_json(graphs, 'nasbench-201')
def read_adj_ops_from_json(filename):
    """Load ``(adjacency matrix, operations)`` pairs from a JSON graph dump.

    The file must hold a JSON list with one object per graph. Each object
    carries the adjacency matrix under ``'adj_matrix'`` and the operation
    list under ``'ops'``; the older key names ``'adjacency_matrix'`` and
    ``'operations'`` are still accepted.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        list[tuple[np.ndarray, list]]: One ``(adj_matrix, ops)`` pair per
        graph, in file order.

    Raises:
        TypeError: If the top-level JSON value is not a list (for example
            when a ``*.meta.json`` statistics file is passed by mistake).
        KeyError: If an entry has neither the current nor the legacy keys.
    """
    with open(filename, 'r') as json_file:
        data = json.load(json_file)
    if not isinstance(data, list):
        raise TypeError(
            f'{filename} must contain a JSON list of graphs, '
            f'got {type(data).__name__}')
    adj_ops_pairs = []
    for item in data:
        adj = item['adj_matrix'] if 'adj_matrix' in item else item['adjacency_matrix']
        ops = item['ops'] if 'ops' in item else item['operations']
        adj_ops_pairs.append((np.array(adj), ops))
    return adj_ops_pairs
# for i in range(length):
# arch_info = self.api.query_meta_info_by_index(i)
# nodes, edges = parse_architecture_string(arch_info.arch_str)
# adj_matrix, ops = create_adj_matrix_and_ops(nodes, edges)
# if i < 5:
# print("Adjacency Matrix:")
# print(adj_matrix)
# print("Operations List:")
# print(ops)
# for op in ops:
# if op not in ops_type:
# ops_type[op] = len(ops_type)
# len_ops.add(len(ops))
# graphs.append((adj_matrix, ops))
graphs = read_adj_ops_from_json(f'nasbench-201.meta.json')
# check first five graphs
for i in range(5):
print(f'graph {i} : {graphs[i]}')
print(f'ops_type: {ops_type}')
meta_dict = new_graphs_to_json(graphs, 'nasbench-201')
self.base_path = base_path
self.active_atoms = meta_dict['active_atoms']
self.max_n_nodes = meta_dict['max_node']
self.original_max_n_nodes = meta_dict['max_node']
self.n_nodes = torch.Tensor(meta_dict['n_atoms_per_mol_dist'])
self.edge_types = torch.Tensor(meta_dict['bond_type_dist'])
self.active_nodes = meta_dict['active_nodes']
self.max_n_nodes = meta_dict['max_n_nodes']
self.original_max_n_nodes = meta_dict['max_n_nodes']
self.n_nodes = torch.Tensor(meta_dict['n_nodes_per_graph'])
self.edge_types = torch.Tensor(meta_dict['edge_type_list'])
self.transition_E = torch.Tensor(meta_dict['transition_E'])
self.atom_decoder = meta_dict['active_atoms']
node_types = torch.Tensor(meta_dict['atom_type_dist'])
self.node_decoder = meta_dict['active_nodes']
node_types = torch.Tensor(meta_dict['node_type_list'])
active_index = (node_types > 0).nonzero().squeeze()
self.node_types = torch.Tensor(meta_dict['atom_type_dist'])[active_index]
self.node_types = torch.Tensor(meta_dict['node_type_list'])[active_index]
self.nodes_dist = DistributionNodes(self.n_nodes)
self.active_index = active_index
@@ -930,4 +1165,4 @@ def compute_meta(root, source_name, train_index, test_index):
if __name__ == "__main__":
pass
dataset = Dataset(source='nasbench', root='/home/stud/hanzhang/nasbenchDiT/graph-dit', target_prop='Class', transform=None)

View File

@@ -78,16 +78,20 @@ def main(cfg: DictConfig):
datamodule = dataset.DataModule(cfg)
datamodule.prepare_data()
dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg)
dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
# train_smiles, reference_smiles = datamodule.get_train_smiles()
train_graphs, reference_graphs = datamodule.get_train_graphs()
# get input output dimensions
dataset_infos.compute_input_output_dims(datamodule=datamodule)
# train_metrics = TrainMolecularMetricsDiscrete(dataset_infos)
train_metrics = TrainMolecularMetricsDiscrete(dataset_infos)
# sampling_metrics = SamplingMolecularMetrics(
# dataset_infos, train_smiles, reference_smiles
# )
sampling_metrics = SamplingGraphMetrics(
dataset_infos, train_graphs, reference_graphs
)
visualization_tools = MolecularVisualization(dataset_infos)
model_kwargs = {
@@ -135,5 +139,16 @@ def main(cfg: DictConfig):
else:
trainer.test(model, datamodule=datamodule, ckpt_path=cfg.general.test_only)
@hydra.main(
    version_base="1.1", config_path="../configs", config_name="config"
)
def test(cfg: DictConfig):
    """Run the data pipeline end to end without training a model.

    Builds the data module, prepares the data, constructs the dataset
    infos, materialises the train/test graph lists and computes the
    input/output dimensions. Nothing is returned; the function only checks
    that each step runs.

    Args:
        cfg: Hydra configuration, forwarded to ``dataset.DataModule`` and
            ``dataset.DataInfos``.
    """
    datamodule = dataset.DataModule(cfg)
    datamodule.prepare_data()
    dataset_infos = dataset.DataInfos(datamodule=datamodule, cfg=cfg, dataset=datamodule.dataset)
    # Called only to exercise graph extraction; the lists are not used here.
    datamodule.get_train_graphs()
    dataset_infos.compute_input_output_dims(datamodule=datamodule)
if __name__ == "__main__":
main()
test()

View File

@@ -35,7 +35,13 @@ class CEPerClass(Metric):
def compute(self):
return self.total_ce / self.total_samples
class NodeCE(CEPerClass):
def __init__(self, i):
super().__init__(i)
class EdgeCE(CEPerClass):
def __init__(self, i):
super().__init__(i)
class AtomCE(CEPerClass):
def __init__(self, i):
@@ -65,6 +71,12 @@ class AromaticCE(CEPerClass):
def __init__(self, i):
super().__init__(i)
class NodeMetricsCE(MetricCollection):
def __init__(self, active_nodes):
metrics_list = []
for i, node_type in enumerate(active_nodes) :
metrics_list.append(type(f'{node_type}_CE', (NodeCE,), {})(i))
class AtomMetricsCE(MetricCollection):
def __init__(self, active_atoms):
@@ -85,6 +97,11 @@ class BondMetricsCE(MetricCollection):
super().__init__([ce_no_bond, ce_SI, ce_DO, ce_TR])
#
class TrainGraphMetricsDiscrete(nn.Module):
def __init__(self, dataset_infos):
super().__init__()
class TrainMolecularMetricsDiscrete(nn.Module):
def __init__(self, dataset_infos):
super().__init__()

View File

@@ -15,6 +15,17 @@ from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import rdMolDescriptors
rdBase.DisableLog('rdApp.error')
import json
op_type = {
'nor_conv_1x1': 1,
'nor_conv_3x3': 2,
'avg_pool_3x3': 3,
'skip_connect': 4,
'output': 5,
'none': 6,
'input': 7
}
task_to_colname = {
'hiv_b': 'HIV_active',
@@ -32,8 +43,10 @@ tasktype_name = {
'O2': 'regression',
'N2': 'regression',
'CO2': 'regression',
'nasbench201': 'regression',
}
class TaskModel():
"""Scores based on an ECFP classifier."""
def __init__(self, model_path, task_name):
@@ -55,8 +68,47 @@ class TaskModel():
perfermance = self.train()
dump(self.model, model_path)
print('Oracle peformance: ', perfermance)
def train(self, filename='/home/stud/hanzhang/nasbenchDiT/graph_dit/nasbench-201-graph.json'):
    """Fit ``self.model`` on graph features and return its score.

    Each record's feature vector is the flattened adjacency matrix followed
    by the integer codes of its operations (looked up in ``op_type``). The
    label is ``record['train'][0]['accuracy']``; records whose label is
    missing (JSON ``null``) or NaN are dropped before fitting.

    NOTE(review): the score is computed on the same samples the model was
    fitted on, so it is a training score, not a held-out one.

    Args:
        filename: Path of the JSON graph dump to train on. Defaults to the
            path previously hard-coded in this method.

    Returns:
        The value of ``self.metric_func(labels, predictions)``.
    """
    with open(filename, 'r') as json_file:
        records = json.load(json_file)

    features = []
    accs = []
    for item in records:
        adj = np.array(item['adj_matrix'])
        op_code = [op_type[op] for op in item['ops']]
        features.append(np.concatenate([adj.flatten(), op_code]))
        accs.append(item['train'][0]['accuracy'])

    features = np.array(features)
    # dtype=float turns a JSON null into NaN so the mask below can drop it.
    labels = np.array(accs, dtype=float)

    mask = ~np.isnan(labels)
    labels = labels[mask]
    features = features[mask]

    self.model.fit(features, labels)
    y_pred = self.model.predict(features)
    perf = self.metric_func(labels, y_pred)
    print(f'{self.task_name} performance: {perf}')
    return perf
def train__(self):
data_path = os.path.dirname(self.model_path)
data_path = os.path.join(os.path.dirname(self.model_path), '..', f'raw/{self.task_name}.csv.gz')
df = pd.read_csv(data_path)

0
graph_dit/workingdoc.md Normal file
View File