210 lines
7.2 KiB
Python
210 lines
7.2 KiB
Python
"""MMD Evaluation on graph structure statistics. Modified from https://github.com/uoguelph-mlrg/GGM-metrics"""
|
|
|
|
import numpy as np
|
|
import networkx as nx
|
|
import numpy as np
|
|
# from scipy.linalg import toeplitz
|
|
# import pyemd
|
|
import concurrent.futures
|
|
from scipy.linalg import eigvalsh
|
|
from functools import partial
|
|
|
|
|
|
class Descriptor():
    """Base class for MMD evaluation of one graph statistic with an RBF kernel.

    ``evaluate`` relies on two members that this class does not define and
    that subclasses are expected to provide:

    * ``extract_features(graph_list)`` -- returns one histogram-like numpy
      array per graph;
    * ``distance_scaling`` -- forwarded to the distance function.

    Subclasses may also set ``self.name`` and ``self.sigmas`` *before*
    calling ``super().__init__``; the kernel suffix is appended to the name
    and the preset sigmas are used when no ``sigma`` keyword is supplied.
    """

    def __init__(self, is_parallel=False, bins=100, kernel='rbf', sigma_type='single', **kwargs):
        """Configure the kernel and the bandwidth(s) used by ``evaluate``.

        Args:
            is_parallel: if True, pairwise distances are computed in a
                process pool.
            bins: number of histogram bins (read by subclasses).
            kernel: only ``'rbf'`` is supported.
            sigma_type: ``'single'`` uses the ``sigma`` keyword (a scalar or
                a sequence of bandwidths); ``'argmax'`` scans a fixed
                log-spaced grid and ``evaluate`` keeps the largest MMD.
            **kwargs: ``sigma`` (see above) and ``max_workers`` (pool size,
                ``None`` lets the executor decide).

        Raises:
            ValueError: on an unsupported ``kernel`` or ``sigma_type``, or
                when ``sigma_type='single'`` and no sigma is available.
        """
        self.is_parallel = is_parallel
        self.bins = bins
        self.max_workers = kwargs.get('max_workers')

        if kernel != 'rbf':
            # The original code evaluated a bare ``ValueError`` expression
            # here, which never raised and left ``self.distance`` undefined.
            raise ValueError('Unsupported kernel: {!r}'.format(kernel))
        self.distance = self.l2
        if not hasattr(self, 'name'):
            # Subclasses normally set the name first; fall back to the class
            # name so the base class can be instantiated on its own.
            self.name = type(self).__name__
        self.name += '_rbf'

        if sigma_type == 'argmax':
            # Only the first 30 of the 50 log-spaced values are used; the
            # original comment notes this is usually enough.
            log_sigmas = np.linspace(-5., 5., 50)[:30]
            self.sigmas = [np.exp(log_sigma) for log_sigma in log_sigmas]
        elif sigma_type == 'single':
            if 'sigma' in kwargs:
                sigma = kwargs['sigma']
            elif hasattr(self, 'sigmas'):
                # Default installed by a subclass before super().__init__.
                sigma = self.sigmas
            else:
                raise ValueError("sigma_type='single' requires a 'sigma' keyword argument")
            # Always store a flat list so ``evaluate`` can iterate over it,
            # whether the caller passed a scalar or a sequence.
            self.sigmas = [float(s) for s in np.ravel(sigma)]
        else:
            raise ValueError('Unsupported sigma_type: {!r}'.format(sigma_type))

    def evaluate(self, graph_ref_list, graph_pred_list):
        """Compute the distance between the distributions of two unordered sets of graphs.

        Args:
            graph_ref_list, graph_pred_list: two lists of networkx graphs to be evaluated.

        Returns:
            The largest MMD estimate over ``self.sigmas``, floored at 0.

        Note:
            Only predicted graphs with zero nodes are dropped; reference
            graphs are used as given. If no predicted graph remains, the
            means below are NaN and the floor value 0 is returned.
        """
        graph_pred_list = [G for G in graph_pred_list if not G.number_of_nodes() == 0]

        sample_pred = self.extract_features(graph_pred_list)
        sample_ref = self.extract_features(graph_ref_list)

        # Pairwise squared distances: pred-pred, pred-ref, ref-ref.
        GG = self.disc(sample_pred, sample_pred, distance_scaling=self.distance_scaling)
        GR = self.disc(sample_pred, sample_ref, distance_scaling=self.distance_scaling)
        RR = self.disc(sample_ref, sample_ref, distance_scaling=self.distance_scaling)

        max_mmd = 0
        for sigma in self.sigmas:
            gamma = 1 / (2 * sigma ** 2)

            K_GR = np.exp(-gamma * GR)
            K_GG = np.exp(-gamma * GG)
            K_RR = np.exp(-gamma * RR)

            mmd = K_GG.mean() + K_RR.mean() - (2 * K_GR.mean())
            if mmd > max_mmd:
                max_mmd = mmd

        return max_mmd

    def pad_histogram(self, x, y):
        """Return float copies of ``x`` and ``y``, zero-padded to equal length."""
        # ``np.float`` was removed in NumPy 1.24; the builtin ``float`` is
        # the same dtype (float64).
        x = np.array(x, dtype=float)
        y = np.array(y, dtype=float)
        support_size = max(len(x), len(y))
        if len(x) < support_size:
            x = np.hstack((x, np.zeros(support_size - len(x))))
        elif len(y) < support_size:
            y = np.hstack((y, np.zeros(support_size - len(y))))

        return x, y

    # NOTE: the upstream EMD-based distance (pyemd + toeplitz ground matrix)
    # is disabled in this file; only the squared-L2 distance below is used.

    def l2(self, x, y, **kwargs):
        """Squared Euclidean distance between two histograms.

        The histograms are zero-padded to a common length first. Extra
        keyword arguments (e.g. ``distance_scaling``) are accepted and
        ignored.
        """
        x, y = self.pad_histogram(x, y)
        dist = np.linalg.norm(x - y, 2)
        return dist ** 2

    def kernel_parallel_unpacked(self, x, samples2, kernel):
        """Return ``[kernel(x, s2) for s2 in samples2]`` (one row of distances)."""
        return [kernel(x, s2) for s2 in samples2]

    def kernel_parallel_worker(self, t):
        """Unpack an ``(x, samples2, kernel)`` tuple for ``executor.map``."""
        return self.kernel_parallel_unpacked(*t)

    def disc(self, samples1, samples2, **kwargs):
        """Pairwise distances between two samples.

        Args:
            samples1, samples2: sequences of histograms.
            **kwargs: forwarded to ``self.distance``.

        Returns:
            A numpy array of distances. The serial path yields a flat array
            of ``len(samples1) * len(samples2)`` values; the parallel path
            yields one row per element of ``samples1``. ``evaluate`` only
            takes the mean, so both layouts give the same result.
        """
        if not self.is_parallel:
            # kwargs are forwarded here too, matching the parallel path.
            tot_dist = [self.distance(s1, s2, **kwargs)
                        for s1 in samples1
                        for s2 in samples2]
        else:
            tot_dist = []
            tasks = [(s1, samples2, partial(self.distance, **kwargs)) for s1 in samples1]
            with concurrent.futures.ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                for dist in executor.map(self.kernel_parallel_worker, tasks):
                    tot_dist.append(dist)
        return np.array(tot_dist)
|
|
|
|
|
|
class degree(Descriptor):
    """Descriptor built on the normalised degree histogram of each graph."""

    def __init__(self, *args, **kwargs):
        # These attributes are assigned before the base initialiser runs
        # because it appends the kernel suffix to ``self.name``.
        self.name = 'degree'
        self.sigmas = [kwargs.get('sigma', 1.0)]
        self.distance_scaling = 1.0
        super().__init__(*args, **kwargs)

    def extract_features(self, dataset):
        """Return one degree histogram per graph, each divided by its sum."""
        if self.is_parallel:
            with concurrent.futures.ProcessPoolExecutor(max_workers=self.max_workers) as pool:
                histograms = list(pool.map(self.degree_worker, dataset))
        else:
            histograms = [self.degree_worker(graph) for graph in dataset]

        return [hist / np.sum(hist) for hist in histograms]

    def degree_worker(self, G):
        """Return the degree histogram of ``G`` as a numpy array."""
        counts = nx.degree_histogram(G)
        return np.array(counts)
|
|
|
|
|
|
class cluster(Descriptor):
    """Descriptor built on the normalised histogram of clustering coefficients."""

    def __init__(self, *args, **kwargs):
        self.name = 'cluster'
        # Default bandwidth is a flat one-element list, like the other
        # descriptors; the original wrapped the default twice ([[0.1]]).
        self.sigmas = [kwargs.get('sigma', 1.0 / 10)]
        super().__init__(*args, **kwargs)
        # Set after the base initialiser because it reads ``self.bins``,
        # which the base initialiser assigns.
        self.distance_scaling = self.bins

    def extract_features(self, dataset):
        """Return one clustering-coefficient histogram per graph, each divided by its sum."""
        if self.is_parallel:
            tasks = [(G, self.bins) for G in dataset]
            with concurrent.futures.ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                res = list(executor.map(self.clustering_worker, tasks))
        else:
            res = [self.clustering_worker((g, self.bins)) for g in dataset]

        return [s1 / np.sum(s1) for s1 in res]

    def clustering_worker(self, param):
        """Histogram the clustering coefficients of one graph.

        Args:
            param: a ``(G, bins)`` tuple, packed so the method can be used
                with ``executor.map``.

        Returns:
            Raw (un-normalised) bin counts over the range [0, 1].
        """
        G, bins = param
        clustering_coeffs_list = list(nx.clustering(G).values())
        hist, _ = np.histogram(
            clustering_coeffs_list, bins=bins, range=(0.0, 1.0), density=False)
        return hist
|
|
|
|
|
|
class spectral(Descriptor):
    """Descriptor built on the eigenvalue histogram of the normalised Laplacian."""

    def __init__(self, *args, **kwargs):
        # These attributes are assigned before the base initialiser runs
        # because it appends the kernel suffix to ``self.name``.
        self.name = 'spectral'
        self.sigmas = [kwargs.get('sigma', 1.0)]
        self.distance_scaling = 1
        super().__init__(*args, **kwargs)

    def extract_features(self, dataset):
        """Return one normalised spectral histogram per graph."""
        if not self.is_parallel:
            return [self.spectral_worker(graph) for graph in dataset]

        # Unlike the other descriptors, this one uses a thread pool.
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            densities = list(pool.map(self.spectral_worker, dataset))
        return densities

    def spectral_worker(self, G):
        """Histogram the normalised-Laplacian eigenvalues of ``G`` into 200 bins.

        The bin range is (-1e-5, 2) and the counts are divided by their sum.
        """
        laplacian = nx.normalized_laplacian_matrix(G).todense()
        eigenvalues = eigvalsh(laplacian)
        counts, _ = np.histogram(eigenvalues, bins=200, range=(-1e-5, 2), density=False)
        return counts / counts.sum()
|
|
|
|
|
|
def mmd_eval(graph_ref_list, graph_pred_list, methods):
    """Run several MMD descriptors on the same pair of graph lists.

    Args:
        graph_ref_list, graph_pred_list: the two lists of graphs handed to
            each descriptor's ``evaluate``.
        methods: iterable of ``(method, sigma, sigma_type)`` triples, where
            ``method`` is a string that evaluates to a descriptor class.

    Returns:
        A dict mapping each descriptor's ``name`` to its ``evaluate`` result.
    """
    # NOTE(review): eval() executes ``method`` as Python code, so only
    # trusted descriptor names should be passed in.
    evaluators = []
    for spec in methods:
        method, sigma, sigma_type = spec
        descriptor_cls = eval(method)
        evaluators.append(descriptor_cls(sigma=sigma, sigma_type=sigma_type))

    # All descriptors are constructed before any of them is evaluated.
    return {
        ev.name: ev.evaluate(graph_ref_list, graph_pred_list)
        for ev in evaluators
    }
|