import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd
from multiprocessing import Pool
import time
from .. import BaseModel, register_model, alias_draw, alias_setup
@register_model("netsmf")
class NetSMF(BaseModel):
    r"""The NetSMF model from the `"NetSMF: Large-Scale Network Embedding as Sparse Matrix Factorization"
    <http://arxiv.org/abs/1906.11156>`_ paper.

    Args:
        hidden_size (int) : The dimension of node representation.
        window_size (int) : The actual context size which is considered in language model.
        negative (int) : The number of negative samples in negative sampling.
        num_round (int) : The number of round in NetSMF.
        worker (int) : The number of workers for NetSMF.
    """

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--window-size', type=int, default=10,
                            help='Window size of approximate matrix. Default is 10.')
        parser.add_argument('--negative', type=int, default=1,
                            help='Number of negative node in sampling. Default is 1.')
        parser.add_argument('--num-round', type=int, default=100,
                            help='Number of round in NetSMF. Default is 100.')
        parser.add_argument('--worker', type=int, default=10,
                            help='Number of parallel workers. Default is 10.')
        # fmt: on

    @classmethod
    def build_model_from_args(cls, args):
        """Build a NetSMF instance from parsed command-line arguments."""
        return cls(
            args.hidden_size,
            args.window_size,
            args.negative,
            args.num_round,
            args.worker,
        )

    def __init__(self, dimension, window_size, negative, num_round, worker):
        super(NetSMF, self).__init__()
        self.dimension = dimension      # embedding dimension (hidden_size)
        self.window_size = window_size  # context window size T
        self.negative = negative        # negative-sampling parameter b
        self.worker = worker            # number of parallel sampling processes
        self.num_round = num_round      # total number of path-sampling rounds

    def train(self, G):
        """Learn node embeddings for the networkx graph ``G``.

        Pipeline: (1) precompute alias tables and edge weights per node,
        (2) approximate the random-walk matrix by parallel path sampling,
        (3) build the sparsified log-PMI matrix M, (4) factorize M with
        randomized truncated SVD.

        Returns:
            numpy.ndarray: an (num_node, dimension) embedding matrix with
            L2-normalized rows.
        """
        self.G = G
        # Map (possibly arbitrary) node labels to contiguous integer ids.
        node2id = dict([(node, vid) for vid, node in enumerate(G.nodes())])
        self.is_directed = nx.is_directed(self.G)
        self.num_node = self.G.number_of_nodes()
        self.num_edge = G.number_of_edges()
        self.edges = [[node2id[e[0]], node2id[e[1]]] for e in self.G.edges()]

        id2node = dict(zip(node2id.values(), node2id.keys()))

        self.num_neigh = np.asarray(
            [len(list(self.G.neighbors(id2node[i]))) for i in range(self.num_node)]
        )
        self.neighbors = [
            [node2id[v] for v in self.G.neighbors(id2node[i])]
            for i in range(self.num_node)
        ]

        # Build per-node alias tables for O(1) weighted neighbor sampling,
        # plus a neighbor-id -> edge-weight lookup used by _path_sampling.
        # NOTE(review): an isolated node (no neighbors) would make
        # norm_const zero here — assumes the input graph has none.
        s = time.time()
        self.alias_nodes = {}
        self.node_weight = {}
        for i in range(self.num_node):
            unnormalized_probs = [
                G[id2node[i]][nbr].get("weight", 1.0) for nbr in G.neighbors(id2node[i])
            ]
            norm_const = sum(unnormalized_probs)
            normalized_probs = [
                float(u_prob) / norm_const for u_prob in unnormalized_probs
            ]
            self.alias_nodes[i] = alias_setup(normalized_probs)
            self.node_weight[i] = dict(
                zip(
                    [node2id[nbr] for nbr in G.neighbors(id2node[i])],
                    unnormalized_probs,
                )
            )
        t = time.time()
        print("alias_nodes", t - s)

        # Run the NetSMF sampling with multiprocessing, then apply randomized SVD.
        print(
            "number of sample edges ", self.num_round * self.num_edge * self.window_size
        )
        print("random walk start...")
        t0 = time.time()
        results = []
        pool = Pool(processes=self.worker)
        for i in range(self.worker):
            # Each worker accumulates a partial sparse co-occurrence matrix.
            results.append(pool.apply_async(func=self._random_walk_matrix, args=(i,)))
        pool.close()
        pool.join()
        print("random walk time", time.time() - t0)

        matrix = sp.lil_matrix((self.num_node, self.num_node))
        A = sp.csr_matrix(nx.adjacency_matrix(self.G))
        degree = sp.diags(np.array(A.sum(axis=0))[0], format="csr")
        degree_inv = degree.power(-1)

        t1 = time.time()
        # Sum the workers' partial matrices into the full sampled matrix.
        for res in results:
            matrix += res.get()
        print("number of nzz", matrix.nnz)
        t2 = time.time()
        print("construct random walk matrix time", time.time() - t1)

        # M = D^{-1} (D - L) D^{-1} * vol(G) / b, then truncated element-wise log
        # (the log of the sparsified PMI matrix; values <= 1 are clipped to 1 so
        # their log is 0 and the matrix stays sparse and non-negative).
        L = sp.csgraph.laplacian(matrix, normed=False, return_diag=False)
        M = degree_inv.dot(degree - L).dot(degree_inv)
        M = M * A.sum() / self.negative
        M.data[M.data <= 1] = 1
        M.data = np.log(M.data)
        print("construct matrix sparsifier time", time.time() - t2)

        embedding = self._get_embedding_rand(M)
        return embedding

    def _get_embedding_rand(self, matrix):
        """Sparse randomized truncated SVD for fast embedding.

        Returns U * sqrt(Sigma), L2-normalized per row.
        """
        t1 = time.time()
        n = matrix.shape[0]
        smat = sp.csc_matrix(matrix)
        print("svd sparse", smat.data.shape[0] * 1.0 / n ** 2)
        U, Sigma, VT = randomized_svd(
            smat, n_components=self.dimension, n_iter=5, random_state=None
        )
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print("sparsesvd time", time.time() - t1)
        return U

    def _path_sampling(self, u, v, r):
        """Sample an r-length path through edge (u, v); return its end nodes.

        The edge position k within the path is chosen uniformly, then the
        path is extended k-1 steps from u and r-k steps from v via alias
        sampling. ``zp`` accumulates 2/weight along the sampled edges (the
        reciprocal path weight used to scale the matrix contribution).
        """
        k = np.random.randint(r) + 1
        zp, rand_u, rand_v = 1e-20, k - 1, r - k
        for _ in range(rand_u):
            new_u = self.neighbors[u][
                alias_draw(self.alias_nodes[u][0], self.alias_nodes[u][1])
            ]
            zp += 2.0 / self.node_weight[u][new_u]
            u = new_u
        for _ in range(rand_v):
            new_v = self.neighbors[v][
                alias_draw(self.alias_nodes[v][0], self.alias_nodes[v][1])
            ]
            zp += 2.0 / self.node_weight[v][new_v]
            v = new_v
        return u, v, zp

    def _random_walk_matrix(self, pid):
        """Worker ``pid``'s share of the sampled co-occurrence matrix.

        Runs num_round/worker rounds; in each round every edge spawns one
        sampled path per window length r in [1, window_size], contributing
        2r / (T * num_round * zp) to entry (u_, v_).
        """
        # Distinct seed per worker so the processes sample different paths.
        np.random.seed(pid)
        matrix = sp.lil_matrix((self.num_node, self.num_node))
        t0 = time.time()
        for rnd in range(int(self.num_round / self.worker)):
            if rnd % 10 == 0 and pid == 0:
                # Only worker 0 logs progress; scale by worker count to
                # approximate the global round number.
                print(
                    "round %d / %d, time: %lf"
                    % (rnd * self.worker, self.num_round, time.time() - t0)
                )
            for i in range(self.num_edge):
                u, v = self.edges[i]
                # For undirected graphs, randomize the edge orientation.
                if not self.is_directed and np.random.rand() > 0.5:
                    v, u = self.edges[i]
                for r in range(1, self.window_size + 1):
                    u_, v_, zp = self._path_sampling(u, v, r)
                    matrix[u_, v_] += 2 * r / self.window_size / self.num_round / zp
        return matrix