Source code for cogdl.models.emb.netmf

import networkx as nx
import numpy as np
import scipy.sparse as sp

from .. import BaseModel


[docs]class NetMF(BaseModel): r"""The NetMF model from the `"Network Embedding as Matrix Factorization: Unifying DeepWalk, LINE, PTE, and node2vec" <http://arxiv.org/abs/1710.02971>`_ paper. Args: hidden_size (int) : The dimension of node representation. window_size (int) : The actual context size which is considered in language model. rank (int) : The rank in approximate normalized laplacian. negative (int) : The number of nagative samples in negative sampling. is-large (bool) : When window size is large, use approximated deepwalk matrix to decompose. """
[docs] @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument("--window-size", type=int, default=5) parser.add_argument("--rank", type=int, default=256) parser.add_argument("--negative", type=int, default=1) parser.add_argument("--is-large", action="store_true") parser.add_argument("--hidden-size", type=int, default=128)
# fmt: on
[docs] @classmethod def build_model_from_args(cls, args): return cls(args.hidden_size, args.window_size, args.rank, args.negative, args.is_large)
def __init__(self, dimension, window_size, rank, negative, is_large=False): super(NetMF, self).__init__() self.dimension = dimension self.window_size = window_size self.rank = rank self.negative = negative self.is_large = is_large
[docs] def forward(self, graph, return_dict=False): nx_g = graph.to_networkx() A = sp.csr_matrix(nx.adjacency_matrix(nx_g)) if not self.is_large: print("Running NetMF for a small window size...") deepwalk_matrix = self._compute_deepwalk_matrix(A, window=self.window_size, b=self.negative) else: print("Running NetMF for a large window size...") vol = float(A.sum()) evals, D_rt_invU = self._approximate_normalized_laplacian(A, rank=self.rank, which="LA") deepwalk_matrix = self._approximate_deepwalk_matrix( evals, D_rt_invU, window=self.window_size, vol=vol, b=self.negative ) # factorize deepwalk matrix with SVD u, s, _ = sp.linalg.svds(deepwalk_matrix, self.dimension) embeddings = sp.diags(np.sqrt(s)).dot(u.T).T if return_dict: features_matrix = dict() for vid, node in enumerate(nx_g.nodes()): features_matrix[node] = embeddings[vid] else: features_matrix = np.zeros((graph.num_nodes, embeddings.shape[1])) nx_nodes = nx_g.nodes() features_matrix[nx_nodes] = embeddings[np.arange(graph.num_nodes)] return features_matrix
def _compute_deepwalk_matrix(self, A, window, b): # directly compute deepwalk matrix n = A.shape[0] vol = float(A.sum()) L, d_rt = sp.csgraph.laplacian(A, normed=True, return_diag=True) # X = D^{-1/2} A D^{-1/2} X = sp.identity(n) - L S = np.zeros_like(X) X_power = sp.identity(n) for i in range(window): print("Compute matrix %d-th power", i + 1) X_power = X_power.dot(X) S += X_power S *= vol / window / b D_rt_inv = sp.diags(d_rt ** -1) M = D_rt_inv.dot(D_rt_inv.dot(S).T).todense() M[M <= 1] = 1 Y = np.log(M) return sp.csr_matrix(Y) def _approximate_normalized_laplacian(self, A, rank, which="LA"): # perform eigen-decomposition of D^{-1/2} A D^{-1/2} and keep top rank eigenpairs n = A.shape[0] L, d_rt = sp.csgraph.laplacian(A, normed=True, return_diag=True) # X = D^{-1/2} W D^{-1/2} X = sp.identity(n) - L print("Eigen decomposition...") evals, evecs = sp.linalg.eigsh(X, rank, which=which) print("Maximum eigenvalue %f, minimum eigenvalue %f", np.max(evals), np.min(evals)) print("Computing D^{-1/2}U..") D_rt_inv = sp.diags(d_rt ** -1) D_rt_invU = D_rt_inv.dot(evecs) return evals, D_rt_invU def _deepwalk_filter(self, evals, window): for i in range(len(evals)): x = evals[i] evals[i] = 1.0 if x >= 1 else x * (1 - x ** window) / (1 - x) / window evals = np.maximum(evals, 0) print( "After filtering, max eigenvalue=%f, min eigenvalue=%f", np.max(evals), np.min(evals), ) return evals def _approximate_deepwalk_matrix(self, evals, D_rt_invU, window, vol, b): # approximate deepwalk matrix evals = self._deepwalk_filter(evals, window=window) X = sp.diags(np.sqrt(evals)).dot(D_rt_invU.T).T M = X.dot(X.T) * vol / b M[M <= 1] = 1 Y = np.log(M) print("Computed DeepWalk matrix with %d non-zero elements", np.count_nonzero(Y)) return sp.csr_matrix(Y)