Source code for cogdl.datasets.matlab_matrix

import os.path as osp
from itertools import product

import numpy as np
import scipy.io
import torch
import time

from cogdl.data import Graph, Dataset
from cogdl.utils import download_url


class MatlabMatrix(Dataset):
    r"""Single-graph dataset backed by a MATLAB ``.mat`` file.

    Networks come from http://leitang.net/code/social-dimension/data/ or
    http://snap.stanford.edu/node2vec/

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (:obj:`"Blogcatalog"`); the raw
            file is expected to be called ``<name>.mat``.
        url (string): Prefix that is concatenated directly with each raw file
            name to build the download address.
    """

    def __init__(self, root, name, url):
        # ``name`` and ``url`` are set before the base initialiser runs, as in
        # the original ordering (the file-name properties below read them).
        self.name = name
        self.url = url
        super().__init__(root)
        self.data = torch.load(self.processed_paths[0])

    def __len__(self):
        # The dataset always holds exactly one graph.
        return 1

    @property
    def raw_file_names(self):
        """Names of the raw files: a single ``<name>.mat`` entry."""
        stems = (self.name,)
        extensions = ("mat",)
        return [
            "{}.{}".format(stem, ext) for stem in stems for ext in extensions
        ]

    @property
    def processed_file_names(self):
        """Names of the processed files written by :meth:`process`."""
        return ["data.pt"]

    @property
    def num_classes(self):
        """Size of the second dimension of the label matrix ``data.y``."""
        label_shape = self.data.y.shape
        return label_shape[1]

    @property
    def num_nodes(self):
        """Size of the first dimension of the label matrix ``data.y``."""
        label_shape = self.data.y.shape
        return label_shape[0]

    def download(self):
        """Download every raw file from ``url + file_name`` into ``raw_dir``."""
        for file_name in self.raw_file_names:
            download_url(f"{self.url}{file_name}", self.raw_dir)

    def get(self, idx):
        """Return the stored graph; ``idx`` must be ``0``."""
        assert idx == 0
        return self.data

    def process(self):
        """Convert ``<name>.mat`` into a ``Graph`` and save it to disk.

        Reads the ``"network"`` and ``"group"`` entries of the ``.mat`` file,
        uses the non-zero positions of ``network`` as edges and the values at
        those positions as edge attributes, and stores the densified ``group``
        matrix as float labels.
        """
        mat_path = osp.join(self.raw_dir, "{}.mat".format(self.name))
        contents = scipy.io.loadmat(mat_path)
        # presumably both entries are scipy sparse matrices (``.todense()`` and
        # ``.nonzero()`` are called on them) — confirm against the data files.
        network = contents["network"]
        groups = contents["group"]

        labels = torch.from_numpy(groups.todense()).to(torch.float)

        rows, cols = network.nonzero()
        sources = torch.tensor(rows)
        targets = torch.tensor(cols)
        edge_index = torch.stack([sources, targets], dim=0)
        edge_attr = torch.tensor(network[rows, cols])

        graph = Graph(edge_index=edge_index, edge_attr=edge_attr, x=None, y=labels)
        torch.save(graph, self.processed_paths[0])
class BlogcatalogDataset(MatlabMatrix):
    """``MatlabMatrix`` preset for ``blogcatalog.mat``.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/blogcatalog``.
    """

    def __init__(self, data_path="data"):
        # Directory name and file stem are the same for this dataset.
        name = "blogcatalog"
        root = osp.join(data_path, name)
        base_url = "http://leitang.net/code/social-dimension/data/"
        super().__init__(root, name, base_url)
class FlickrDataset(MatlabMatrix):
    """``MatlabMatrix`` preset for ``flickr.mat``.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/flickr``.
    """

    def __init__(self, data_path="data"):
        # Directory name and file stem are the same for this dataset.
        name = "flickr"
        root = osp.join(data_path, name)
        base_url = "http://leitang.net/code/social-dimension/data/"
        super().__init__(root, name, base_url)
class WikipediaDataset(MatlabMatrix):
    """``MatlabMatrix`` preset for ``POS.mat``.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/wikipedia``.
    """

    def __init__(self, data_path="data"):
        # The directory is called "wikipedia" while the raw file stem is "POS".
        folder = "wikipedia"
        file_stem = "POS"
        root = osp.join(data_path, folder)
        base_url = "http://snap.stanford.edu/node2vec/"
        super().__init__(root, file_stem, base_url)
class PPIDataset(MatlabMatrix):
    """``MatlabMatrix`` preset for ``Homo_sapiens.mat``.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/ppi-ne``.
    """

    def __init__(self, data_path="data"):
        # The directory carries a "-ne" suffix; the raw file stem is
        # "Homo_sapiens".
        folder = "ppi" + "-ne"
        file_stem = "Homo_sapiens"
        root = osp.join(data_path, folder)
        base_url = "http://snap.stanford.edu/node2vec/"
        super().__init__(root, file_stem, base_url)
class NetworkEmbeddingCMTYDataset(Dataset):
    r"""Single-graph dataset built from an edge list and a community list.

    Two tab-separated text files are expected: ``<name>.ungraph`` with one
    edge (two integer node ids) per line, and ``<name>.cmty`` with one
    community (the integer ids of its member nodes) per line.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): Stem shared by the two raw files.
        url (string): Format string with one ``{}`` placeholder that is filled
            with each raw file name to build the download address.
    """

    def __init__(self, root, name, url):
        # ``url`` and ``name`` are set before the base initialiser runs, as in
        # the original ordering (the file-name properties below read them).
        self.url = url
        self.name = name
        super(NetworkEmbeddingCMTYDataset, self).__init__(root)
        self.data = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Raw file names: ``<name>.ungraph`` followed by ``<name>.cmty``."""
        return [f"{self.name}.{x}" for x in ["ungraph", "cmty"]]

    @property
    def num_classes(self):
        """Number of communities (second dimension of ``data.y``)."""
        return self.data.y.shape[1]

    @property
    def num_nodes(self):
        """Number of nodes (first dimension of ``data.y``)."""
        return self.data.y.shape[0]

    @property
    def processed_file_names(self):
        """Names of the processed files written by :meth:`process`."""
        return ["data.pt"]

    def get(self, idx):
        """Return the stored graph; ``idx`` must be ``0``."""
        assert idx == 0
        return self.data

    def download(self):
        """Download each raw file into ``raw_dir``, pausing between requests."""
        for name in self.raw_file_names:
            download_url(self.url.format(name), self.raw_dir, name=name)
            # Short pause after every request; presumably to be gentle on the
            # file host — confirm whether it is still required.
            time.sleep(0.5)

    @staticmethod
    def _read_int_rows(path):
        """Parse a text file of tab-separated integers into a list of rows."""
        with open(path, "r") as f:
            lines = f.read().strip().split("\n")
        return [[int(token) for token in line.split("\t")] for line in lines]

    def process(self):
        """Build the graph from the raw files and save it to disk.

        Every edge is added in both directions, self loops are dropped, and
        no de-duplication is performed. Labels form a ``(num_nodes,
        num_classes)`` multi-hot matrix of 64-bit floats in which column ``i``
        marks the members of community ``i``.
        """
        filenames = self.raw_paths

        # Rows are edges, so transpose to get a (2, num_edges) array.
        edges = np.array(self._read_int_rows(filenames[0]), dtype=np.int64).transpose()
        edge_index = torch.from_numpy(edges)

        # Append the reversed edges so the edge list is symmetric.
        rev_edge_index = torch.stack([edge_index[1], edge_index[0]])
        edge_index = torch.cat((edge_index, rev_edge_index), dim=1)

        # Keep only edges whose endpoints differ (removes self loops).
        self_loop_mask = edge_index[0] != edge_index[1]
        edge_index = edge_index[:, self_loop_mask]

        cmty = self._read_int_rows(filenames[1])
        num_classes = len(cmty)
        # Node count is inferred from the largest id that survives in the
        # edge list.
        num_nodes = torch.max(edge_index).item() + 1

        # ``np.float`` was only an alias of the builtin ``float`` and has been
        # removed from NumPy; ``np.float64`` yields the same dtype.
        labels = np.zeros((num_nodes, num_classes), dtype=np.float64)
        for i, cls in enumerate(cmty):
            labels[cls, i] = 1.0
        labels = torch.from_numpy(labels)

        data = Graph(x=None, y=labels, edge_index=edge_index)
        torch.save(data, self.processed_paths[0])

    def __repr__(self):
        return "{}()".format(self.name)

    def __len__(self):
        # NOTE(review): this returns the node count although ``get`` accepts
        # only index 0; kept unchanged for compatibility — confirm the intent.
        return self.data.y.shape[0]
class DblpNEDataset(NetworkEmbeddingCMTYDataset):
    """``NetworkEmbeddingCMTYDataset`` preset for the ``dblp`` raw files.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/dblp-ne``.
    """

    def __init__(self, data_path="data"):
        name = "dblp"
        root = osp.join(data_path, name + "-ne")
        # The "{}" placeholder is left unfilled here and passed on as-is.
        url_template = "https://cloud.tsinghua.edu.cn/d/5ba8b35db80343549c67/files/?p=%2F{}&dl=1"
        super().__init__(root, name, url_template)
class YoutubeNEDataset(NetworkEmbeddingCMTYDataset):
    """``NetworkEmbeddingCMTYDataset`` preset for the ``youtube`` raw files.

    Args:
        data_path (string): Parent directory; the dataset root becomes
            ``<data_path>/youtube-ne``.
    """

    def __init__(self, data_path="data"):
        name = "youtube"
        root = osp.join(data_path, name + "-ne")
        # The "{}" placeholder is left unfilled here and passed on as-is.
        url_template = "https://cloud.tsinghua.edu.cn/d/c1ae63c4f1f14afb8ab8/files/?p=%2F{}&dl=1"
        super().__init__(root, name, url_template)