-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstandardize_data.py
74 lines (60 loc) · 2.36 KB
/
standardize_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the Apache License Version 2.0.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache License Version 2.0 for more details.
# ============================================================================
import os.path as osp
import sys
import math
import numpy as np
import networkx as nx
from absl import flags
FLAGS = flags.FLAGS
# --data selects which dataset's standardize() implementation is imported
# by the dispatch block below; it has no default and must be supplied.
flags.DEFINE_string("data", None, "")
flags.mark_flag_as_required("data")
# Parse argv eagerly (instead of via app.run) so FLAGS.data is readable
# at module scope by the import dispatch that follows.
FLAGS(sys.argv)
# Dispatch on --data to the matching datasets/<name>/standardize module.
# Each module exposes a standardize() function that returns a networkx
# graph for that dataset (presumably with integer node ids 0..N-1, as the
# adjacency construction below assumes — confirm in the dataset modules).
if FLAGS.data == "BlogCatalog":
    from datasets.BlogCatalog.standardize import standardize
elif FLAGS.data == "PPI":
    from datasets.PPI.standardize import standardize
elif FLAGS.data == "Mashup":
    from datasets.Mashup.standardize import standardize
elif FLAGS.data == "Flickr":
    from datasets.Flickr.standardize import standardize
else:
    # Fail loudly with the offending value instead of a bare ValueError.
    raise ValueError(
        "Unsupported --data value: {!r}; expected one of "
        "'BlogCatalog', 'PPI', 'Mashup', 'Flickr'.".format(FLAGS.data))
# Dataset directory next to this script, e.g. <repo>/datasets/PPI.
dataset_path = osp.join(
    osp.dirname(osp.abspath(__file__)), "datasets", FLAGS.data)
G = standardize()
# CSR adjacency matrix with rows/columns ordered by node id 0..num_nodes-1;
# each node's (weighted) degree is then the sum of its adjacency row.
num_nodes = G.number_of_nodes()
node_order = list(range(num_nodes))
adj = nx.convert_matrix.to_scipy_sparse_matrix(
    G, nodelist=node_order, format="csr")
degrees = np.asarray(adj.sum(axis=1), dtype=np.int64).ravel()
sampling_table_file = osp.join(dataset_path, "sampling_table.npy")
if osp.isfile(sampling_table_file):
    print("Sampling table already exists.")
else:
    # Build a word2vec-style negative-sampling table: node j occupies a
    # run of slots proportional to degree(j) ** power, so drawing a
    # uniform random slot samples nodes from the smoothed degree
    # distribution.
    table_size = int(1e8)
    power = 0.75
    print("Pre-processing for non-uniform negative sampling!")
    # Smoothed weights and their normalized cumulative distribution,
    # vectorized with numpy instead of the original pure-Python loop over
    # all table_size (1e8) slots.
    weights = np.power(degrees.astype(np.float64), power)
    cdf = np.cumsum(weights) / weights.sum()
    # Slot i belongs to node j iff cdf[j-1] <= i / table_size < cdf[j],
    # i.e. node j owns ceil(cdf[j]*T) - ceil(cdf[j-1]*T) consecutive
    # slots.  Clamp against float round-off pushing a boundary past T.
    bounds = np.minimum(
        np.ceil(cdf * table_size).astype(np.int64), table_size)
    # Round-off can also leave cdf[-1] slightly below 1.0; the original
    # loop then left the trailing slots stuck at their zero init (silently
    # over-sampling node 0).  Pin the last boundary so every slot is
    # assigned.
    bounds[-1] = table_size
    counts = bounds - np.concatenate(([0], bounds[:-1]))
    sampling_table = np.repeat(
        np.arange(num_nodes, dtype=np.uint32), counts)
    np.save(sampling_table_file, sampling_table)