From a959891599cb33c2984fb60a62497cb75ab46e1b Mon Sep 17 00:00:00 2001 From: NoeCecillon Date: Thu, 10 Aug 2023 19:19:19 +0200 Subject: [PATCH] SiNE embeddings --- SiNE/graph.py | 99 ++++++++++++++++++++++ SiNE/learn_SiNE_emb.py | 47 +++++++++++ SiNE/model.py | 134 ++++++++++++++++++++++++++++++ SiNE/stemgraph.py | 42 ++++++++++ SiNE/stemmodels.py | 184 +++++++++++++++++++++++++++++++++++++++++ SiNE/util.py | 170 +++++++++++++++++++++++++++++++++++++ 6 files changed, 676 insertions(+) create mode 100644 SiNE/graph.py create mode 100644 SiNE/learn_SiNE_emb.py create mode 100644 SiNE/model.py create mode 100644 SiNE/stemgraph.py create mode 100644 SiNE/stemmodels.py create mode 100644 SiNE/util.py diff --git a/SiNE/graph.py b/SiNE/graph.py new file mode 100644 index 0000000..77c8f4b --- /dev/null +++ b/SiNE/graph.py @@ -0,0 +1,99 @@ +import networkx as nx +import numpy as np +import csv + +# From https://github.com/CompNet/SignedCentrality + +class Vocabulary(object): + def __init__(self, graph): + self._id2node = {} + self._node2id = {} + self._curr_id = 1 + for node in graph.nodes(): + if node not in self._node2id: + self._curr_id += 1 + self._node2id[node] = self._curr_id + self._id2node[self._curr_id] = node + + def id2node(self, id): + return self._id2node[id] + + def node2id(self, node): + return self._node2id[node] + + def augment(self, graph): + for node in graph.nodes(): + if node not in self._node2id: + self._curr_id += 1 + self._node2id[node] = self._curr_id + self._id2node[self._curr_id] = node + + def __len__(self): + return self._curr_id + + +class Graph(object): + def __init__(self, positive_graph, negative_graph): + self.positive_graph = positive_graph + self.negative_graph = negative_graph + self.vocab = Vocabulary(positive_graph) + self.vocab.augment(negative_graph) + + def get_positive_edges(self): + return self.positive_graph.edges() + + def get_negative_edges(self): + return self.negative_graph.edges() + + def __len__(self): + return 
len(self.vocab) + #return max(len(self.positive_graph), len(self.negative_graph)) + + def get_triplets(self, p0=True, ids=True): + triplets = [] + for xi in self.positive_graph.nodes(): + for xj in self.positive_graph[xi]: + if xj in self.negative_graph: + for xk in self.negative_graph[xj]: + a, b, c = xi, xj, xk + if ids: + a = self.vocab.node2id(xi) + b = self.vocab.node2id(xj) + c = self.vocab.node2id(xk) + triplets.append([a, b, c]) + elif p0: + a, b = xi, xj + c = 0 + if ids: + a = self.vocab.node2id(xi) + b = self.vocab.node2id(xj) + triplets.append([a, b, c]) + triplets = np.array(triplets) + return triplets + + @staticmethod + def read_from_file(filepath, delimiter=',', directed=False): + positive_graph = nx.DiGraph() if directed else nx.Graph() + negative_graph = nx.DiGraph() if directed else nx.Graph() + file = open(filepath) + #skip header line + next(file) + for line in file: + line = line.strip() + #print(line) + u, v, w = line.split(delimiter) + w = float(w) + if w > 0: + positive_graph.add_edge(u, v, weight=w) + if w < 0: + negative_graph.add_edge(u, v, weight=w) + file.close() + graph = Graph(positive_graph, negative_graph) + return graph + + + + + + + diff --git a/SiNE/learn_SiNE_emb.py b/SiNE/learn_SiNE_emb.py new file mode 100644 index 0000000..4c42840 --- /dev/null +++ b/SiNE/learn_SiNE_emb.py @@ -0,0 +1,47 @@ +from SiNEmaster.graph import * +from SiNEmaster.stemmodels import SiNE, fit_sine_model as fit_model +import pickle + +#pickled list of labels +labels_path = "labels.pickle" +graphs_path = "data/CCS" + +embeddings = [] +labels = [] +with open(labels_path, "rb") as f: + lb = pickle.load(f) + + +for i in range(2545): + try: + graph = Graph.read_from_file("%s/%s.csv" %(graphs_path, i), delimiter=',', directed=True) + if len(graph.get_positive_edges()) + len(graph.get_negative_edges()) > 1: + + model = fit_model( + num_nodes=len(graph), + dims_arr=[32, 32], + triples=graph.get_triplets(), + triples0=None, + delta=1.0, + delta0=0.5, + 
batch_size=300, + batch_size0=300, + epochs=30, + lr=0.01, + lam=0.0001, + lr_decay=0.0, + p=2, + print_loss=False, + p0=False, + ) + + embedding = model.get_x() + embedding = embedding.detach().numpy().tolist()[0] + embeddings.append(embedding) + labels.append(lb[i]) + print (i) + except: + print ("error") + +with open("out/SiNE/sine_embeddings.pkl", "wb") as f: + pickle.dump(embeddings, f) \ No newline at end of file diff --git a/SiNE/model.py b/SiNE/model.py new file mode 100644 index 0000000..3de97f8 --- /dev/null +++ b/SiNE/model.py @@ -0,0 +1,134 @@ +import torch +import torch.nn as nn +from torch.nn.parameter import Parameter +from torch.autograd import Variable +import numpy as np +import torch.optim as optim + +# From https://github.com/CompNet/SignedCentrality + +def hadamard(x, y): + return x * y + + +def average(x, y): + return (x + y)/2.0 + + +def l1(x, y): + return np.abs(x - y) + + +def l2(x, y): + return np.power(x - y, 2) + + +def concat(x, y): + return np.concatenate((x, y), axis=1) + + +FEATURE_FUNCS = { + 'l1': l1, + 'l2': l2, + 'concat': concat, + 'average': average, + 'hadamard': hadamard +} + + +class SiNE(nn.Module): + def __init__(self, num_nodes, dim1, dim2): + super(SiNE, self).__init__() + self.tanh = nn.Tanh() + self.embeddings = nn.Embedding(num_nodes + 1, dim1) + self.layer11 = nn.Linear(dim1, dim2, bias=False) + self.layer12 = nn.Linear(dim1, dim2, bias=False) + self.bias1 = Parameter(torch.zeros(1)) + self.layer2 = nn.Linear(dim2, 1, bias=False) + self.bias2 = Parameter(torch.zeros(1)) + self.register_parameter('bias1', self.bias1) + self.register_parameter('bias2', self.bias2) + + def forward(self, xi, xj, xk, delta): + i_emb = self.embeddings(xi) + j_emb = self.embeddings(xj) + k_emb = self.embeddings(xk) + + z11 = self.tanh(self.layer11(i_emb) + self.layer12(j_emb) + self.bias1) + z12 = self.tanh(self.layer11(i_emb) + self.layer12(k_emb) + self.bias1) + + f_pos = self.tanh(self.layer2(z11) + self.bias2) + f_neg = 
self.tanh(self.layer2(z12) + self.bias2) + + zeros = Variable(torch.zeros(1)) + + loss = torch.max(zeros, f_pos + delta - f_neg) + loss = torch.sum(loss) + + return loss + + def _regularizer(self, x): + zeros = torch.zeros_like(x) + normed = torch.norm(x - zeros, p=2) + term = torch.pow(normed, 2) + # print('The parameter of ', x) + # print('Yields ',term) + return term + + def regularize_weights(self): + loss = 0 + for parameter in self.parameters(): + loss += self._regularizer(parameter) + return loss + + def get_embedding(self, x): + x = Variable(torch.LongTensor([x])) + emb = self.embeddings(x) + emb = emb.data.numpy()[0] + return emb + + def get_edge_feature(self, x, y, operation='hadamard'): + func = FEATURE_FUNCS[operation] + x = self.get_embedding(x) + y = self.get_embedding(y) + return func(x, y) + + + + +def tensorfy_col(x, col_idx): + col = x[:,col_idx] + col = torch.LongTensor(col) + col = Variable(col) + return col + + +def get_training_batch(triples, batch_size): + nrows = triples.shape[0] + rows = np.random.choice(nrows, batch_size, replace=False) + choosen = triples[rows,:] + xi = tensorfy_col(choosen, 0) + xj = tensorfy_col(choosen, 1) + xk = tensorfy_col(choosen, 2) + return xi, xj, xk + + +def fit_model(sine, triplets, delta, batch_size, epochs, alpha, + lr=0.4, weight_decay=0.0, print_loss=True): + optimizer = optim.Adagrad(sine.parameters(), lr=lr, weight_decay=weight_decay) + for epoch in range(epochs): + sine.zero_grad() + xi, xj, xk = get_training_batch(triplets, batch_size) + loss = sine(xi, xj, xk, delta) + # print(loss) + regularizer_loss = alpha * sine.regularize_weights() + # print(regularizer_loss) + loss += regularizer_loss + loss.backward() + optimizer.step() + if print_loss: + print('Loss at epoch ', epoch + 1, ' is ', loss.data[0]) + return sine + + + diff --git a/SiNE/stemgraph.py b/SiNE/stemgraph.py new file mode 100644 index 0000000..3fbf98c --- /dev/null +++ b/SiNE/stemgraph.py @@ -0,0 +1,42 @@ +import networkx as nx +import 
numpy as np + +# From https://github.com/CompNet/SignedCentrality + +def get_empty_graph(directed=True): + if directed: + return nx.DiGraph() + return nx.Graph() + +def from_edgelist_array_to_graph(X, y, directed=True): + positive_graph = get_empty_graph(directed) + negative_graph = get_empty_graph(directed) + + for edge, label in zip(X, y): + u, v = edge + if label == 0: + negative_graph.add_edge(u, v) + else: + positive_graph.add_edge(u, v) + return positive_graph, negative_graph + + +def get_triples(positive_graph, negative_graph, p0=True): + triples = [] + triples0 = [] + for u, v in positive_graph.edges(): + if v in negative_graph: + v_neigbors = negative_graph[v] + for w in v_neigbors: + triple = (u, v, w) + triples.append(triple) + elif p0: + triple0 = (u, v, 0) + triples0.append(triple0) + triples = np.array(triples) + triples0 = np.array(triples0) + if p0: + return triples, triples0 + return triples + + diff --git a/SiNE/stemmodels.py b/SiNE/stemmodels.py new file mode 100644 index 0000000..c444f8c --- /dev/null +++ b/SiNE/stemmodels.py @@ -0,0 +1,184 @@ +import torch +import torch.nn as nn +import torch.optim as optim +from torch.autograd import Variable +#from imblearn.under_sampling import RandomUnderSampler +import numpy as np +import SiNEmaster.util as util # Altered code. 
import numpy as np  # NOTE(review): duplicate of the numpy import above; kept as-is


# From https://github.com/CompNet/SignedCentrality


def regularize(parameter, p=2):
    """L-p norm of one parameter tensor, used as a weight penalty."""
    zeros = torch.zeros_like(parameter)
    diff = torch.abs(parameter - zeros)
    norm = torch.norm(diff, p)
    return norm


class GraphEmbeddingModel(nn.Module):
    """Abstract base for node-embedding models: subclasses must provide
    get_embedding() and get_all_weights(); edge features and parameter
    regularization are shared here."""

    def __init__(self):
        super().__init__()

    def get_embedding(self, x):
        # Subclasses return the embedding vector of node x.
        raise NotImplementedError

    def get_all_weights(self):
        # Subclasses return the full embedding matrix.
        raise NotImplementedError

    def get_edge_features(self, x, y, operation='hadamard'):
        """Combine the embeddings of nodes `x` and `y` with one of the
        operators in util.FEATURE_FUNCS."""
        func = util.FEATURE_FUNCS[operation]
        x_emb = self.get_embedding(x)
        y_emb = self.get_embedding(y)
        return func(x_emb, y_emb)

    def regularize(self, lam=0.0055, p=2):
        """Sum of L-p norms of all parameters, scaled by `lam` (1-elem tensor)."""
        regularizer_term = Variable(torch.zeros(1))
        for parameter in self.parameters():
            # module-level regularize(), not recursion into this method
            regularizer_term += regularize(parameter, p)
        regularizer_term *= lam
        return regularizer_term




class EnergyToProbsLayer(nn.Module):
    """Map a column of energies to [P(negative), P(positive)] via a sigmoid;
    the two columns sum to one."""
    def __init__(self):
        super(EnergyToProbsLayer, self).__init__()
        self.transform = nn.Sigmoid()
    def forward(self, x):
        ones = torch.ones_like(x)
        positive_prob = self.transform(x)
        negative_prob = ones - positive_prob
        output = torch.cat((negative_prob, positive_prob), dim=1)
        #output = torch.log(output)
        return output




#-----------------------------------------------------------------------------------------------------------------
#
# SiNE model of Wang et al.
#
#-----------------------------------------------------------------------------------------------------------------
+# +#----------------------------------------------------------------------------------------------------------------- + + + +class SiNESubModule(nn.Module): + def __init__(self, input_dim, output_dim): + super().__init__() + self.layer = nn.Linear(input_dim, output_dim, bias=False) + self.bias = nn.Parameter(torch.zeros(output_dim)) + self.tanh = nn.Tanh() + + initrange = np.sqrt(6.0/(input_dim + output_dim)) + self.layer.weight.data.uniform_(-initrange, initrange) + + def forward(self, input1): + x = self.layer(input1) + x += self.bias + x = self.tanh(x) + return x + + +class SiNECompModule(nn.Module): + def __init__(self, input_dim, output_dim): + super().__init__() + self.layer1 = nn.Linear(input_dim, output_dim, bias=False) + self.layer2 = nn.Linear(input_dim, output_dim, bias=False) + self.bias = nn.Parameter(torch.zeros(output_dim)) + self.tanh = nn.Tanh() + + initrange = np.sqrt(6.0 / (input_dim + output_dim)) + self.layer1.weight.data.uniform_(-initrange, initrange) + self.layer2.weight.data.uniform_(-initrange, initrange) + + def forward(self, input1, input2): + x = self.layer1(input1) + self.layer2(input2) + x += self.bias + x = self.tanh(x) + return x + + + + + + +class SiNE(GraphEmbeddingModel): + def __init__(self, num_nodes, dims_arr): + super().__init__() + self.embeddings = nn.Embedding(num_nodes + 1, dims_arr[0]) + self.embeddings.weight.data.uniform_(-0.0, 0) + self.layers = [] + self.comp_layer = SiNECompModule(dims_arr[0], dims_arr[1]) + length = len(dims_arr) + for i in range(1, length - 1): + layer = SiNESubModule(dims_arr[i], dims_arr[i + 1]) + self.add_module('l{0}'.format(i), layer) + self.layers.append(layer) + layer = SiNESubModule(dims_arr[-1], 1) + self.add_module('l{0}'.format(len(dims_arr)), layer) + self.layers.append(layer) + + def get_all_weights(self): + res = self.embeddings.weight.data.numpy() + return res + + def forward(self, xi, xj, xk, delta): + i_emb = self.embeddings(xi) + j_emb = self.embeddings(xj) + k_emb = 
self.embeddings(xk) + + zn1 = self.comp_layer(i_emb, j_emb) + zn2 = self.comp_layer(i_emb, k_emb) + + for layer in self.layers: + zn1 = layer(zn1) + zn2 = layer(zn2) + + f_pos = zn1 + f_neg = zn2 + + zeros = Variable(torch.zeros(1)) + + loss = torch.max(zeros, f_neg + delta - f_pos) + loss = torch.sum(loss) + loss = torch.tensor([loss]) # Added code. + + return loss + + def get_embedding(self, x): + x = Variable(torch.LongTensor([int(x)])) + emb = self.embeddings(x) + emb = emb.data.numpy()[0] + return emb + + def get_x(self): # Added code. + return self.layers[-1].layer.weight.data # Added code. + + + +def fit_sine_model(num_nodes, dims_arr, triples, triples0, delta, delta0, batch_size, batch_size0, epochs, + lr=0.01, lam=0.0055, lr_decay=0.0, p=2, print_loss=True, p0=True): + sine = SiNE(num_nodes, dims_arr) + optimizer = optim.Adagrad(sine.parameters(), lr=lr, lr_decay=lr_decay) + for epoch in range(epochs): + optimizer.zero_grad() + C = batch_size + xi, xj, xk = util.get_triples_training_batch(triples, batch_size) + loss = sine(xi, xj, xk, delta) + if p0: + xi, xj, xk = util.get_triples_training_batch(triples0, batch_size0) + loss += sine(xi, xj, xk, delta0) + C += batch_size0 + loss /= C + loss += sine.regularize(lam, p) + loss.backward() + optimizer.step() + if print_loss: + print('Loss at epoch ', epoch + 1, ' is ', loss.data[0]) + return sine + diff --git a/SiNE/util.py b/SiNE/util.py new file mode 100644 index 0000000..17221e1 --- /dev/null +++ b/SiNE/util.py @@ -0,0 +1,170 @@ +import torch +import torch.nn as nn +from torch.autograd import Variable +import numpy as np +from collections import OrderedDict +import SiNEmaster.stemgraph as graph # Altered code. 
import networkx as nx

# From https://github.com/CompNet/SignedCentrality

def get_num_unbalanced(X, y):
    """Fraction of continuations of edges (*, v) that are negative.

    X is an (n, 2) array of edges, y the matching labels (0 = negative).
    Returns 0.0 when no edge has a continuation (avoids ZeroDivisionError).
    """
    indicies = np.arange(0, len(X))
    count = 0.0
    negatives = 0.0
    for _, v in X:  # first endpoint is unused
        connecting_idx = indicies[X[:, 0] == v]
        count += len(connecting_idx)
        negs = 1 - y[connecting_idx]
        negatives += sum(negs)
    if count == 0:
        return 0.0
    return negatives / count

def remove_edge_sign(input_filepath, output_filepath, indelimiter=',', outdelimiter=' '):
    """Copy an edge list, dropping the last (sign) column and re-delimiting."""
    with open(input_filepath, 'r') as ifp:
        with open(output_filepath, 'w') as ofp:
            for line in ifp:
                contents = line.split(indelimiter)
                ofp.write(outdelimiter.join(contents[:-1]) + '\n')

def embedding_layer_from_numpy(arr, start_with=1):
    """Build an nn.Embedding whose weights are `arr`; with start_with == 1 a
    random row is prepended so real ids start at index 1."""
    rows, cols = arr.shape
    if start_with == 1:
        rows += 1
    embeddings = nn.Embedding(rows, cols)
    brr = arr
    if start_with == 1:
        temp = np.random.rand(1, cols)
        brr = np.concatenate((temp, arr))
    # BUG FIX: nn.Parameter requires a torch.Tensor; the original wrapped the
    # raw numpy array, which raises a TypeError.
    embeddings.weight = nn.Parameter(torch.from_numpy(brr).float())
    return embeddings

def embedding_from_file(filepath, ids_explicit=True, delimiter=' ', start_with=1):
    """Load embedding vectors from a text file (one row per line, optionally
    id-prefixed) into an nn.Embedding, rows ordered by id."""
    with open(filepath, 'r') as fp:
        count = start_with
        nrows = 0
        d = dict()
        for line in fp:
            # BUG FIX: map() is lazy in Python 3 and cannot be subscripted
            data = list(map(float, line.split(delimiter)))
            if ids_explicit:
                d[int(data[0])] = data[1:]
                nrows = max(nrows, int(data[0]))
            else:
                d[count] = data
                nrows = count
                count += 1
        sorted_keys = sorted(d.keys())
        arr = []
        for key in sorted_keys:
            arr.append(d[key])
        arr = np.array(arr)
    return embedding_layer_from_numpy(arr, start_with)


def array_edgelist_to_graph(X, directed=False):
    """Build an (un)directed networkx graph from an (n, 2) edge array."""
    graph = nx.DiGraph() if directed else nx.Graph()
    for u, v in X:
        graph.add_edge(u, v)
    return graph


def tensorfy_col(x, col_idx, tensor_type='long'):
    """
    Extracts a column from a numpy array and wraps it as a PyTorch Variable
    Parameters
    ----------
    x : np.array
        A 2D Numpy array
    col_idx : int
        The column to extract
    tensor_type : str (optional)
        The type of tensor to create from the column (default is 'long')

    Returns
    -------
    Pytorch Variable
        A Pytorch Variable wrapping the specified column from the submitted array

    """
    col = x[:,col_idx]
    if tensor_type == 'long':
        col = torch.LongTensor(col)
    if tensor_type == 'float':
        col = torch.FloatTensor(col)
    col = Variable(col)
    return col



def get_triples_training_batch(triples, batch_size):
    """Sample `batch_size` triple rows as (xi, xj, xk) id tensors; falls back
    to sampling with replacement when fewer than batch_size rows exist."""
    nrows = triples.shape[0]
    try:
        rows = np.random.choice(nrows, batch_size, replace=False)
    except ValueError:
        # raised when batch_size > nrows; the original bare `except:` would
        # also have hidden unrelated errors
        rows = np.random.choice(nrows, batch_size, replace=True)
    choosen = triples[rows,:]
    xi = tensorfy_col(choosen, 0)
    xj = tensorfy_col(choosen, 1)
    xk = tensorfy_col(choosen, 2)
    return xi, xj, xk


# Edge-feature combination operators (operate on numpy embedding vectors).

def hadamard(x, y):
    return x * y


def average(x, y):
    return (x + y)/2.0


def l1(x, y):
    return np.abs(x - y)


def l2(x, y):
    return np.power(x - y, 2)


def concat(x, y):
    # NOTE(review): concatenates along axis=0, unlike model.concat (axis=1)
    return np.concatenate((x, y), axis=0)


def graph_from_numpy_array(X_train, num_nodes, directed=True):
    """Graph over node ids 0..num_nodes (isolated nodes included) with the
    edges of X_train added."""
    all_nodes = set(range(0, num_nodes + 1))
    graph = nx.DiGraph() if directed else nx.Graph()
    graph.add_nodes_from(all_nodes)
    for u, v in X_train:
        graph.add_edge(u, v)
    return graph

FEATURE_FUNCS = {
    'l1': l1,
    'l2': l2,
    'concat': concat,
    'average': average,
    'hadamard': hadamard
}


def triples_from_array(X, y, directed=True):
    """Build (triples, triples0) from a labelled edge array via stemgraph."""
    p, n = graph.from_edgelist_array_to_graph(X, y, directed=directed)
    triples, triples0 = graph.get_triples(p, n, True)
    return triples, triples0