from hashlib import md5

import tqdm
from gensim.models.word2vec import Word2Vec
from sklearn.utils.validation import check_is_fitted

from graph import Vertex, rdflib_to_kg


class UnknownEntityError(Exception):
    """Raised when an embedding is requested for an instance that was not
    seen during fit."""
    pass


class RDF2VecTransformer():
    """Project random walks or subtrees in graphs into embeddings, suited
    for classification.

    Parameters
    ----------
    vector_size: int (default: 500)
        The dimension of the embeddings.
    max_path_depth: int (default: 1)
        The maximum number of hops to take in the knowledge graph. Because
        we transform s -(p)-> o into s -> p -> o, this translates to
        `2 * max_path_depth` hops internally.
    wl: bool (default: True)
        Whether to use Weisfeiler-Lehman labels in the extracted walks.
    wl_iterations: int (default: 4)
        The number of Weisfeiler-Lehman iterations. Ignored if `wl` is False.
    walks_per_graph: int (default: infinity)
        The maximum number of walks to extract from the neighborhood of
        each instance.
    n_jobs: int (default: 1)
        gensim.models.Word2Vec parameter.
    window: int (default: 5)
        gensim.models.Word2Vec parameter.
    sg: int (default: 1)
        gensim.models.Word2Vec parameter.
    max_iter: int (default: 10)
        gensim.models.Word2Vec parameter.
    negative: int (default: 25)
        gensim.models.Word2Vec parameter.
    min_count: int (default: 1)
        gensim.models.Word2Vec parameter.

    Attributes
    ----------
    model: gensim.models.Word2Vec
        The fitted Word2Vec model. Embeddings can be accessed through
        `self.model.wv.get_vector(str(instance))`.
    """

    def __init__(self, vector_size=500, max_path_depth=1, wl=True,
                 wl_iterations=4, walks_per_graph=float('inf'), n_jobs=1,
                 window=5, sg=1, max_iter=10, negative=25, min_count=1):
        self.vector_size = vector_size
        self.max_path_depth = max_path_depth
        self.wl = wl
        self.wl_iterations = wl_iterations
        self.walks_per_graph = walks_per_graph
        self.n_jobs = n_jobs
        self.window = window
        self.sg = sg
        self.max_iter = max_iter
        self.negative = negative
        self.min_count = min_count

    def print_walks(self, walks):
        """Write the extracted walks to `test.txt`, one walk per block,
        with hops separated by `-->` arrows."""
        walk_strs = []
        for walk in walks:
            s = ''
            for i, hop in enumerate(walk):
                s += '{} '.format(hop)
                if i < len(walk) - 1:
                    s += '--> '
            walk_strs.append(s)

        with open("test.txt", "w") as myfile:
            for s in walk_strs:
                myfile.write(s)
                myfile.write('\n\n')

    def _extract_random_walks(self, graph, instance):
        walks = graph.extract_random_walks(self.max_path_depth*2, instance,
                                           max_walks=self.walks_per_graph)

        canonical_walks = set()
        for walk in walks:
            canonical_walk = []
            for i, hop in enumerate(walk):
                if i == 0:
                    canonical_walk.append(hop.name)
                else:
                    # Take the first 8 bytes of the MD5 hash, allowing for
                    # 256**8 unique entities.
                    digest = md5(hop.name.encode()).digest()[:8]
                    canonical_walk.append(str(digest))
            canonical_walks.add(tuple(canonical_walk))
        return list(canonical_walks)

    def _extract_wl_walks(self, graph, instance):
        walks = graph.extract_random_walks(self.max_path_depth*2, instance,
                                           max_walks=self.walks_per_graph)

        canonical_walks = set()
        for n in range(self.wl_iterations + 1):
            for walk in walks:
                canonical_walk = []
                for i, hop in enumerate(walk):
                    # For the root and predicates, we just append the name.
                    if i == 0 or i % 2 == 1:
                        canonical_walk.append(hop.name)
                    # For entities, we take the Weisfeiler-Lehman label of
                    # iteration n.
                    else:
                        canonical_walk.append(graph._label_map[hop][n])
                canonical_walks.add(tuple(canonical_walk))
        return canonical_walks

    def fit(self, graph, instances):
        """Fit the embedding network based on provided instances.

        Parameters
        ----------
        graph: graph.KnowledgeGraph
            The graph from which we will extract neighborhoods for the
            provided instances. You can create a `graph.KnowledgeGraph`
            object from an `rdflib.Graph` object by using `rdflib_to_kg`.
        instances: array-like
            The instances for which an embedding will be created. It is
            important to note that the test instances should be passed to
            the fit method as well. Since RDF2Vec is unsupervised, this
            causes no label leakage.
        """
        if self.wl:
            graph.weisfeiler_lehman(iterations=self.wl_iterations)

        all_walks = []
        for instance in tqdm.tqdm(instances):
            if self.wl:
                walks = self._extract_wl_walks(graph, Vertex(str(instance)))
            else:
                walks = self._extract_random_walks(graph,
                                                   Vertex(str(instance)))
            all_walks += list(walks)
        print('Extracted {} walks for {} instances!'.format(len(all_walks),
                                                            len(instances)))

        sentences = [list(map(str, x)) for x in all_walks]

        print("Starting Word2Vec training")
        # Note: `size` and `iter` are the gensim < 4.0 parameter names; in
        # gensim >= 4.0 they are called `vector_size` and `epochs`.
        self.model = Word2Vec(sentences, size=self.vector_size,
                              window=self.window, workers=self.n_jobs,
                              sg=self.sg, iter=self.max_iter,
                              negative=self.negative,
                              min_count=self.min_count, seed=42)
        print("Word2Vec training finished")

    def transform(self, graph, instances):
        """Construct a feature vector for the provided instances.

        Parameters
        ----------
        graph: graph.KnowledgeGraph
            The graph from which we will extract neighborhoods for the
            provided instances. You can create a `graph.KnowledgeGraph`
            object from an `rdflib.Graph` object by using `rdflib_to_kg`.
        instances: array-like
            The instances for which an embedding will be created. These
            instances must have been passed to the fit method as well,
            or their embedding will not exist in the model vocabulary.

        Returns
        -------
        embeddings: array-like
            The embeddings of the provided instances.
        """
        check_is_fitted(self, ['model'])

        feature_vectors = []
        for instance in instances:
            try:
                feature_vectors.append(
                    self.model.wv.get_vector(str(instance)))
            except KeyError:
                raise UnknownEntityError('{} was not seen during fit'
                                         .format(instance))
        return feature_vectors

    def fit_transform(self, graph, instances):
        """First apply fit to create a Word2Vec model and then generate
        embeddings for the provided instances.

        Parameters
        ----------
        graph: graph.KnowledgeGraph
            The graph from which we will extract neighborhoods for the
            provided instances. You can create a `graph.KnowledgeGraph`
            object from an `rdflib.Graph` object by using `rdflib_to_kg`.
        instances: array-like
            The instances for which an embedding will be created.

        Returns
        -------
        embeddings: array-like
            The embeddings of the provided instances.
        """
        self.fit(graph, instances)
        return self.transform(graph, instances)
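

# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the library API. It assumes a
# local Turtle file `data.ttl` and placeholder entity URIs, and that
# `rdflib_to_kg` accepts a parsed `rdflib.Graph`, as the docstrings above
# suggest. Adjust file names, URIs, and hyperparameters to your own data.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import rdflib

    # Parse an RDF file with rdflib and convert it to a graph.KnowledgeGraph.
    g = rdflib.Graph()
    g.parse('data.ttl', format='turtle')
    kg = rdflib_to_kg(g)

    # The entities we want embeddings for; they must occur in the graph.
    entities = [
        'http://example.org/resource/1',
        'http://example.org/resource/2',
    ]

    # Fit Word2Vec on Weisfeiler-Lehman walks and retrieve the embeddings.
    transformer = RDF2VecTransformer(vector_size=100, max_path_depth=2,
                                     wl=True, walks_per_graph=500)
    embeddings = transformer.fit_transform(kg, entities)
    print(len(embeddings), 'embeddings of dimension', len(embeddings[0]))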