"""Preprocessing of the NELL2RDF dump into numeric triple datasets.

Selects facts over a fixed set of NELL relations, filters them to a sports-centric
subset of entities, optionally materialises simple relation-hierarchy rules, and
writes train/valid/test splits (plus RotatE-style dictionary files) to disk.
"""
from collections import Counter
import os

import numpy as np

import Cnst
from KBUtils import MissingDict
from RuleParser import RuleParser, get_terminal_nodes

SELECTED_RELATIONS = [
    "https://w3id.org/nellrdf/ontology/agentbelongstoorganization",
    "https://w3id.org/nellrdf/ontology/athletecoach",
    "https://w3id.org/nellrdf/ontology/teamplaysinleague",
    "https://w3id.org/nellrdf/ontology/personbelongstoorganization",
    "https://w3id.org/nellrdf/ontology/athleteplayedforschool",
    "https://w3id.org/nellrdf/ontology/athleteplaysforteam",
    "https://w3id.org/nellrdf/ontology/athleteplaysinleague",
    "https://w3id.org/nellrdf/ontology/coachesinleague",
    "https://w3id.org/nellrdf/ontology/athleteledsportsteam",
    "https://w3id.org/nellrdf/ontology/worksfor",
    "https://w3id.org/nellrdf/ontology/coachesteam"]

# Subset of SELECTED_RELATIONS restricted to the sports domain.
SELECTED_SPORTS_RELATIONS = [
    "https://w3id.org/nellrdf/ontology/athletecoach",
    "https://w3id.org/nellrdf/ontology/teamplaysinleague",
    "https://w3id.org/nellrdf/ontology/athleteplayedforschool",
    "https://w3id.org/nellrdf/ontology/athleteplaysforteam",
    "https://w3id.org/nellrdf/ontology/athleteplaysinleague",
    "https://w3id.org/nellrdf/ontology/coachesinleague",
    "https://w3id.org/nellrdf/ontology/athleteledsportsteam",
    "https://w3id.org/nellrdf/ontology/coachesteam"]

# Relations in the N-Triples dump appear as <URI>, so tagged copies are used for matching.
SELECTED_RELATIONS_WITH_TAGS = ["<" + x + ">" for x in SELECTED_RELATIONS]
SELECTED_SPORTS_RELATIONS_WITH_TAGS = ["<" + x + ">" for x in SELECTED_SPORTS_RELATIONS]
SELECTED_REL_DICT = {rel: idx for idx, rel in enumerate(SELECTED_RELATIONS_WITH_TAGS)}
NELL_FILE = "NELL2RDF_0.3_1100.nt"


def parse_ntriple(input_ntriple: str):
    """Split a single N-Triples line into (subject, relation, object)."""
    split_triple = input_ntriple.strip().split(" ")
    if len(split_triple) > 4:
        # Literal object containing spaces: everything between the relation
        # and the trailing "." belongs to the object.
        ent_2 = " ".join(split_triple[2:-1])
    else:
        ent_2 = split_triple[2]
    ent_1 = split_triple[0]
    relation = split_triple[1]
    return ent_1, relation, ent_2
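
# Illustrative sketch (not part of the original pipeline; the URIs/literal below are made up):
# parse_ntriple re-joins literal objects that contain spaces and drops the " ." terminator.
#   >>> parse_ntriple('<http://ex.org/a> <http://ex.org/rel> "New York Knicks" .')
#   ('<http://ex.org/a>', '<http://ex.org/rel>', '"New York Knicks"')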


def materialise_simple_hierarchy_prep(rule_file):
    """Parse the rule file and return (body relation, head relation) pairs.

    Only simple implication rules with a single atom on each side are supported;
    otherwise an error message is printed and None is returned.
    """
    rules = RuleParser(rule_file, enforce_same=False).get_parsed_rules()
    rule_lhs_terminals = [get_terminal_nodes(rule[0]) for rule in rules]
    rule_rhs_terminals = [get_terminal_nodes(rule[1]) for rule in rules]
    rule_implications = np.array([rule[2] == Cnst.IMPLIES for rule in rules])
    nb_lhs_terminals = np.array([len(x) for x in rule_lhs_terminals])
    nb_rhs_terminals = np.array([len(x) for x in rule_rhs_terminals])
    if np.amax(nb_lhs_terminals) > 1 or np.amax(nb_rhs_terminals) > 1:
        print("One or more rules contains more than a single atom on at least one side")
        return None
    if not np.all(rule_implications):
        print("Equivalence rules provided. Please rewrite them as implications and try again")
        return None
    lhs_to_rhs_rel = [(rule[0].rel, rule[1].rel) for rule in rules]
    return lhs_to_rhs_rel
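
# Note on the return value: each pair is (body relation, head relation) taken from the
# parsed rules, e.g. a hypothetical [(5, 0)] encodes "relation 5 implies relation 0".
# The ids must use the same relation encoding as the triples passed to the step below.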


def materialise_simple_hierarchy_step(numeric_triples_np, lhs_to_rhs_rel):
    """Apply every body-relation -> head-relation rule once.

    Returns the de-duplicated union of the input and derived triples, together
    with the derived triples themselves.
    """
    triples_to_inject = [numeric_triples_np]
    for old_rel, new_rel in lhs_to_rhs_rel:
        relevant_triples = numeric_triples_np[numeric_triples_np[:, 1] == old_rel]
        new_triples = np.copy(relevant_triples)
        new_triples[:, 1] = new_rel
        triples_to_inject.append(new_triples)
    all_triples = np.concatenate(triples_to_inject, axis=0)
    mat_triples = all_triples[numeric_triples_np.shape[0]:, :]
    return np.unique(all_triples, axis=0), mat_triples
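
# Hypothetical mini-example of a single step, with one rule "relation 5 implies relation 10":
#   >>> triples = np.array([[0, 5, 1], [2, 7, 3]])
#   >>> all_t, new_t = materialise_simple_hierarchy_step(triples, [(5, 10)])
# new_t is [[0, 10, 1]] and all_t is the de-duplicated union of the inputs and new_t.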


def materialise_simple_hierarchy(numeric_triples_np, rule_file):
    """Repeatedly apply the hierarchy rules until no new triples are derived.

    Returns the materialised triple set and the array of newly derived (unique) triples.
    """
    numeric_triples_np = np.array(numeric_triples_np)
    lhs_to_rhs_rel = materialise_simple_hierarchy_prep(rule_file)
    if lhs_to_rhs_rel is None:
        raise ValueError("Rule file could not be used for simple hierarchy materialisation")
    step_count = 0
    current_triples_np = numeric_triples_np
    new_triples = None
    while True:
        old_nb_facts = current_triples_np.shape[0]
        current_triples_np, new_trip = materialise_simple_hierarchy_step(current_triples_np, lhs_to_rhs_rel)
        step_count += 1
        if step_count == 1:
            new_triples = new_trip
        else:
            new_triples = np.concatenate([new_triples, new_trip], axis=0)
        new_nb_facts = current_triples_np.shape[0]
        if new_nb_facts == old_nb_facts:
            break
    return current_triples_np, np.unique(new_triples, axis=0)
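
# Why a fixed-point loop: with chained rules such as [(5, 6), (6, 7)] (hypothetical ids),
# a relation-5 triple yields a relation-6 triple in the first pass and a relation-7 triple
# only in the second, so steps are repeated until the fact count stops growing.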


def count_entities_in_triples(triples):
    """Count how often each entity appears as a subject or object."""
    original_list = [x[0] for x in triples] + [x[2] for x in triples]
    return Counter(original_list)


def get_entities_in_triples(triples, relation_subset=None, unique=True, return_dict=True):
    """Collect the entities occurring in triples, optionally restricted to a relation subset.

    When return_dict is True, also return a membership dictionary that defaults to False
    for unseen entities.
    """
    if relation_subset is None:
        original_list = [x[0] for x in triples] + [x[2] for x in triples]
    else:
        original_list = ([x[0] for x in triples if x[1] in relation_subset]
                         + [x[2] for x in triples if x[1] in relation_subset])
    if unique:
        original_list = list(set(original_list))
    if return_dict:
        m_dict = MissingDict(lambda: False)
        for ent in original_list:
            m_dict[ent] = True
        return original_list, m_dict
    return original_list
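
# Hedged usage sketch (mirrors the membership test in prepare_dataset below; `triples` is
# a hypothetical list of tagged string triples):
#   >>> ents, is_sports = get_entities_in_triples(triples, SELECTED_SPORTS_RELATIONS_WITH_TAGS)
#   >>> kept = [t for t in triples if is_sports[t[0]] and is_sports[t[2]]]
# Entities never seen under the sports relations default to False in the returned MissingDict.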


def map_triples(triples, ent_dict=None):
    """Map string triples to numeric (entity_idx, relation_idx, entity_idx) rows."""
    if ent_dict is None:
        ent_dict = map_entities_to_idx(triples)
    return np.array([(ent_dict[triple[0]], SELECTED_REL_DICT[triple[1]], ent_dict[triple[2]])
                     for triple in triples])


def map_entities_to_idx(triples):
    """Assign an index to every entity, in alphabetical order."""
    original_list = sorted(get_entities_in_triples(triples, unique=True, return_dict=False))
    return {ent: idx for idx, ent in enumerate(original_list)}
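
# Sketch with made-up triples: entity ids follow alphabetical order, relation ids follow
# SELECTED_REL_DICT.
#   >>> trips = [("<a>", SELECTED_RELATIONS_WITH_TAGS[0], "<b>")]
#   >>> map_triples(trips, ent_dict=map_entities_to_idx(trips))
#   array([[0, 0, 1]])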


def get_facts_with_relation(selected_relations, triple_store):
    """Stream the N-Triples file and keep only facts whose relation is in selected_relations."""
    retained_triples = []
    with open(triple_store, "r") as nell:
        for line in nell:
            ent1, rel, ent2 = parse_ntriple(line)
            if rel in selected_relations:
                retained_triples.append((ent1, rel, ent2))
    return retained_triples


def prepare_dataset(materialise=True, rule_dir="RulesNELLPreMap.txt", sel_rel=None, nell=NELL_FILE,
                    pre_loaded_triples=None, train_split=0.9, val_split=0.95, rotatefriendly=True,
                    name="NELLHierarchies", mat_split=False):
    """Filter the NELL triples, optionally materialise hierarchy rules, and write the dataset.

    When mat_split is True, materialise must also be True: the newly materialised triples
    are held out of training and split evenly between validation and test.
    """
    if sel_rel is None:
        sel_rel = SELECTED_RELATIONS_WITH_TAGS
    if pre_loaded_triples is None:
        retained_triples = get_facts_with_relation(selected_relations=sel_rel, triple_store=nell)
    else:
        retained_triples = pre_loaded_triples
    # Keep only triples whose subject and object both occur in a sports relation.
    sports_entities, m_dict = get_entities_in_triples(retained_triples, SELECTED_SPORTS_RELATIONS_WITH_TAGS,
                                                      unique=True, return_dict=True)
    surviving_triples = [triple for triple in retained_triples if m_dict[triple[0]] and m_dict[triple[2]]]
    counter = count_entities_in_triples(surviving_triples)
    # Keep triples whose subject and object each occur at least 50 times,
    # plus every "athleteplayedforschool" fact (index 4 in SELECTED_RELATIONS_WITH_TAGS).
    filtered_triples = [triple for triple in surviving_triples if
                        (counter[triple[0]] >= 50 and counter[triple[2]] >= 50)
                        or triple[1] == SELECTED_RELATIONS_WITH_TAGS[4]]
    ent_dict = map_entities_to_idx(filtered_triples)
    mapped_triples = map_triples(filtered_triples, ent_dict=ent_dict)
    write_dir = Cnst.DEFAULT_KB_DIR + name
    if materialise:
        mapped_triples_2, mat_triples = materialise_simple_hierarchy(mapped_triples, rule_dir)
        write_dir = write_dir + "Mat"
    else:
        mapped_triples_2 = mapped_triples
        mat_triples = None
    write_dir = write_dir + "/"
    if mat_split:
        # Materialised triples that are not already among the original mapped triples
        # are held out and split evenly across validation and test.
        in_training = MissingDict(lambda: False)
        for triple in mapped_triples:
            in_training[tuple(triple)] = True
        new_mat_triples = np.array([triple for triple in mat_triples if not in_training[tuple(triple)]])
        np.random.shuffle(mapped_triples)
        np.random.shuffle(new_mat_triples)
        even_mat_split = new_mat_triples.shape[0] // 2
        train_idx = int(train_split * mapped_triples.shape[0])
        val_idx = int(val_split * mapped_triples.shape[0])
        train_triples = mapped_triples[:train_idx, :]
        if train_split < 1:
            val_triples = np.concatenate([mapped_triples[train_idx:val_idx, :],
                                          new_mat_triples[:even_mat_split, :]], axis=0)
        else:
            val_triples = new_mat_triples[:even_mat_split, :]
        if train_split < 1 and val_split < 1:
            test_triples = np.concatenate([mapped_triples[val_idx:, :],
                                           new_mat_triples[even_mat_split:, :]], axis=0)
        else:
            test_triples = new_mat_triples[even_mat_split:, :]
    else:
        np.random.shuffle(mapped_triples_2)
        train_idx = int(train_split * mapped_triples_2.shape[0])
        val_idx = int(val_split * mapped_triples_2.shape[0])
        train_triples = mapped_triples_2[:train_idx, :]
        val_triples = mapped_triples_2[train_idx:val_idx, :]
        test_triples = mapped_triples_2[val_idx:, :]
    if not os.path.exists(Cnst.DEFAULT_KB_DIR):
        os.mkdir(Cnst.DEFAULT_KB_DIR)
    if not os.path.exists(write_dir):
        os.mkdir(write_dir)
    np.savetxt(os.path.join(write_dir, "train.txt"), train_triples, delimiter='\t', fmt='%i')
    np.savetxt(os.path.join(write_dir, "valid.txt"), val_triples, delimiter='\t', fmt='%i')
    np.savetxt(os.path.join(write_dir, "test.txt"), test_triples, delimiter='\t', fmt='%i')
    if rotatefriendly:
        # Triples are already numeric, so the id -> label dictionaries are identity mappings.
        with open(os.path.join(write_dir, "entities.dict"), "w") as f:
            f.writelines([str(x) + "\t" + str(x) + "\r\n" for x in range(len(ent_dict))])
        with open(os.path.join(write_dir, "relations.dict"), "w") as f:
            f.writelines([str(x) + "\t" + str(x) + "\r\n" for x in range(len(SELECTED_RELATIONS))])
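
# Resulting directory layout (as written by prepare_dataset above), under
# Cnst.DEFAULT_KB_DIR + name (+ "Mat" when materialise=True):
#   train.txt / valid.txt / test.txt   tab-separated integer triples
#   entities.dict / relations.dict     identity id mappings, written when rotatefriendly=True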


if __name__ == "__main__":
    retained_triples = get_facts_with_relation(SELECTED_RELATIONS_WITH_TAGS, NELL_FILE)
    prepare_dataset(pre_loaded_triples=retained_triples, train_split=0.9, val_split=0.95,
                    name="NELLRuleInjSplit90", mat_split=True)