-
Notifications
You must be signed in to change notification settings - Fork 2
/
13_analyse_results.py
147 lines (117 loc) · 5.48 KB
/
13_analyse_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import csv
import datetime
import codecs
import ujson
from rdflib import Graph, URIRef, RDFS
from collections import Counter, defaultdict
from utilwebisadb import set_csv_field_size, read_redirects
def read_types(redirects, subject_object, file_path):
    """Collect DBpedia ontology types per subject from a triple dump.

    Every line of ``file_path`` that starts with ``<`` is read as one triple:
    the subject is the text inside the first ``<...>`` and the object is the
    text between the last ``<`` and the last ``>`` of the line. All other
    lines are skipped. Subjects are mapped through ``redirects`` before being
    used as keys. Only objects that start with
    ``http://dbpedia.org/ontology/`` are recorded, each at most once per
    subject and in order of first appearance.

    Args:
        redirects: Mapping from a subject IRI to its replacement IRI;
            subjects without an entry are kept unchanged.
        subject_object: Mapping from subject IRI to a list of type IRIs. It
            is updated in place and has to create an empty list for unknown
            keys (e.g. ``defaultdict(list)``).
        file_path: Path of the UTF-8 encoded dump to read.

    Raises:
        ValueError: If a line starting with ``<`` contains no ``>``.
    """
    ontology_prefix = 'http://dbpedia.org/ontology/'
    print("{} - start loading {}".format(datetime.datetime.now(), file_path))
    # Built-in open() only breaks lines at \n, \r and \r\n. The readers
    # returned by codecs.open() split with str.splitlines() rules, so a
    # character such as U+2028 or U+001C inside a triple would cut it in two.
    with open(file_path, 'r', encoding='utf8') as triples:
        for i, line in enumerate(triples):
            if not line.startswith('<'):
                continue
            subject = line[1:line.index('>')]
            subject = redirects.get(subject, subject)
            type_iri = line[line.rindex('<') + 1:line.rindex('>')]
            if type_iri.startswith(ontology_prefix):
                known_types = subject_object[subject]
                if type_iri not in known_types:
                    known_types.append(type_iri)

            # Progress report; `i` counts every line of the file, but only
            # lines that start with '<' reach this point.
            if i % 1000000 == 0:
                print("{} - {} imported".format(datetime.datetime.now(), i))
def read_instance_types():
    """Merge the enabled instance-type dumps into ``instance_types.json``.

    Loads the redirects via ``read_redirects()``, feeds every enabled dump
    through ``read_types`` so that all of them fill one shared
    subject -> list-of-types mapping, and writes that mapping as JSON to
    ``instance_types.json`` in the current working directory.
    """
    # Dumps that are currently switched off:
    #   instance_types_transitive_en.ttl
    #   instance_types_lhd_dbo_en.ttl
    #   instance_types_dbtax_dbo_en.ttl
    enabled_dumps = (
        'instance_types_en.ttl',
        'instance_types_sdtyped_dbo_en.ttl',
    )

    redirect_map = read_redirects()
    types_by_subject = defaultdict(list)
    for dump_name in enabled_dumps:
        read_types(redirect_map, types_by_subject, dump_name)

    print('write json')
    with open('instance_types.json', 'w') as json_out:
        ujson.dump(types_by_subject, json_out)
    print('finish')
def read_sub_class_of():
    """Load the subclass relation of the DBpedia ontology as a dict.

    Parses ``dbpedia_2016-04.nt`` as N-Triples and keeps every
    ``rdfs:subClassOf`` statement whose subject and object both start with
    ``http://dbpedia.org/ontology/``. If a class has several such parents,
    the one yielded last by the graph is the one that remains.

    Returns:
        dict mapping a class IRI (str) to the IRI (str) of one superclass.
    """
    namespace = 'http://dbpedia.org/ontology/'

    ontology = Graph()
    ontology.parse("dbpedia_2016-04.nt", format="nt")

    # Stringify once so the namespace filter works on plain str values.
    pairs = (
        (str(child), str(parent))
        for child, parent in ontology.subject_objects(RDFS.subClassOf)
    )
    return {
        child: parent
        for child, parent in pairs
        if child.startswith(namespace) and parent.startswith(namespace)
    }
def get_highest_class(list_of_types, sub_class_of):
    """Return the most general class of the first type below a stop class.

    Only the first element of ``list_of_types`` is looked at. Starting from
    it, ``sub_class_of`` is followed upwards until the current class has no
    parent, or its parent is one of the DBpedia classes Agent, Work, Person,
    Organisation or Place. The current class at that point is returned.

    Args:
        list_of_types: Iterable of class IRIs; may be empty.
        sub_class_of: Mapping from a class IRI to the IRI of its superclass.

    Returns:
        The class IRI found as described above, or
        ``http://www.w3.org/2002/07/owl#Thing`` when ``list_of_types`` is
        empty.
    """
    stop_classes = frozenset((
        'http://dbpedia.org/ontology/Agent',
        'http://dbpedia.org/ontology/Work',
        'http://dbpedia.org/ontology/Person',
        'http://dbpedia.org/ontology/Organisation',
        'http://dbpedia.org/ontology/Place',
    ))
    # The loop body always returns, so only the first element is ever used.
    for current in list_of_types:
        visited = {current}
        while True:
            parent = sub_class_of.get(current)
            # `parent in visited` stops the climb when the mapping contains a
            # cycle (or a class that is its own parent); without it the loop
            # would never terminate.
            if parent is None or parent in stop_classes or parent in visited:
                return current
            visited.add(parent)
            current = parent
    return 'http://www.w3.org/2002/07/owl#Thing'
def get_highest_class_with_distance(list_of_types, sub_class_of, distance):
    """Return the ancestor of the first type at most ``distance`` levels deep.

    The first element of ``list_of_types`` is expanded into its chain of
    classes ordered from the root down to the type itself. The chain is cut
    to its first ``distance`` entries and the last remaining entry is
    returned. For an empty ``list_of_types`` the result is
    ``http://www.w3.org/2002/07/owl#Thing``.
    """
    if len(list_of_types) > 0:
        root_to_type = get_list_of_class_with_super_classes(list_of_types[0], sub_class_of)
        # Shrink to `distance` elements and use the last one that is left.
        truncated = root_to_type[:distance]
        return truncated[-1]
    return 'http://www.w3.org/2002/07/owl#Thing'
def get_list_of_class_with_super_classes(clazz, sub_class_of):
    """Return ``clazz`` together with its superclasses, most general first.

    ``sub_class_of`` is followed upwards from ``clazz`` until a class without
    an entry is reached.

    Args:
        clazz: The class to start from.
        sub_class_of: Mapping from a class to its superclass.

    Returns:
        A new list ordered from the topmost reachable superclass down to
        ``clazz`` itself; ``[clazz]`` when ``clazz`` has no superclass.
    """
    chain = [clazz]
    # Remember what has been seen so that a cyclic mapping (A -> B -> A, or a
    # class that is its own parent) ends the walk instead of growing the
    # list forever.
    visited = {clazz}
    parent = sub_class_of.get(clazz)
    while parent is not None and parent not in visited:
        chain.append(parent)
        visited.add(parent)
        parent = sub_class_of.get(parent)
    chain.reverse()
    return chain
def get_local_name(str):
    """Return the text after the last ``/`` of ``str``, stripped of whitespace.

    When ``str`` contains no ``/`` the whole string is stripped and returned.
    """
    # rpartition gives ('', '', str) if there is no '/', so the tail is
    # always the third element.
    _, _, tail = str.rpartition('/')
    return tail.strip()
def analyse():
    """Count the mapped resources per DBpedia class and write a CSV report.

    Steps:
      1. Load the subject -> types mapping from ``instance_types.json``.
      2. Load the subclass relation with ``read_sub_class_of()``.
      3. Collect the distinct entities listed in columns 19 and 21 (0-based)
         of ``webisa_1_final_with_mapping.csv``; both cells are parsed as
         JSON and iterated.
      4. Resolve every entity to one class with
         ``get_highest_class_with_distance`` (depth 3), using an empty type
         list for entities that have no entry in the mapping.
      5. Write ``type_analysis.csv``: one row per class holding the local
         names of its root-to-class chain, padded with ``''`` to three
         columns, followed by the count. Rows are sorted by the three name
         columns.
    """
    # Depth of the class hierarchy that is reported; it is also the number
    # of name columns in the output.
    max_depth = 3

    print("{} - read instance_types".format(datetime.datetime.now()))
    with open('instance_types.json') as data_file:
        subject_object = ujson.load(data_file)

    print("{} - read subclassof".format(datetime.datetime.now()))
    sub_class_of = read_sub_class_of()

    print("{} - read mappings".format(datetime.datetime.now()))
    mapped_resources = set()
    # newline='' leaves line-ending handling to the csv module, which its
    # documentation requires so that newlines inside quoted fields are read
    # correctly.
    with open('webisa_1_final_with_mapping.csv', newline='') as in_file:
        for row in csv.reader(in_file):
            # NOTE(review): the original comment says column 19 actually
            # only has one entity - confirm against the data.
            mapped_resources.update(ujson.loads(row[19]))
            mapped_resources.update(ujson.loads(row[21]))

    print("{} - analyse".format(datetime.datetime.now()))
    instance_types = [
        get_highest_class_with_distance(
            subject_object.get(mapped_resource, []), sub_class_of, max_depth)
        for mapped_resource in mapped_resources
    ]

    print("{} - count and write".format(datetime.datetime.now()))
    counter = Counter(instance_types)
    print(counter)

    rows = []
    for concept, count in counter.most_common():
        local_names = [
            get_local_name(clazz)
            for clazz in get_list_of_class_with_super_classes(concept, sub_class_of)
        ]
        # Fill with '' if the chain is shorter than the number of columns.
        local_names += [''] * (max_depth - len(local_names))
        rows.append(local_names + [count])
    rows.sort(key=lambda row: (row[0], row[1], row[2]))

    with open('type_analysis.csv', 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(rows)
    print("{} - finish".format(datetime.datetime.now()))
def main():
    """Run the script: call ``set_csv_field_size()`` and then ``analyse()``."""
    set_csv_field_size()
    # read_instance_types()  # switched off; it writes instance_types.json, which analyse() reads
    analyse()


if __name__ == "__main__":
    main()