main.py
"""
This modules takes as input the file generated by Stanford NER, and looks for the recognized entities.
For every entity, it looks on wikipedia for the article without doing any kind of preprocessing, query generation, or
any feature extraction.
If it finds a candidate article (any article), it assigns it to the entity.
If it doesn't, it classifies the entity as NIL.
"""
import datetime
import argparse
import logging
from os import listdir
from os.path import join
from sys import exit
import xml.etree.ElementTree as ET
import wptools
from model import Mention, Entry, LinkedMention
from ner import detect as apply_ner
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(ch)
class MentionDetector:
    IGNORED_ENTITY_TYPES = {"ORDINAL", "NUMBER", "DATE", "PERCENT", "MONEY", "DURATION", "CAUSE_OF_DEATH", "SET",
                            "TIME", "URL", "IDEOLOGY", "CRIMINAL_CHARGE",
                            "RELIGION",  # Check whether it gets tagged as ORG instead
                            "TITLE"}  # TITLE is a special case: with coreference resolution it should be used,
                                      # but for this baseline it doesn't make sense
def __init__(self, file_name):
self.file_name = file_name
self.doc_id = None
def get_mentions(self):
tree = ET.parse(self.file_name)
doc = tree.getroot()[0]
self.doc_id = doc.find("docId").text
result = []
previous_type = None
        for sentence in doc.find("sentences"):
            for token in sentence[0]:
                entity_type = token.find("NER").text
                if entity_type != 'O' and entity_type not in MentionDetector.IGNORED_ENTITY_TYPES:
                    if entity_type == previous_type:
                        # Consecutive token of the same entity type: extend the previous mention
                        head_string, end_offset = self.get_head_string_and_offset(token)
                        result[-1].add(head_string, end_offset)
                    else:
                        mention = self.create_mention(token)
                        result.append(mention)
                # Update on every token so that entities separated by 'O' or ignored tokens are not merged
                previous_type = entity_type
return result
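    # Illustrative example (the tokens are hypothetical, not from the source): for consecutive
    # tokens "Barack"/PERSON and "Obama"/PERSON, the first token creates a Mention via
    # create_mention() and the second extends it via Mention.add(), so get_mentions()
    # returns a single mention spanning both words.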
def create_mention(self, token):
head_string, end = self.get_head_string_and_offset(token)
begin = token.find("CharacterOffsetBegin").text
entity_type = token.find("NER").text
return Mention(head_string, self.doc_id, begin, end, entity_type)
def get_head_string_and_offset(self, token):
head_string = token.find("word").text
        end = str(int(token.find("CharacterOffsetEnd").text) - 1)  # Stanford NER end offsets are exclusive (one char past the last character)
return head_string, end
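    # Offset example (values are illustrative): a token "Obama" with CharacterOffsetBegin=10
    # and CharacterOffsetEnd=15 yields end="14", i.e. the inclusive offset of its last character.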
def link_mentions(mentions):
result = []
for mention in mentions:
try:
entry = Entry(wptools.page(mention.head_string.lower()).get_query())
except LookupError:
entry = Entry(None)
result.append(LinkedMention(mention, entry))
return result
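# Minimal sketch of the wptools lookup used in link_mentions (requires network access;
# "obama" is an illustrative query, not from the source, and the exact keys in page.data
# depend on the wptools version):
#   page = wptools.page("obama").get_query()   # fetches basic article data from Wikipedia
#   page.data.get("pageid")                    # populated when an article was found
# When no article exists, wptools raises LookupError, which link_mentions turns into Entry(None),
# i.e. the NIL case described in the module docstring.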
def get_run_id():
now = datetime.datetime.now()
return now.strftime("%Y%m%d-%H%M%S")
def export_linked_mentions(file_name, linked_mentions):
run_id = get_run_id()
with open(file_name + ".tab", "w+") as f:
for linked_mention in linked_mentions:
line = "{}\t{}\n".format(run_id, str(linked_mention))
logger.debug(line)
f.write(line)
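# Each exported line has the form "<run_id>\t<str(linked_mention)>"; the exact serialization
# of a linked mention is defined by LinkedMention.__str__ in model.py (not shown here).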
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Executes NEL Baseline.')
parser.add_argument('-r', '--raw', help='Input raw text file')
parser.add_argument('-x', '--xml', help='Input NER xml file')
parser.add_argument('-xd', '--xmldir', help='Input NER xml directory')
args = parser.parse_args()
if args.raw is None and args.xml is None and args.xmldir is None:
parser.print_usage()
exit(0)
ner_files = []
if args.raw is not None:
logger.info("Applying NER on file {}".format(args.raw))
ner_files.append(apply_ner(args.raw))
if args.xml is not None:
ner_files.append(args.xml)
if args.xmldir is not None:
ner_files.extend([join(args.xmldir, filename) for filename in listdir(args.xmldir)])
logger.info("Detecting mentions from XML Files")
mentions = []
for ner_file in ner_files:
md = MentionDetector(ner_file)
mentions.extend(md.get_mentions())
logger.info("Linking mentions to wikipedia articles")
linked_mentions = link_mentions(mentions)
logger.info("Exporting mentions to tab file")
export_linked_mentions("res-"+get_run_id(), linked_mentions)
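# Example invocations (a sketch; file and directory names are illustrative):
#   python main.py --raw article.txt        # run Stanford NER on raw text first, then link
#   python main.py --xml article.txt.xml    # reuse an existing NER XML file
#   python main.py --xmldir ner_output/     # process every XML file in a directory
# Results are written to res-<run_id>.tab in the current working directory.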