main.py
"""
This modules takes as input the file generated by Stanford NER, and looks for the recognized entities.
For every entity, it looks on wikipedia for the article without doing any kind of preprocessing, query generation, or
any feature extraction.
If it finds a candidate article (any article), it assigns it to the entity.
If it doesn't, it classifies the entity as NIL.
"""
import datetime
import argparse
import logging
from os import listdir
from os.path import join
from sys import exit
import xml.etree.ElementTree as ET
import wptools
from model import Mention, Entry, LinkedMention
from ner import detect as apply_ner
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(ch)
class MentionDetector:
    IGNORED_ENTITY_TYPES = {"ORDINAL", "NUMBER", "DATE", "PERCENT", "MONEY", "DURATION", "CAUSE_OF_DEATH", "SET",
                            "TIME", "URL", "IDEOLOGY", "CRIMINAL_CHARGE",
                            "RELIGION",  # Check whether it gets tagged as ORG instead
                            "TITLE"}  # TITLE is a special case: with coreference resolution it should be used,
                                      # but for this baseline it doesn't make sense
def __init__(self, file_name):
self.file_name = file_name
self.doc_id = None
def get_mentions(self):
tree = ET.parse(self.file_name)
doc = tree.getroot()[0]
self.doc_id = doc.find("docId").text
result = []
previous_type = None
        for sentence in doc.find("sentences"):
            for token in sentence[0]:
                entity_type = token.find("NER").text
                if entity_type != 'O' and entity_type not in MentionDetector.IGNORED_ENTITY_TYPES:
                    if entity_type == previous_type:
                        # Consecutive token of the same entity type: extend the previous mention
                        head_string, end_offset = self.get_head_string_and_offset(token)
                        result[-1].add(head_string, end_offset)
                    else:
                        mention = self.create_mention(token)
                        result.append(mention)
                # Update on every token so that entities separated by 'O' or ignored tokens are not merged
                previous_type = entity_type
return result
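    # Illustrative example (the tokens are hypothetical, not from the source): for consecutive
    # tokens "Barack"/PERSON and "Obama"/PERSON, the first token creates a Mention via
    # create_mention() and the second extends it via Mention.add(), so get_mentions()
    # returns a single mention spanning both words.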
def create_mention(self, token):
head_string, end = self.get_head_string_and_offset(token)
begin = token.find("CharacterOffsetBegin").text
entity_type = token.find("NER").text
return Mention(head_string, self.doc_id, begin, end, entity_type)
def get_head_string_and_offset(self, token):
head_string = token.find("word").text
        end = str(int(token.find("CharacterOffsetEnd").text) - 1)  # Stanford NER end offsets are exclusive (one char past the last character)
return head_string, end
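    # Offset example (values are illustrative): a token "Obama" with CharacterOffsetBegin=10
    # and CharacterOffsetEnd=15 yields end="14", i.e. the inclusive offset of its last character.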
def link_mentions(mentions):
result = []
for mention in mentions:
try:
entry = Entry(wptools.page(mention.head_string.lower()).get_query())
except LookupError:
entry = Entry(None)
result.append(LinkedMention(mention, entry))
return result
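# Minimal sketch of the wptools lookup used in link_mentions (requires network access;
# "obama" is an illustrative query, not from the source, and the exact keys in page.data
# depend on the wptools version):
#   page = wptools.page("obama").get_query()   # fetches basic article data from Wikipedia
#   page.data.get("pageid")                    # populated when an article was found
# When no article exists, wptools raises LookupError, which link_mentions turns into Entry(None),
# i.e. the NIL case described in the module docstring.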
def get_run_id():
now = datetime.datetime.now()
return now.strftime("%Y%m%d-%H%M%S")
def export_linked_mentions(file_name, linked_mentions):
run_id = get_run_id()
with open(file_name + ".tab", "w+") as f:
for linked_mention in linked_mentions:
line = "{}\t{}\n".format(run_id, str(linked_mention))
logger.debug(line)
f.write(line)
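# Each exported line has the form "<run_id>\t<str(linked_mention)>"; the exact serialization
# of a linked mention is defined by LinkedMention.__str__ in model.py (not shown here).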
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Executes NEL Baseline.')
parser.add_argument('-r', '--raw', help='Input raw text file')
parser.add_argument('-x', '--xml', help='Input NER xml file')
parser.add_argument('-xd', '--xmldir', help='Input NER xml directory')
args = parser.parse_args()
if args.raw is None and args.xml is None and args.xmldir is None:
parser.print_usage()
exit(0)
ner_files = []
if args.raw is not None:
logger.info("Applying NER on file {}".format(args.raw))
ner_files.append(apply_ner(args.raw))
if args.xml is not None:
ner_files.append(args.xml)
if args.xmldir is not None:
ner_files.extend([join(args.xmldir, filename) for filename in listdir(args.xmldir)])
logger.info("Detecting mentions from XML Files")
mentions = []
for ner_file in ner_files:
md = MentionDetector(ner_file)
mentions.extend(md.get_mentions())
logger.info("Linking mentions to wikipedia articles")
linked_mentions = link_mentions(mentions)
logger.info("Exporting mentions to tab file")
export_linked_mentions("res-"+get_run_id(), linked_mentions)
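# Example invocations (a sketch; file and directory names are illustrative):
#   python main.py --raw article.txt        # run Stanford NER on raw text first, then link
#   python main.py --xml article.txt.xml    # reuse an existing NER XML file
#   python main.py --xmldir ner_output/     # process every XML file in a directory
# Results are written to res-<run_id>.tab in the current working directory.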