diff --git a/.gitignore b/.gitignore index 388c25c..b4f7ae3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .idea/runConfigurations.xml - +.idea/markdown-navigator/ +.idea/markdown-navigator.xml .idea/dictionaries .idea/misc.xml .idea/*.iml diff --git a/.idea/inspectionProfiles/Default.xml b/.idea/inspectionProfiles/Default.xml new file mode 100644 index 0000000..8f7f28f --- /dev/null +++ b/.idea/inspectionProfiles/Default.xml @@ -0,0 +1,17 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..17b6b9a --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,9 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..06a401e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Resolver.py b/Resolver.py deleted file mode 100644 index 1e410a2..0000000 --- a/Resolver.py +++ /dev/null @@ -1,466 +0,0 @@ -# -*- coding: utf-8 -*- - -import logging -import os -import re -from collections import Counter - -from nltk.tokenize import sent_tokenize, word_tokenize -from pycorenlp import StanfordCoreNLP -from typing import List, Tuple, Dict, Union, Set - -from classifier import GenderClassifier -from references import NameReference, NominalReference, PronominalReference -from references import ResolvedPassage -from references import Substitution -from references.TypeLookup import TypeLookup - -logging.getLogger('requests').setLevel(logging.WARNING) - - -# noinspection PyPep8Naming -class Resolver(object): - _typeLookUp = TypeLookup( - os.path.normpath( - os.path.join( - os.path.abspath(__file__), - os.path.pardir, - 'resources', - 'instance.types.bz2' - ) - ) - ) - _separator = '---- <> ----' - _nlp = StanfordCoreNLP('http://corenlp:9000') - # _nlp = StanfordCoreNLP('http://localhost:9000') - - _genderClassifier = GenderClassifier() - - _personalPronouns = {'he': 'male', 'He': 'male', 'she': 'female', 'She': 'female', 'it': 'neutral', - 'It': 'neutral', 'they': 'neutral/plural', 'They': 'neutral/plural'} - _possessivePronouns = {'his': 'male', 'him': 'male', 'His': 'male', 'her': 'female', 'Her': 'female'} - - _reflexivePronouns = {'himself': 'male', 'herself': 'female'} - - @staticmethod - def resolve(text: str, entityURI: str) -> List[Substitution]: - - substitutions = [] - - sentences = Resolver._cleanText(sent_tokenize(text)) - - if len(sentences) < 1: - return [] - - types = Resolver._typeLookUp.getType(entityURI).split(' ') - - nameReferences, pronominalReferences, nominalReferences = Resolver._collectReferencesAndCoreferences( - text=sentences, - types=types) - - firstNameMention = Resolver._getFirstNameMention( - nameReferences=nameReferences, - predominantGender=Resolver._getPredominantGender(pronominalReferences), - text=sentences) - - Resolver._resolveNameCorefs( - firstNameMention=firstNameMention, - nameReferences=nameReferences, - substitutions=substitutions) - - Resolver._resolvePronominalCorefs( - firstNameMention=firstNameMention, - nameReferences=nameReferences, - pronominalReferences=pronominalReferences, - substitutions=substitutions) - - Resolver._resolveNominalCorefs( - firstNameMention=firstNameMention, - nominalReferences=nominalReferences, - substitutions=substitutions) - - return substitutions - - @staticmethod - def _collectReferencesAndCoreferences( - text: List[str], - types: List[str] - ) -> Tuple[List[NameReference], List[PronominalReference], List[NominalReference]]: - - nameReferences, pronominalReferences, nominalReferences = Resolver._collectReferences(text, types) - return nameReferences, pronominalReferences, nominalReferences - - @staticmethod - def substituteInText( - text: str, - substitutions: List[Substitution] - ) -> Tuple[str, Dict[int, str]]: - - out_text = [] - resolvedSentences = {} - - for index, sentence in enumerate(sent_tokenize(text)): - sentence = ' ' + sentence.replace('—', ' ').replace('\'s', ' \'s') - for substitution in substitutions: - if index == substitution.sentenceIndex: - sentence = Resolver._substituteCoref(substitution.originalTerm, - substitution.referenceTerm, - sentence) - resolvedSentences[index] = sentence - - out_text.append(sentence.strip()) - - return "\n".join(out_text), resolvedSentences - - @staticmethod - def _resolveNameCorefs( - firstNameMention: NameReference, - nameReferences: List[NameReference], - substitutions: List[Substitution]) -> None: - firstMention = firstNameMention.term if firstNameMention is not None else [] - for ne in nameReferences: - - if ne.term in firstMention: - reference = firstMention - else: - reference = Resolver._getLongestPrecedentStringWithSubstring(nameReferences, ne.term, ne.sentence) - - ne.resolvedTerm = reference - - substitutions.append(Substitution(ne.sentence, ne.term, reference)) - - @staticmethod - def _resolvePronominalCorefs( - firstNameMention: NameReference, - nameReferences: List[NameReference], - pronominalReferences: List[PronominalReference], - substitutions: List[Substitution]) -> None: - - for pronoun in pronominalReferences: - if pronoun.gender == firstNameMention.gender: - nameTerm = firstNameMention.term - else: - nameTerm = Resolver._getClosestMatchingNameMention(pronoun, nameReferences) - - if nameTerm is None: - nameTerm = pronoun.pronoun - - substitutions.append(Substitution(pronoun.sentence, pronoun.pronoun, nameTerm)) - - @staticmethod - def _resolveNominalCorefs( - firstNameMention: NameReference, - nominalReferences: List[NominalReference], - substitutions) -> None: - - for nominalReference in nominalReferences: - substitutions.append(Substitution(nominalReference.sentence, nominalReference.term, firstNameMention.term)) - - @staticmethod - def _cleanText(text: List[str]): - cleanText = [] - for sentence in text: - sentence = re.sub(r'\[\d+\]', '', sentence) - sentence = sentence.strip() - if sentence.endswith('.') or sentence.endswith('.\n'): - if sentence != '': - cleanText.append(sentence) - - return cleanText - - @staticmethod - def _substituteCoref( - originalTerm: str, - referenceTerm: str, - sentence: str) -> str: - - originalTerm = ' ' + originalTerm + ' ' - referenceTerm = ' ' + referenceTerm + ' ' - - # print(originalTerm + ' ----> ' + referenceTerm) - - sentence = re.sub(r'([.,;])', r' \1', sentence) - - if Resolver._getPronounType(originalTerm) != '': - - if referenceTerm == '': - referenceTerm = originalTerm - sentence = sentence.replace(originalTerm, referenceTerm) - else: - if Resolver._getPronounType(originalTerm) == 'personal': - sentence = sentence.replace(originalTerm, referenceTerm) - elif Resolver._getPronounType(originalTerm) == 'possessive': - sentence = sentence.replace(originalTerm, referenceTerm + "'s ") - else: - referenceTerm = originalTerm - sentence = sentence.replace(originalTerm, referenceTerm) - else: - sentence = sentence.replace(originalTerm, referenceTerm) - - sentence = re.sub(r'\s+([.,;])', r'\1', sentence) - - return sentence - - @staticmethod - def _getClosestMatchingNameMention(pronominalReference: PronominalReference, - nameReferences: List[NameReference] - ) -> Union[None, str]: - for sentenceIndex in range(pronominalReference.sentence - 1, 0, -1): - name = Resolver._getNameAtSentence(nameReferences, sentenceIndex, pronominalReference.gender, - pronominalReference.entityType) - - if name is not None: - return name.resolvedTerm - - return None - - @staticmethod - def _getNameAtSentence(nameReferences: List[NameReference], sentenceIndex: int, gender: str, entityType: str): - for name in nameReferences: - if (sentenceIndex == name.sentence) \ - and (name.gender == gender) \ - and (name.type == entityType): - return name - - return None - - @staticmethod - def _getFirstNameMention(nameReferences: List[NameReference], predominantGender: str, text: List[str] - ) -> NameReference: - for nameReference in nameReferences: - if nameReference.sentence == 0 and nameReference.position < 10: - if predominantGender in ['male', 'female']: - return NameReference(nameReference.term, 'PERSON', predominantGender, 'singular', 0, 0) - else: - return NameReference(nameReference.term, 'OTHER', 'neutral', 'singular', 0, 0) - - for sentence in text: - sentence = Resolver._cleanSentence(sentence) - if sentence != '': - entity = '' - for token in word_tokenize(sentence): - if len(token) > 0 and token[0].isupper(): - entity += token + ' ' - else: - return NameReference(entity.strip(), 'OTHER', 'neutral', 'singular', 0, 0) - - @staticmethod - def _getPredominantGender(pronominalReferences: List[PronominalReference]) -> str: - cnt = Counter( - p.gender.lower() for p in pronominalReferences - ).most_common(n=1) - - if len(cnt) == 1: - return cnt[0][0] - - return 'neutral' # TODO default value? - - @staticmethod - def _cleanSentence( - sentence: str) -> str: - sentence = sentence.replace('—', ' ').replace('–', ' ').replace("'s", " 's").replace('ʻ', '') - return ''.join([token if ord(token) < 128 else ' ' for token in sentence]) - - @staticmethod - def _collectReferences(sentences: List[str], types: List[str] - ) -> Tuple[List[NameReference], List[PronominalReference], List[NominalReference]]: - names, pronouns, nominals = [], [], [] - - for i, sentence in enumerate(sentences): - cleaned = Resolver._cleanSentence(sentence) - annotated = Resolver._nlp.annotate(cleaned, properties={ - 'annotators': 'tokenize,ssplit,ner', - 'outputFormat': 'json' - }) - - names.extend(list(Resolver._getNER(annotated['sentences'][0]['tokens'], i))) - - if Resolver._hasPronoun(cleaned): - pronouns.append(Resolver._getPronoun(cleaned, i)) - - nominals.extend( - list(Resolver._getNominalReferences(annotated['sentences'][0]['tokens'], i, types)) - ) - - return names, pronouns, nominals - - @staticmethod - def _getNER(parsedString: List[Dict[str, str]], sentenceIndex: int) -> Set[NameReference]: - nes = set() - nerTerms = '' - nerType = [] - nerPos = 0 - previousNER = 'O' - for token in parsedString: - if token['ner'] is not 'O': - nerTerms += token['originalText'] + ' ' - nerType.append(token['ner']) - if previousNER is not 'O': - nerPos = token['index'] - else: - if Resolver._hasValidNERType(nerType): - if nerTerms is not '': - ne = NameReference(nerTerms.strip(), - nerType[0], - Resolver._getNameGender(nerTerms), - Resolver._getNameNumber(nerTerms), - sentenceIndex, nerPos) - - nes.add(ne) - nerTerms = '' - nerType = [] - nerPos = 0 - previousNER = token['ner'] - - return nes - - @staticmethod - def _getNominalReferences(parsedString: List[Dict[str, str]], sentenceIndex: int, types: List[str]): - nominalRefs = set() - previousDET = 0 - term = '' - for token in parsedString: - if token['originalText'].lower() == 'the': - previousDET = 1 - term = token['originalText'] + ' ' - - if previousDET and 'NN' == token['pos']: - if token['originalText'].lower() in types: - term += token['originalText'] - nr = NominalReference(term.strip(), sentenceIndex) - nominalRefs.add(nr) - - return nominalRefs - - @staticmethod - def _hasPronoun(sentence: str): - return any( - token in Resolver._personalPronouns - or token in Resolver._possessivePronouns - or token in Resolver._reflexivePronouns - for token in word_tokenize(sentence)) - - @staticmethod - def _getPronoun(sentence: str, sentenceIndex: int) -> PronominalReference: - for token in word_tokenize(sentence): - - if token in Resolver._personalPronouns: - number = 'singular' - if 'plural' in Resolver._personalPronouns[token]: - number = 'plural' - return PronominalReference(token, - Resolver._personalPronouns[token], - number, - Resolver._getPronounType(token), - Resolver._getEntityType(token), - sentenceIndex, - sentence.index(token)) - - if token in Resolver._possessivePronouns: - number = 'singular' - if 'plural' in Resolver._possessivePronouns[token]: - number = 'plural' - return PronominalReference(token, - Resolver._possessivePronouns[token], - number, - Resolver._getPronounType(token), - Resolver._getEntityType(token), - sentenceIndex, - sentence.index(token)) - - if token in Resolver._reflexivePronouns: - number = 'singular' - if 'plural' in Resolver._reflexivePronouns[token]: - number = 'plural' - return PronominalReference(token, - Resolver._reflexivePronouns[token], - number, - Resolver._getPronounType(token), - Resolver._getEntityType(token), - sentenceIndex, - sentence.index(token)) - - return PronominalReference() - - @staticmethod - def _getPronounType(token: str) -> str: - - if token in Resolver._personalPronouns: - return 'personal' - if token in Resolver._possessivePronouns: - return 'possessive' - if token in Resolver._reflexivePronouns: - return 'reflexive' - - return '' - - @staticmethod - def _getEntityType(token: str) -> str: - if token.lower() in ['he', 'she', 'his', 'her', 'him', 'hers', 'they', 'their', 'theirs']: - return 'PERSON' - - return 'THING' - - @staticmethod - def _getLongestPrecedentStringWithSubstring( - nameReferences: List[NameReference], - substring: str, - sentenceIndex: int) -> str: - - output = substring - for nameReference in nameReferences: - terms = nameReference.term.split(' ') - if output in terms: - if nameReference.sentence < sentenceIndex: - if len(terms) > len(output.split(' ')): - output = nameReference.term - - return output - - @staticmethod - def _getNameGender( - name: str) -> str: - return Resolver._genderClassifier.classify(name.split(' ')[0]) - - # noinspection PyUnusedLocal - @staticmethod - def _getNameNumber(name: str) -> str: - return '' - - @staticmethod - def _hasValidNERType(terms: List[str]) -> bool: - return all(x not in terms for x in ['DATE', 'NUMBER', 'SET', 'MONEY', 'PERCENT', 'DURATION', 'MISC', 'ORDINAL']) - - @staticmethod - def getPassagesAndLinkedEntities( - substitutions: List[Substitution], - resolvedSentences: Dict[int, str], - entityLinks: Dict[str, str]) -> List[ResolvedPassage]: - linkedPassages = [] - - for index, sentence in resolvedSentences.items(): - linkedEntities = {} - for substitution in substitutions: - if substitution.sentenceIndex == index: - for entityLabel, entityLink in entityLinks.items(): - if substitution.referenceTerm.lower() == entityLabel.lower(): - linkedEntities[substitution.referenceTerm] = entityLink - if substitution.referenceTerm not in linkedEntities: - linkedEntities[substitution.referenceTerm] = '' - - if 'p.' not in sentence and '(' not in sentence and len(sentence) >= 25: - linkedPassages.append( - ResolvedPassage(index, linkedEntities) - ) - - return linkedPassages - - @staticmethod - def getEntityLinks(articleId: str, links: List[Dict[str, str]]) -> Dict[str, str]: - entityLinks = {} - label = articleId[articleId.rfind('/') + 1:].replace('_', ' ') - entityLinks[label] = articleId - for link in links: - entityLinks[link['anchorText']] = link['link'] - - return entityLinks diff --git a/examples/expected.txt b/examples/expected.txt new file mode 100644 index 0000000..a19d660 --- /dev/null +++ b/examples/expected.txt @@ -0,0 +1,25 @@ +Barack Hussein Obama II is an American politician who served as the 44th President of the United States from 2009 to 2017. +Barack Hussein Obama II is the first African American to have served as president, as well as the first born outside the contiguous United States. +Barack Hussein Obama II previously served in the U.S. Senate representing Illinois from 2005 to 2008, and in the Illinois State Senate from 1997 to 2004. +Barack Hussein Obama II was born in Honolulu, Hawaii, two years after the territory was admitted to the Union as the 50th state. +Barack Hussein Obama II grew up mostly in Hawaii, but also spent one year of Barack Hussein Obama II 's childhood in Washington State and four years in Indonesia. +After graduating from Columbia University in 1983, Barack Hussein Obama II worked as a community organizer in Chicago. +In 1988 Obama enrolled in Harvard Law School, where Barack Hussein Obama II was the first black president of the Harvard Law Review. +After graduation, Barack Hussein Obama II became a civil rights attorney and professor, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. +Barack Hussein Obama II represented the 13th District for three terms in the Illinois Senate from 1997 to 2004, when Barack Hussein Obama II ran for the U.S. Senate. +Barack Hussein Obama II received national attention in 2004, with Barack Hussein Obama II 's unexpected March primary win, Barack Hussein Obama II 's well-received July Democratic National Convention keynote address, and Barack Hussein Obama II 's landslide November election to the U.S. Senate. +In 2008, Barack Hussein Obama II was nominated for president, a year after Barack Hussein Obama II 's campaign began, and after a close primary campaign against Hillary Clinton. +Barack Hussein Obama II was elected over Republican John McCain, and was inaugurated on January 20, 2009. +Nine months later, Barack Hussein Obama II was named the 2009 Nobel Peace Prize laureate. +During Barack Hussein Obama II 's first two years in office, Barack Hussein Obama II signed more landmark legislation than any Democratic president since LBJ 's Great Society. +Main reforms were the Patient Protection and Affordable Care Act (often referred to as "Obamacare"), the Dodd–Frank Wall Street Reform and Consumer Protection Act, and the Don't Ask, Don't Tell Repeal Act of 2010. +The American Recovery and Reinvestment Act of 2009 and Tax Relief, Unemployment Insurance Reauthorization, and Job Creation Act of 2010 served as economic stimulus amidst the Great Recession, but the GOP regained control of the House of Representatives in 2011. +After a lengthy debate over the national debt limit, Barack Hussein Obama II signed the Budget Control and the American Taxpayer Relief Acts. +In foreign policy, Barack Hussein Obama II increased U.S. troop levels in Afghanistan, reduced nuclear weapons with the U.S.-Russian New START treaty, and ended military involvement in the Iraq War. +Barack Hussein Obama II ordered military involvement in Libya in opposition to Muammar Gaddafi, and the military operation that resulted in the death of Osama bin Laden. +After winning re-election over Mitt Romney, Barack Hussein Obama II was sworn in for a second term in 2013. +During Barack Hussein Obama II 's second term, Barack Hussein Obama II promoted greater inclusiveness for LGBT Americans, with Barack Hussein Obama II 's administration filing briefs that urged the Supreme Court to strike down same-sex marriage bans as unconstitutional (United States v. Windsor and Obergefell v. Hodges). +Barack Hussein Obama II also advocated gun control in response to the Sandy Hook Elementary School shooting, and issued wide-ranging executive actions concerning climate change and immigration. +In foreign policy, Barack Hussein Obama II ordered military intervention in Iraq in response to gains made by ISIL after the 2011 withdrawal from Iraq, continued the process of ending U.S. combat operations in Afghanistan, promoted discussions that led to the 2015 Paris Agreement on global climate change, initiated the sanctions against Russia following the invasion in Ukraine, brokered a nuclear deal with Iran, and normalized U.S. relations with Cuba. +Barack Hussein Obama II left office in January 2017 with a 60% approval rating. +Barack Hussein Obama II 's presidential library will be built in Chicago. \ No newline at end of file diff --git a/examples/original.txt b/examples/original.txt new file mode 100644 index 0000000..1338266 --- /dev/null +++ b/examples/original.txt @@ -0,0 +1,25 @@ +Barack Hussein Obama II is an American politician who served as the 44th President of the United States from 2009 to 2017. +He is the first African American to have served as president, as well as the first born outside the contiguous United States. +He previously served in the U.S. Senate representing Illinois from 2005 to 2008, and in the Illinois State Senate from 1997 to 2004. +Obama was born in Honolulu, Hawaii, two years after the territory was admitted to the Union as the 50th state. +He grew up mostly in Hawaii, but also spent one year of his childhood in Washington State and four years in Indonesia. +After graduating from Columbia University in 1983, he worked as a community organizer in Chicago. +In 1988 Obama enrolled in Harvard Law School, where he was the first black president of the Harvard Law Review. +After graduation, he became a civil rights attorney and professor, teaching constitutional law at the University of Chicago Law School from 1992 to 2004. +Obama represented the 13th District for three terms in the Illinois Senate from 1997 to 2004, when he ran for the U.S. Senate. +Obama received national attention in 2004, with his unexpected March primary win, his well-received July Democratic National Convention keynote address, and his landslide November election to the Senate. +In 2008, Obama was nominated for president, a year after his campaign began, and after a close primary campaign against Hillary Clinton. +He was elected over Republican John McCain, and was inaugurated on January 20, 2009. +Nine months later, Obama was named the 2009 Nobel Peace Prize laureate. +During his first two years in office, Obama signed more landmark legislation than any Democratic president since LBJ's Great Society. +Main reforms were the Patient Protection and Affordable Care Act (often referred to as "Obamacare"), the Dodd–Frank Wall Street Reform and Consumer Protection Act, and the Don't Ask, Don't Tell Repeal Act of 2010. +The American Recovery and Reinvestment Act of 2009 and Tax Relief, Unemployment Insurance Reauthorization, and Job Creation Act of 2010 served as economic stimulus amidst the Great Recession, but the GOP regained control of the House of Representatives in 2011. +After a lengthy debate over the national debt limit, Obama signed the Budget Control and the American Taxpayer Relief Acts. +In foreign policy, Obama increased U.S. troop levels in Afghanistan, reduced nuclear weapons with the U.S.-Russian New START treaty, and ended military involvement in the Iraq War. +He ordered military involvement in Libya in opposition to Muammar Gaddafi, and the military operation that resulted in the death of Osama bin Laden. +After winning re-election over Mitt Romney, Obama was sworn in for a second term in 2013. +During his second term, Obama promoted greater inclusiveness for LGBT Americans, with his administration filing briefs that urged the Supreme Court to strike down same-sex marriage bans as unconstitutional (United States v. Windsor and Obergefell v. Hodges). +Obama also advocated gun control in response to the Sandy Hook Elementary School shooting, and issued wide-ranging executive actions concerning climate change and immigration. +In foreign policy, Obama ordered military intervention in Iraq in response to gains made by ISIL after the 2011 withdrawal from Iraq, continued the process of ending U.S. combat operations in Afghanistan, promoted discussions that led to the 2015 Paris Agreement on global climate change, initiated the sanctions against Russia following the invasion in Ukraine, brokered a nuclear deal with Iran, and normalized U.S. relations with Cuba. +Obama left office in January 2017 with a 60% approval rating. +His presidential library will be built in Chicago. \ No newline at end of file diff --git a/references/NameReference.py b/references/NameReference.py deleted file mode 100644 index 3651773..0000000 --- a/references/NameReference.py +++ /dev/null @@ -1,30 +0,0 @@ -class NameReference(object): - def __init__(self, term: str, type: str, gender: str, number: str, sentence: int, position: int): - self.term = term - self.type = type - if type == 'PERSON': - self.gender = gender - else: - self.gender = 'neutral' - self.number = number - self.sentence = sentence - self.position = position - self.resolvedTerm = term - - def getSentenceIndex(self): - return self.sentence - - def __str__(self): - return "{self.term} {self.type} {self.gender} {self.number} {self.sentence} {self.position}\n".format(self=self) - - def __repr__(self): - return self.__str__() - - def __eq__(self, other): - return (self.term == other.term) \ - and (self.type == other.type) \ - and (self.sentence == other.sentence) \ - and (self.position == other.position) - - def __hash__(self): - return hash(self.term + self.type + str(self.sentence)) diff --git a/references/NominalReference.py b/references/NominalReference.py deleted file mode 100644 index c60b702..0000000 --- a/references/NominalReference.py +++ /dev/null @@ -1,19 +0,0 @@ -class NominalReference(object): - def __init__(self, term: str, sentence: int): - self.term = term - self.sentence = sentence - - def getSentenceIndex(self): - return self.sentence - - def __str__(self): - return "{self.term} {self.sentence}".format(self=self) - - def __repr__(self): - return self.__str__() - - def __eq__(self, other): - return (self.term == other.term) and (self.sentence == other.sentence) - - def __hash__(self): - return hash(self.term + str(self.sentence)) diff --git a/references/PronominalReference.py b/references/PronominalReference.py deleted file mode 100644 index 685eff9..0000000 --- a/references/PronominalReference.py +++ /dev/null @@ -1,28 +0,0 @@ -# -*- coding: utf-8 -*- - - -# noinspection PyPep8Naming -class PronominalReference(object): - def __init__(self, pronoun: str = '', gender: str = '', number: str = '', pronounType: str = '', - entityType: str = '', sentence: int = -1, position: int = -1): - self.pronoun = pronoun - self.gender = gender - self.number = number - self.pronounType = pronounType - self.entityType = entityType - self.sentence = sentence - self.position = position - - def __str__(self): - return "{self.pronoun} {self.gender} {self.number} {self.sentence} {self.position}".format(self=self) - - def __repr__(self): - return self.__str__() - - def __eq__(self, other): - return (self.pronoun == other.term) \ - and (self.sentence == other.sentence) \ - and (self.position == other.position) - - def __hash__(self): - return hash(self.pronoun + ' ' + str(self.sentence) + ' ' + str(self.position)) diff --git a/references/Substitution.py b/references/Substitution.py deleted file mode 100644 index 2ff5c55..0000000 --- a/references/Substitution.py +++ /dev/null @@ -1,12 +0,0 @@ -# -*- coding: utf-8 -*- - - -# noinspection PyPep8Naming -class Substitution(object): - def __init__(self, sentenceIndex: int, originalTerm: str, referenceTerm: str): - self.sentenceIndex = sentenceIndex - self.originalTerm = originalTerm - self.referenceTerm = referenceTerm - - def __str__(self): - return "{self.sentenceIndex} '{self.originalTerm}' -> '{self.referenceTerm}'".format(self=self) diff --git a/run.py b/run.py index a77070f..99b2008 100644 --- a/run.py +++ b/run.py @@ -56,13 +56,13 @@ def _run(article: str) -> str: try: substitutions = Resolver.resolve(json_article['text'], json_article['id']) - substituted_text, resolved_sentences = Resolver.substituteInText(json_article['text'], substitutions) + substituted_text, resolved_sentences = Resolver.substitute_in_text(json_article['text'], substitutions) - entity_links = Resolver.getEntityLinks(json_article['id'], json_article['links']) + entity_links = Resolver.get_entity_links(json_article['id'], json_article['links']) - passages_and_linked_entities = Resolver.getPassagesAndLinkedEntities(substitutions, - resolved_sentences, - entity_links) + passages_and_linked_entities = Resolver.get_passages_and_linked_entities(substitutions, + resolved_sentences, + entity_links) for p in passages_and_linked_entities: diff --git a/service.py b/service.py index 877ccc3..9558825 100644 --- a/service.py +++ b/service.py @@ -26,7 +26,7 @@ def resolve_wiki(): return make_response(jsonify({'message': '`text` and `uri` fields are mandatory.'}), 400) substitutions = Resolver.resolve(text, uri) - substituted, _ = Resolver.substituteInText(text, substitutions) + substituted = Resolver.substitute_in_text(text, substitutions) return jsonify({'text': substituted}) @@ -42,7 +42,7 @@ def resolve_text(): return make_response(jsonify({'message': '`text` field is mandatory.'}), 400) substitutions = Resolver.resolve(text, '') - substituted, _ = Resolver.substituteInText(text, substitutions) + substituted = Resolver.substitute_in_text(text, substitutions) return jsonify({'text': substituted}) diff --git a/src/Resolver.py b/src/Resolver.py new file mode 100644 index 0000000..8f604b4 --- /dev/null +++ b/src/Resolver.py @@ -0,0 +1,499 @@ +# -*- coding: utf-8 -*- +import logging +import os +import re +from collections import Counter + +from nltk.tokenize import sent_tokenize, word_tokenize +from pycorenlp import StanfordCoreNLP +from typing import List, Tuple, Dict, Union, Set + +from classifier import GenderClassifier +from constant_types import PronounType, QuantityType, GenderType, EntityType +from references import NameReference, NominalReference, PronominalReference +from references import ResolvedPassage +from references import Substitution +from references.TypeLookup import TypeLookup + +logging.getLogger('requests').setLevel(logging.WARNING) + + +# noinspection PyPep8Naming +class Resolver(object): + _type_lookup = TypeLookup( + os.path.normpath( + os.path.join( + os.path.abspath(__file__), + os.path.pardir, + 'resources', + 'instance.types.bz2' + ) + ) + ) + _separator = '---- <> ----' + _nlp = StanfordCoreNLP('http://localhost:9000') + # _nlp = StanfordCoreNLP('http://localhost:9000') + + _gender_classifier = GenderClassifier() + + _personal_pronouns = { + 'he': { + 'gender': GenderType.MALE, + 'quantity': QuantityType.SINGULAR + }, + 'she': { + 'gender': GenderType.FEMALE, + 'quantity': QuantityType.SINGULAR + }, + 'it': { + 'gender': GenderType.NEUTRAL, + 'quantity': QuantityType.SINGULAR + }, + 'they': { + 'gender': GenderType.NEUTRAL, + 'quantity': QuantityType.PLURAL + } + } + + _possessive_pronouns = { + 'his': { + 'gender': GenderType.MALE, + 'quantity': QuantityType.SINGULAR + }, + 'him': { + 'gender': GenderType.MALE, + 'quantity': QuantityType.SINGULAR + }, + 'her': { + 'gender': GenderType.FEMALE, + 'quantity': QuantityType.SINGULAR + } + } + + _reflexive_pronouns = { + 'himself': { + 'gender': GenderType.MALE, + 'quantity': QuantityType.SINGULAR + }, + 'herself': { + 'gender': GenderType.FEMALE, + 'quantity': QuantityType.SINGULAR + } + } + + @staticmethod + def resolve(text: str, entity_uri: str) -> List[Substitution]: + + substitutions = [] + + sentences = Resolver._clean_text(sent_tokenize(text)) + + if len(sentences) < 1: + return [] + + types = Resolver._type_lookup.type(entity_uri).split(' ') + + name_references, pronominal_references, nominal_references = Resolver._collect_references_and_coreferences( + text=sentences, + types=types) + + first_name_mention = Resolver._first_name_mention( + name_references=name_references, + predominant_gender=Resolver._predominant_gender(pronominal_references), + text=sentences) + + Resolver._resolve_name_coreferences( + first_name_mention=first_name_mention, + name_references=name_references, + substitutions=substitutions) + + Resolver._resolve_pronominal_coreferences( + first_name_mention=first_name_mention, + name_references=name_references, + pronominal_references=pronominal_references, + substitutions=substitutions) + + Resolver._resolve_nominal_coreferences( + first_name_mention=first_name_mention, + nominal_references=nominal_references, + substitutions=substitutions) + + return substitutions + + @staticmethod + def _collect_references_and_coreferences( + text: List[str], + types: List[str] + ) -> Tuple[List[NameReference], List[PronominalReference], List[NominalReference]]: + + name_references, pronominal_references, nominal_references = Resolver._collect_references(text, types) + return name_references, pronominal_references, nominal_references + + @staticmethod + def substitute_in_text( + text: str, + substitutions: List[Substitution] + ) -> str: + + out_text = [] + + for index, sentence in enumerate(sent_tokenize(text)): + sentence = ' ' + sentence.replace('—', ' ').replace('\'s', ' \'s') + for substitution in substitutions: + if index == substitution.sentence_index: + sentence = Resolver._substitute_coreference(substitution.original, + substitution.reference, + sentence) + out_text.append(sentence.strip()) + + return "\n".join(out_text) + + @staticmethod + def _resolve_name_coreferences( + first_name_mention: NameReference, + name_references: List[NameReference], + substitutions: List[Substitution]) -> None: + first_name = first_name_mention.term if first_name_mention is not None else "" + for ne in name_references: + + if ne.term in first_name: + reference = first_name + else: + reference = Resolver._longest_precedent_string_with_substring( + name_references, ne.term, ne.sentence_index) + + ne.resolved_term = reference + + substitutions.append(Substitution(ne.sentence_index, ne.term, reference)) + + @staticmethod + def _resolve_pronominal_coreferences( + first_name_mention: NameReference, + name_references: List[NameReference], + pronominal_references: List[PronominalReference], + substitutions: List[Substitution]) -> None: + + for pronoun in pronominal_references: + if pronoun.gender == first_name_mention.gender: + name_term = first_name_mention.term + else: + name_term = Resolver._closest_matching_name_mention(pronoun, name_references) + + if name_term is None: + name_term = pronoun.pronoun + + substitutions.append(Substitution(pronoun.sentence_index, pronoun.pronoun, name_term)) + + @staticmethod + def _resolve_nominal_coreferences( + first_name_mention: NameReference, + nominal_references: List[NominalReference], + substitutions) -> None: + + for nominal_reference in nominal_references: + substitutions.append( + Substitution(nominal_reference.sentence_index, nominal_reference.term, first_name_mention.term)) + + @staticmethod + def _clean_text(text: List[str]): + cleanText = [] + for sentence in text: + sentence = re.sub(r'\[\d+\]', '', sentence) + sentence = sentence.strip() + if sentence.endswith('.') or sentence.endswith('.\n'): + if sentence != '': + cleanText.append(sentence) + + return cleanText + + @staticmethod + def _substitute_coreference( + original_term: str, + reference_term: str, + sentence: str) -> str: + + original_term = ' ' + original_term + ' ' + reference_term = ' ' + reference_term + ' ' + + # print(originalTerm + ' ----> ' + referenceTerm) + + sentence = re.sub(r'([.,;])', r' \1', sentence) + + if Resolver._pronoun_type(original_term) != PronounType.NONE: + + if reference_term == '': + reference_term = original_term + sentence = sentence.replace(original_term, reference_term) + else: + if Resolver._pronoun_type(original_term) == PronounType.PERSONAL: + sentence = sentence.replace(original_term, reference_term) + elif Resolver._pronoun_type(original_term) == PronounType.POSSESSIVE: + sentence = sentence.replace(original_term, reference_term + "'s ") + else: + reference_term = original_term + sentence = sentence.replace(original_term, reference_term) + else: + sentence = sentence.replace(original_term, reference_term) + + sentence = re.sub(r'\s+([.,;])', r'\1', sentence) + + return sentence + + @staticmethod + def _closest_matching_name_mention(pronominal_reference: PronominalReference, + name_references: List[NameReference] + ) -> Union[None, str]: + for sentence_index in range(pronominal_reference.sentence_index - 1, 0, -1): + name = Resolver._name_at_sentence(name_references, sentence_index, pronominal_reference.gender, + pronominal_reference.entity_type) + + if name is not None: + return name.resolved_term + + return None + + @staticmethod + def _name_at_sentence(name_references: List[NameReference], sentence_index: int, gender: GenderType, + entity_type: EntityType): + for name in name_references: + if (sentence_index == name.sentence_index) \ + and (name.gender == gender) \ + and (name.entity_type == entity_type): + return name + + return None + + @staticmethod + def _first_name_mention(name_references: List[NameReference], predominant_gender: GenderType, text: List[str] + ) -> NameReference: + for name_reference in name_references: + if name_reference.sentence_index == 0 and name_reference.word_position < 10: + if predominant_gender in [GenderType.MALE, GenderType.FEMALE]: + return NameReference(name_reference.term, EntityType.PERSON, predominant_gender, + QuantityType.SINGULAR, 0, 0) + else: + return NameReference(name_reference.term, EntityType.OTHER, GenderType.NEUTRAL, + QuantityType.SINGULAR, 0, 0) + + for sentence in text: + sentence = Resolver._clean_sentence(sentence) + if sentence != '': + entity = '' + for token in word_tokenize(sentence): + if len(token) > 0 and token[0].isupper(): + entity += token + ' ' + else: + return NameReference(entity.strip(), EntityType.OTHER, GenderType.NEUTRAL, + QuantityType.SINGULAR, 0, 0) + + @staticmethod + def _predominant_gender(pronominal_references: List[PronominalReference]) -> GenderType: + cnt = Counter( + p.gender for p in pronominal_references + ).most_common(n=1) + + if len(cnt) == 1: + return cnt[0][0] + + return GenderType.NEUTRAL # TODO default value? + + @staticmethod + def _clean_sentence( + sentence: str) -> str: + sentence = sentence.replace('—', ' ').replace('–', ' ').replace("'s", " 's").replace('ʻ', '') + return ''.join([token if ord(token) < 128 else ' ' for token in sentence]) + + @staticmethod + def _collect_references(sentences: List[str], types: List[str] + ) -> Tuple[List[NameReference], List[PronominalReference], List[NominalReference]]: + names, pronouns, nominals = [], [], [] + + for i, sentence in enumerate(sentences): + cleaned = Resolver._clean_sentence(sentence) + annotated = Resolver._nlp.annotate(cleaned, properties={ + 'annotators': 'tokenize,ssplit,ner', + 'outputFormat': 'json' + }) + + names.extend(list(Resolver._getNER(annotated['sentences'][0]['tokens'], i))) + + if Resolver._has_pronoun(cleaned): + pronouns.extend(Resolver._pronominal_references(cleaned, i)) + + nominals.extend( + list(Resolver._nominal_references(annotated['sentences'][0]['tokens'], i, types)) + ) + + return names, pronouns, nominals + + @staticmethod + def _getNER(parsedString: List[Dict[str, str]], sentence_index: int) -> Set[NameReference]: + nes = set() + nerTerms = '' + nerType = [] + nerPos = 0 + previousNER = 'O' + for token in parsedString: + if token['ner'] is not 'O': + nerTerms += token['originalText'] + ' ' + nerType.append(token['ner']) + if previousNER is not 'O': + nerPos = token['index'] + else: + if len(nerType) > 0 and Resolver._valid_NER_type(nerType): + if nerTerms is not '': + ne = NameReference(term=nerTerms.strip(), + entity_type=nerType[0], + gender=Resolver._gender(nerTerms), + quantity=Resolver._quantity(nerTerms), + sentence_index=sentence_index, + word_position=nerPos) + + nes.add(ne) + nerTerms = '' + nerType = [] + nerPos = 0 + else: + nerTerms = '' + nerType = [] + nerPos = 0 + previousNER = token['ner'] + + return nes + + @staticmethod + def _nominal_references(annotated_tokens: List[Dict[str, str]], sentence_index: int, types: List[str]): + nominal_references = set() + previous_det = False + term = '' + for annotated_token in annotated_tokens: + if annotated_token['originalText'].lower() == 'the': + previous_det = True + term = annotated_token['originalText'] + ' ' + + if previous_det and 'NN' == annotated_token['pos']: + if annotated_token['originalText'].lower() in types: + term += annotated_token['originalText'] + nr = NominalReference(term.strip(), sentence_index) + nominal_references.add(nr) + + return nominal_references + + @staticmethod + def _has_pronoun(sentence: str): + return any( + token.lower() in Resolver._personal_pronouns + or token.lower() in Resolver._possessive_pronouns + or token.lower() in Resolver._reflexive_pronouns + for token in word_tokenize(sentence)) + + @staticmethod + def _pronominal_references(sentence: str, sentence_index: int) -> List[PronominalReference]: + pronominal_references = [] + for token in word_tokenize(sentence): + + possible_pronoun = token.lower() + + if possible_pronoun in Resolver._personal_pronouns: + lookup = Resolver._personal_pronouns[possible_pronoun] + elif possible_pronoun in Resolver._possessive_pronouns: + lookup = Resolver._possessive_pronouns[possible_pronoun] + elif possible_pronoun in Resolver._reflexive_pronouns: + lookup = Resolver._reflexive_pronouns[possible_pronoun] + else: + continue + + pronominal_references.append(PronominalReference(pronoun=token, + gender=lookup.get('gender'), + quantity=lookup.get('quantity'), + pronoun_type=Resolver._pronoun_type(possible_pronoun), + entity_type=Resolver._entity_type(possible_pronoun), + sentence_index=sentence_index, + word_position=sentence.index(token))) + + return pronominal_references + + @staticmethod + def _pronoun_type(token: str) -> PronounType: + + token = token.strip().lower() + + if token in Resolver._personal_pronouns: + return PronounType.PERSONAL + if token in Resolver._possessive_pronouns: + return PronounType.POSSESSIVE + if token in Resolver._reflexive_pronouns: + return PronounType.REFLEXIVE + + return PronounType.NONE + + @staticmethod + def _entity_type(token: str) -> EntityType: + if token.lower() in ['he', 'she', 'his', 'her', 'him', 'hers', 'they', 'their', 'theirs']: + return EntityType.PERSON + + return EntityType.THING + + @staticmethod + def _longest_precedent_string_with_substring( + name_references: List[NameReference], + substring: str, + sentence_index: int) -> str: + + output = substring + for name_reference in name_references: + terms = name_reference.term.split(' ') + if output in terms: + if name_reference.sentence_index < sentence_index: + if len(terms) > len(output.split(' ')): + output = name_reference.term + + return output + + @staticmethod + def _gender( + name: str) -> GenderType: + return Resolver._gender_classifier.classify(name.split(' ')[0]) + + # noinspection PyUnusedLocal + @staticmethod + def _quantity(name: str) -> QuantityType: + return QuantityType.SINGULAR + + @staticmethod + def _valid_NER_type(terms: List[str]) -> bool: + return all(x not in terms for x in ['DATE', 'NUMBER', 'SET', 'MONEY', 'PERCENT', 'DURATION', 'MISC', 'ORDINAL']) + + @staticmethod + def get_passages_and_linked_entities( + substitutions: List[Substitution], + resolved_sentences: Dict[int, str], + entity_links: Dict[str, str]) -> List[ResolvedPassage]: + linked_passages = [] + + for index, sentence in resolved_sentences.items(): + linked_entities = {} + for substitution in substitutions: + if substitution.sentence_index == index: + for entityLabel, entityLink in entity_links.items(): + if substitution.reference.lower() == entityLabel.lower(): + linked_entities[substitution.reference] = entityLink + if substitution.reference not in linked_entities: + linked_entities[substitution.reference] = '' + + if 'p.' not in sentence and '(' not in sentence and len(sentence) >= 25: + linked_passages.append( + ResolvedPassage(index, linked_entities) + ) + + return linked_passages + + @staticmethod + def get_entity_links(article_id: str, links: List[Dict[str, str]]) -> Dict[str, str]: + entity_links = {} + label = article_id[article_id.rfind('/') + 1:].replace('_', ' ') + entity_links[label] = article_id + for link in links: + entity_links[link['anchorText']] = link['link'] + + return entity_links diff --git a/classifier/GenderClassifier.py b/src/classifier/GenderClassifier.py similarity index 73% rename from classifier/GenderClassifier.py rename to src/classifier/GenderClassifier.py index 3cf6d68..76cc8df 100644 --- a/classifier/GenderClassifier.py +++ b/src/classifier/GenderClassifier.py @@ -3,6 +3,8 @@ import nltk from nltk.corpus import names +from constant_types import GenderType + class GenderClassifier(object): def __init__(self): @@ -17,8 +19,8 @@ def gender_features(word): def __train(self): # Training a gender classifier - labeled_names = ([(name, 'male') for name in names.words('male.txt')] + - [(name, 'female') for name in names.words('female.txt')]) + labeled_names = ([(name, GenderType.MALE) for name in names.words('male.txt')] + + [(name, GenderType.FEMALE) for name in names.words('female.txt')]) random.shuffle(labeled_names) @@ -26,5 +28,5 @@ def __train(self): train_set, test_set = feature_sets[500:], feature_sets[:500] return nltk.NaiveBayesClassifier.train(train_set) - def classify(self, name: str) -> str: + def classify(self, name: str) -> GenderType: return self.classifier.classify(GenderClassifier.gender_features(name)) diff --git a/classifier/LexiconGenderClassifier.py b/src/classifier/LexiconGenderClassifier.py similarity index 100% rename from classifier/LexiconGenderClassifier.py rename to src/classifier/LexiconGenderClassifier.py diff --git a/classifier/__init__.py b/src/classifier/__init__.py similarity index 100% rename from classifier/__init__.py rename to src/classifier/__init__.py diff --git a/src/constant_types/EntityType.py b/src/constant_types/EntityType.py new file mode 100644 index 0000000..589fe4d --- /dev/null +++ b/src/constant_types/EntityType.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class EntityType(Enum): + NONE = 0 + PERSON = 1 + OTHER = 2 + THING = 3 diff --git a/src/constant_types/GenderType.py b/src/constant_types/GenderType.py new file mode 100644 index 0000000..1b4085d --- /dev/null +++ b/src/constant_types/GenderType.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class GenderType(Enum): + NONE = 0 + MALE = 1 + FEMALE = 2 + NEUTRAL = 3 diff --git a/src/constant_types/PronounType.py b/src/constant_types/PronounType.py new file mode 100644 index 0000000..d483829 --- /dev/null +++ b/src/constant_types/PronounType.py @@ -0,0 +1,8 @@ +from enum import Enum + + +class PronounType(Enum): + NONE = 0 + PERSONAL = 1 + POSSESSIVE = 2 + REFLEXIVE = 3 diff --git a/src/constant_types/QuantityType.py b/src/constant_types/QuantityType.py new file mode 100644 index 0000000..0f05425 --- /dev/null +++ b/src/constant_types/QuantityType.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class QuantityType(Enum): + NONE = 0 + SINGULAR = 1 + PLURAL = 2 diff --git a/src/constant_types/__init__.py b/src/constant_types/__init__.py new file mode 100644 index 0000000..a0b52eb --- /dev/null +++ b/src/constant_types/__init__.py @@ -0,0 +1,4 @@ +from .EntityType import EntityType +from .GenderType import GenderType +from .PronounType import PronounType +from .QuantityType import QuantityType diff --git a/src/references/NameReference.py b/src/references/NameReference.py new file mode 100644 index 0000000..1f5e31f --- /dev/null +++ b/src/references/NameReference.py @@ -0,0 +1,42 @@ +from constant_types import EntityType +from constant_types import GenderType +from constant_types import QuantityType + + +class NameReference(object): + def __init__(self, + term: str, + entity_type: EntityType, + gender: GenderType, + quantity: QuantityType, + sentence_index: int, + word_position: int): + self.term = term + self.entity_type = entity_type + if entity_type == EntityType.PERSON: + self.gender = gender + else: + self.gender = GenderType.NEUTRAL + self.quantity = quantity + self.sentence_index = sentence_index + self.word_position = word_position + self.resolved_term = term + + def get_sentence_index(self): + return self.sentence_index + + def __str__(self): + return "{self.term} {self.entity_type} {self.gender} {self.quantity} " \ + "{self.sentence_index} {self.word_position}\n".format(self=self) + + def __repr__(self): + return self.__str__() + + def __eq__(self, other): + return (self.term == other.term) \ + and (self.entity_type == other.entity_type) \ + and (self.sentence_index == other.sentence_index) \ + and (self.word_position == other.word_position) + + def __hash__(self): + return hash(self.term + str(self.entity_type) + str(self.sentence_index)) diff --git a/src/references/NominalReference.py b/src/references/NominalReference.py new file mode 100644 index 0000000..90d2b56 --- /dev/null +++ b/src/references/NominalReference.py @@ -0,0 +1,16 @@ +class NominalReference(object): + def __init__(self, term: str, sentence_index: int): + self.term = term + self.sentence_index = sentence_index + + def __str__(self): + return "{self.term} {self.sentence}".format(self=self) + + def __repr__(self): + return self.__str__() + + def __eq__(self, other): + return (self.term == other.term) and (self.sentence_index == other.sentence_index) + + def __hash__(self): + return hash(self.term + str(self.sentence_index)) diff --git a/src/references/PronominalReference.py b/src/references/PronominalReference.py new file mode 100644 index 0000000..2600a77 --- /dev/null +++ b/src/references/PronominalReference.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + + +# noinspection PyPep8Naming +from constant_types import EntityType +from constant_types import GenderType +from constant_types import PronounType +from constant_types import QuantityType + + +class PronominalReference(object): + def __init__(self, + pronoun: str = '', + gender: GenderType = GenderType.NONE, + quantity: QuantityType = QuantityType.NONE, + pronoun_type: PronounType = PronounType.NONE, + entity_type: EntityType = EntityType.NONE, + sentence_index: int = -1, + word_position: int = -1): + self.pronoun = pronoun + self.gender = gender + self.quantity = quantity + self.pronoun_type = pronoun_type + self.entity_type = entity_type + self.sentence_index = sentence_index + self.word_position = word_position + + def __str__(self): + return "{self.pronoun} {self.gender} {self.quantity} {self.sentence_index} {self.word_position}".format( + self=self) + + def __repr__(self): + return self.__str__() + + def __eq__(self, other): + return (self.pronoun == other.pronoun) \ + and (self.sentence_index == other.sentence_index) \ + and (self.word_position == other.word_position) + + def __hash__(self): + return hash(self.pronoun + ' ' + str(self.sentence_index) + ' ' + str(self.word_position)) diff --git a/references/ResolvedPassage.py b/src/references/ResolvedPassage.py similarity index 100% rename from references/ResolvedPassage.py rename to src/references/ResolvedPassage.py diff --git a/src/references/Substitution.py b/src/references/Substitution.py new file mode 100644 index 0000000..e5be56f --- /dev/null +++ b/src/references/Substitution.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- + + +# noinspection PyPep8Naming +class Substitution(object): + def __init__(self, sentence_index: int, original: str, reference: str): + self.sentence_index = sentence_index + self.original = original + self.reference = reference + + def __str__(self): + return "{self.sentence_index} '{self.original}' -> '{self.reference}'".format(self=self) diff --git a/references/TypeLookup.py b/src/references/TypeLookup.py similarity index 92% rename from references/TypeLookup.py rename to src/references/TypeLookup.py index c95858f..29c3f44 100644 --- a/references/TypeLookup.py +++ b/src/references/TypeLookup.py @@ -12,5 +12,5 @@ def __init__(self, filename: str): fields = line.split(self.__separator) self.__instanceTypes[fields[0].replace('<', '').replace('>', '')] = fields[1].replace('\n', '') - def getType(self, uri: str) -> str: + def type(self, uri: str) -> str: return self.__instanceTypes[uri] if uri in self.__instanceTypes else '' diff --git a/references/__init__.py b/src/references/__init__.py similarity index 100% rename from references/__init__.py rename to src/references/__init__.py diff --git a/resources/instance.types.bz2 b/src/resources/instance.types.bz2 similarity index 100% rename from resources/instance.types.bz2 rename to src/resources/instance.types.bz2 diff --git a/Tests.py b/src/tests/Tests.py similarity index 90% rename from Tests.py rename to src/tests/Tests.py index b7e4253..a229935 100644 --- a/Tests.py +++ b/src/tests/Tests.py @@ -9,7 +9,7 @@ def resolve(filename, uri): text = f.read() substitutions, _ = Resolver.resolve(text, uri) - substituted_text = Resolver.substituteInText(text, substitutions) + substituted_text = Resolver.substitute_in_text(text, substitutions) print(substituted_text) @@ -34,16 +34,16 @@ def resolveAndLinkArticles(filename): substitutions, resolvedSentences = Resolver.resolve( text=dataText['text'], - entityURI=dataText['id']) + entity_uri=dataText['id']) - substitutedText = Resolver.substituteInText( + substitutedText = Resolver.substitute_in_text( text=dataText['text'], substitutions=substitutions) - linkedPassages = Resolver.getPassagesAndLinkedEntities( + linkedPassages = Resolver.get_passages_and_linked_entities( substitutions=substitutions, - resolvedSentences=resolvedSentences, - entityLinks=entityLinks) + resolved_sentences=resolvedSentences, + entity_links=entityLinks) for linkedPassage in linkedPassages: print(linkedPassage) diff --git a/tests/test_Resolver.py b/src/tests/test_Resolver.py similarity index 90% rename from tests/test_Resolver.py rename to src/tests/test_Resolver.py index 15e7848..3aa6aa4 100644 --- a/tests/test_Resolver.py +++ b/src/tests/test_Resolver.py @@ -26,7 +26,7 @@ def test_clean_text(self): Probably this is a clean sentence.""" ) - cleaned_text = Resolver._cleanText(input_text) + cleaned_text = Resolver._clean_text(input_text) expected_output = sent_tokenize( """I have multiple sentences for years. @@ -38,7 +38,7 @@ def test_clean_text(self): def test_simple_sentence(self): input_text = "Donald Trump is the president of USA. He is a business man." - substituted, _ = Resolver.substituteInText( + substituted = Resolver.substitute_in_text( input_text, Resolver.resolve(input_text, '')) self.assertEqual( @@ -49,7 +49,7 @@ def test_simple_sentence(self): def test_simple_sentence_2(self): input_text = "Donald Trump is the president of USA. He is a business man." - substituted, _ = Resolver.substituteInText( + substituted = Resolver.substitute_in_text( input_text, Resolver.resolve(input_text, '')) self.assertEqual( diff --git a/src/tests/test_WikiArticle.py b/src/tests/test_WikiArticle.py new file mode 100644 index 0000000..faa9052 --- /dev/null +++ b/src/tests/test_WikiArticle.py @@ -0,0 +1,50 @@ +import logging +import os +import sys +import unittest + +from nltk import sent_tokenize + +logging.getLogger('requests').setLevel(logging.WARNING) + +sys.path.append( + os.path.dirname( + os.path.dirname( + os.path.abspath(__file__) + ) + ) +) + +from Resolver import Resolver + + +# noinspection PyProtectedMember +class TestResolverForBarackObama(unittest.TestCase): + def setUp(self): + self.maxDiff = None + + def test_resolver_for_first_paragraphs(self): + with open(os.path.join('examples', 'original.txt')) as i: + sentences = [l for l in i] + + self.assertIsNotNone(sentences) + self.assertEquals(25, len(sentences)) + + text = "\n".join(sentences) + + substitutions = Resolver.resolve(text, "") + substituted = Resolver.substitute_in_text(text, substitutions) + + with open(os.path.join('examples', 'expected.txt')) as i: + expected = [l.strip() for l in i] + + substituted = [s.strip() for s in sent_tokenize(substituted)] + + self.assertEquals(len(substituted), len(expected)) + + for s, e in zip(substituted, expected): + self.assertEquals(s, e) + + +if __name__ == '__main__': + unittest.main()