# tests/en/test_phraselet_production_EN.py

import unittest
import holmes_extractor as holmes
import os

# Directory containing this test module; 'test_ontology.owl' is looked up next to it.
script_directory = os.path.dirname(os.path.realpath(__file__))

# Ontology-backed manager, coreference resolution switched off.
ontology = holmes.Ontology(
    os.sep.join((script_directory, 'test_ontology.owl')))
ontology_holmes_manager = holmes.Manager(
    model='en_core_web_trf',
    perform_coreference_resolution=False,
    ontology=ontology,
    number_of_workers=1)

# Same ontology, but with analysis of derivational morphology switched off.
ontology_holmes_manager_adm_false = holmes.Manager(
    model='en_core_web_trf',
    perform_coreference_resolution=False,
    ontology=ontology,
    analyze_derivational_morphology=False,
    number_of_workers=1)

# The same ontology file loaded a second time with symmetric matching enabled.
symmetric_ontology = holmes.Ontology(
    os.sep.join((script_directory, 'test_ontology.owl')),
    symmetric_matching=True)
symmetric_ontology_nocoref_holmes_manager = holmes.Manager(
    model='en_core_web_trf',
    perform_coreference_resolution=False,
    ontology=symmetric_ontology,
    number_of_workers=1)

# Manager without any ontology, coreference resolution switched on.
no_ontology_coref_holmes_manager = holmes.Manager(
    model='en_core_web_trf',
    perform_coreference_resolution=True,
    number_of_workers=1)


class EnglishPhraseletProductionTest(unittest.TestCase):

    def _check_equals(self, manager, text_to_match, phraselet_labels,
                      replace_with_hypernym_ancestors=True, match_all_words=False,
                      include_reverse_only=False, process_initial_question_words=False):
        manager.remove_all_search_phrases()
        doc = manager.semantic_analyzer.parse(text_to_match)
        phraselet_labels_to_phraselet_infos = {}
        manager.linguistic_object_factory.add_phraselets_to_dict(doc,
                                                          phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
                                                          replace_with_hypernym_ancestors=replace_with_hypernym_ancestors,
                                                          match_all_words=match_all_words,
                                                          ignore_relation_phraselets=False,
                                                          include_reverse_only=include_reverse_only,
                                                          stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
                                                          stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
                                                          reverse_only_parent_lemmas=manager.semantic_matching_helper.
                                                          topic_matching_reverse_only_parent_lemmas,
                                                          words_to_corpus_frequencies=None,
                                                          maximum_corpus_frequency=None,
                                                          process_initial_question_words=process_initial_question_words)
        self.assertEqual(
            set(phraselet_labels_to_phraselet_infos.keys()),
            set(phraselet_labels))
        self.assertEqual(len(phraselet_labels_to_phraselet_infos.keys()),
                         len(phraselet_labels))

    def _get_phraselet_dict(self, manager, text_to_match, words_to_corpus_frequencies=None,
        maximum_corpus_frequency=None, match_all_words=True):
        manager.remove_all_search_phrases()
        doc = manager.semantic_analyzer.parse(text_to_match)
        phraselet_labels_to_phraselet_infos = {}
        manager.linguistic_object_factory.add_phraselets_to_dict(doc,
                                                          phraselet_labels_to_phraselet_infos=phraselet_labels_to_phraselet_infos,
                                                          replace_with_hypernym_ancestors=False,
                                                          match_all_words=match_all_words,
                                                          ignore_relation_phraselets=False,
                                                          include_reverse_only=True,
                                                          stop_lemmas=manager.semantic_matching_helper.topic_matching_phraselet_stop_lemmas,
                                                          stop_tags=manager.semantic_matching_helper.topic_matching_phraselet_stop_tags,
                                                          reverse_only_parent_lemmas=manager.semantic_matching_helper.
                                                          topic_matching_reverse_only_parent_lemmas,
                                                          words_to_corpus_frequencies=words_to_corpus_frequencies,
                                                          maximum_corpus_frequency=maximum_corpus_frequency,
                                                          process_initial_question_words=False)
        return phraselet_labels_to_phraselet_infos

    
    def test_verb_subject_no_entry_in_ontology(self):
        """Expect an actor relation label and a word label for 'A plant grows'."""
        expected_labels = ['predicate-actor: grow-plant', 'word: plant']
        self._check_equals(ontology_holmes_manager, "A plant grows", expected_labels)

    
    def test_phrasal_verb_subject_no_entry_in_ontology(self):
        """Expect labels built on 'grow up', with the adverb appearing as 'quick'."""
        expected_labels = [
            'governor-adjective: grow up-quick',
            'predicate-actor: grow up-plant',
            'word: plant',
        ]
        self._check_equals(ontology_holmes_manager, "A plant grows up quickly",
                           expected_labels)

    
    def test_phrasal_verb_subject_no_entry_in_ontology_adm_false(self):
        """With the adm-false manager the adverb label is expected to stay 'quickly'."""
        expected_labels = [
            'governor-adjective: grow up-quickly',
            'predicate-actor: grow up-plant',
            'word: plant',
        ]
        self._check_equals(ontology_holmes_manager_adm_false, "A plant grows up quickly",
                           expected_labels)

    
    def test_verb_direct_object_no_entry_in_ontology(self):
        """Expect a passive-subject relation label for 'A plant is grown'."""
        expected_labels = ['predicate-passivesubject: grow-plant', 'word: plant']
        self._check_equals(ontology_holmes_manager, "A plant is grown", expected_labels)

    
    def test_verb_indirect_object_no_entry_in_ontology(self):
        """Expect a recipient relation label for 'Somebody gives something to a plant'."""
        expected_labels = ['predicate-recipient: gift-plant', 'word: plant']
        self._check_equals(ontology_holmes_manager, "Somebody gives something to a plant",
                           expected_labels)

    
    def test_noun_adjective_no_entry_in_ontology(self):
        """Expect a governor-adjective label and a word label for 'A healthy plant'."""
        expected_labels = ['governor-adjective: plant-healthy', 'word: plant']
        self._check_equals(ontology_holmes_manager, "A healthy plant", expected_labels)

    
    def test_verb_adverb_no_entry_in_ontology(self):
        """Expect the single label 'governor-adjective: sail-rapid' for 'They sailed rapidly'."""
        expected_labels = ['governor-adjective: sail-rapid']
        self._check_equals(ontology_holmes_manager, "They sailed rapidly", expected_labels)

    
    def test_verb_adverb_no_entry_in_ontology_adm_false(self):
        """With the adm-false manager the adverb label is expected to stay 'rapidly'."""
        expected_labels = ['governor-adjective: sail-rapidly']
        self._check_equals(ontology_holmes_manager_adm_false, "They sailed rapidly",
                           expected_labels)

    
    def test_noun_noun_no_entry_in_ontology(self):
        """Expect a noun-noun label plus one word label per noun for 'A hobby plant'."""
        expected_labels = ['noun-noun: plant-hobby', 'word: plant', 'word: hobby']
        self._check_equals(ontology_holmes_manager, "A hobby plant", expected_labels)

    
    def test_possessor_possessed_no_entry_in_ontology(self):
        """Expect a word-ofword label plus both word labels for a possessive phrase."""
        expected_labels = ['word-ofword: plant-gardener', 'word: plant', 'word: gardener']
        self._check_equals(ontology_holmes_manager, "A gardener's plant", expected_labels)

    
    def test_combined_no_entry_in_ontology(self):
        """Expect all relation and word labels for a sentence combining several structures."""
        expected_labels = [
            'predicate-actor: grow-plant',
            'governor-adjective: plant-healthy',
            'noun-noun: plant-hobby',
            'word-ofword: plant-gardener',
            'prepgovernor-noun: grow-sun',
            'word: plant',
            'word: hobby',
            'word: gardener',
            'word: sun',
        ]
        self._check_equals(ontology_holmes_manager,
                           "A gardener's healthy hobby plant grows in the sun",
                           expected_labels)

    
    def test_class_entry_in_ontology(self):
        """With default hypernym replacement 'dog' is expected to appear as 'animal'."""
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(ontology_holmes_manager, "A dog progresses", expected_labels)

    
    def test_multiword_class_entry_in_ontology(self):
        """With default hypernym replacement 'cat creature' is expected to appear as 'animal'."""
        expected_labels = ['governor-adjective: animal-small', 'word: animal']
        self._check_equals(ontology_holmes_manager, "A small cat creature", expected_labels)

    
    def test_individual_entry_in_ontology(self):
        """With default hypernym replacement 'Fido' is expected to appear as 'animal'."""
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(ontology_holmes_manager, "Fido progresses", expected_labels)

    
    def test_multiword_individual_entry_in_ontology(self):
        """With default hypernym replacement 'Mimi Momo' is expected to appear as 'animal'."""
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(ontology_holmes_manager, "Mimi Momo progresses", expected_labels)

    
    def test_class_entry_in_ontology_no_hypernym_replacement(self):
        """Without hypernym replacement the labels are expected to keep 'dog'."""
        expected_labels = ['predicate-actor: progress-dog', 'word: dog']
        self._check_equals(ontology_holmes_manager, "A dog progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_class_entry_in_ontology_no_hypernym_replacement(self):
        """Without hypernym replacement the labels are expected to keep 'cat creature'."""
        expected_labels = ['governor-adjective: cat creature-small', 'word: cat creature']
        self._check_equals(ontology_holmes_manager, "A small cat creature", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_individual_entry_in_ontology_no_hypernym_replacement(self):
        """Without hypernym replacement the labels are expected to keep 'fido'."""
        expected_labels = ['predicate-actor: progress-fido', 'word: fido']
        self._check_equals(ontology_holmes_manager, "Fido progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_individual_entry_in_ontology_no_hypernym_replacement(self):
        """Without hypernym replacement the labels are expected to keep 'mimi momo'."""
        expected_labels = ['predicate-actor: progress-mimi momo', 'word: mimi momo']
        self._check_equals(ontology_holmes_manager, "Mimi Momo progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_in_ontology_no_hypernym(self):
        """Expect 'school gear' to be kept as one multiword in both labels."""
        expected_labels = ['predicate-actor: progress-school gear', 'word: school gear']
        self._check_equals(ontology_holmes_manager, "School gear progresses", expected_labels)

    
    def test_multiword_not_in_ontology(self):
        """Expect separate 'inform' and 'extract' labels for 'Information extraction ...'."""
        expected_labels = [
            'predicate-actor: progress-extract',
            'noun-noun: extract-inform',
            'prepgovernor-noun: progress-inform',
            'word: inform',
            'word: extract',
        ]
        self._check_equals(ontology_holmes_manager,
                           "Information extraction progresses with information",
                           expected_labels)

    
    def test_multiword_not_in_ontology_analyze_derivational_morphology_false(self):
        """With the adm-false manager the labels are expected to keep the noun forms."""
        expected_labels = [
            'predicate-actor: progress-extraction',
            'noun-noun: extraction-information',
            'prepgovernor-noun: progress-information',
            'word: information',
            'word: extraction',
        ]
        self._check_equals(ontology_holmes_manager_adm_false,
                           "Information extraction progresses with information",
                           expected_labels)

    
    def test_text_in_ontology_lemma_not_in_ontology(self):
        """With default hypernym replacement 'rainbows' is expected to appear as 'arc'."""
        expected_labels = ['predicate-patient: see-arc', 'word: arc']
        self._check_equals(ontology_holmes_manager, "He saw rainbows", expected_labels)

    
    def test_text_in_ontology_lemma_not_in_ontology_no_hypernym_replacement(self):
        """Without hypernym replacement the labels are expected to keep 'rainbows'."""
        expected_labels = ['predicate-patient: see-rainbows', 'word: rainbows']
        self._check_equals(ontology_holmes_manager, "He saw rainbows", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_class_entry_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: 'dog' is expected to appear as 'animal'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(manager, "A dog progresses", expected_labels)

    
    def test_multiword_class_entry_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: 'cat creature' is expected to appear as 'animal'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['governor-adjective: animal-small', 'word: animal']
        self._check_equals(manager, "A small cat creature", expected_labels)

    
    def test_individual_entry_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: 'Fido' is expected to appear as 'animal'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(manager, "Fido progresses", expected_labels)

    
    def test_multiword_individual_entry_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: 'Mimi Momo' is expected to appear as 'animal'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-animal', 'word: animal']
        self._check_equals(manager, "Mimi Momo progresses", expected_labels)

    
    def test_class_entry_in_ontology_no_hypernym_replacement_symmetric_ontology(self):
        """Symmetric-ontology manager, no hypernym replacement: labels keep 'dog'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-dog', 'word: dog']
        self._check_equals(manager, "A dog progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_class_entry_in_ontology_no_hypernym_replacement_symmetric_ontology(self):
        """Symmetric-ontology manager, no hypernym replacement: labels keep 'cat creature'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['governor-adjective: cat creature-small', 'word: cat creature']
        self._check_equals(manager, "A small cat creature", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_individual_entry_in_ontology_no_hypernym_replacement_symmetric_ontology(self):
        """Symmetric-ontology manager, no hypernym replacement: labels keep 'fido'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-fido', 'word: fido']
        self._check_equals(manager, "Fido progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_individual_entry_in_ontology_no_hypernym_replacement_symm_ontology(self):
        """Symmetric-ontology manager, no hypernym replacement: labels keep 'mimi momo'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-actor: progress-mimi momo', 'word: mimi momo']
        self._check_equals(manager, "Mimi Momo progresses", expected_labels,
                           replace_with_hypernym_ancestors=False)

    
    def test_multiword_not_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: expect separate 'inform' and 'extract' labels."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = [
            'predicate-actor: progress-extract',
            'noun-noun: extract-inform',
            'word: inform',
            'word: extract',
        ]
        self._check_equals(manager, "Information extraction progresses", expected_labels)

    
    def test_text_in_ontology_lemma_not_in_ontology_symmetric_ontology(self):
        """Symmetric-ontology manager: 'rainbows' is expected to appear as 'arc'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-patient: see-arc', 'word: arc']
        self._check_equals(manager, "He saw rainbows", expected_labels)

    
    def test_text_in_ontology_lemma_not_in_ontology_no_hypernym_replacement_symm_ontology(self):
        """Symmetric-ontology manager, no hypernym replacement: labels keep 'rainbows'."""
        manager = symmetric_ontology_nocoref_holmes_manager
        expected_labels = ['predicate-patient: see-rainbows', 'word: rainbows']
        self._check_equals(manager, "He saw rainbows", expected_labels,
                           replace_with_hypernym_ancestors=False)

    def test_prepposs(self):
        """Expect 'year' to be linked to both 'need' and 'insurance' via prepgovernor-noun."""
        expected_labels = [
            'predicate-patient: need-insurance',
            'number-noun: year-five',
            'prepgovernor-noun: need-year',
            'prepgovernor-noun: insurance-year',
            'word: insurance',
            'word: year',
        ]
        self._check_equals(symmetric_ontology_nocoref_holmes_manager,
                           "He needs insurance for five years",
                           expected_labels,
                           replace_with_hypernym_ancestors=False)

    def test_reverse_only(self):
        """With reverse-only phraselets included, 'prep-noun: for-year' is expected as well."""
        expected_labels = [
            'predicate-patient: need-insurance',
            'number-noun: year-five',
            'prepgovernor-noun: need-year',
            'prepgovernor-noun: insurance-year',
            'word: insurance',
            'word: year',
            'prep-noun: for-year',
        ]
        self._check_equals(symmetric_ontology_nocoref_holmes_manager,
                           "He needs insurance for five years",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           include_reverse_only=True)

    def test_coref(self):
        """Coreference manager: 'He' is expected to surface as 'dog' in the chase relation."""
        expected_labels = [
            'predicate-patient: see-dog',
            'predicate-actor: chase-dog',
            'predicate-patient: chase-cat',
            'word: dog',
            'word: cat',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "I saw a dog. He was chasing a cat and a cat",
                           expected_labels)

    def test_reverse_only_parent_lemma(self):
        """With reverse-only included, only 'governor-adjective: have-always' is expected."""
        expected_labels = ['governor-adjective: have-always']
        self._check_equals(no_ontology_coref_holmes_manager, "Always he had it",
                           expected_labels, include_reverse_only=True)

    def test_reverse_only_parent_lemma_suppressed(self):
        """With reverse-only excluded, only the two word labels are expected."""
        expected_labels = ['word: have', 'word: always']
        self._check_equals(no_ontology_coref_holmes_manager, "Always he had it",
                           expected_labels, include_reverse_only=False)

    def test_phraselet_stop_words_governed(self):
        """With reverse-only included, relations governed by 'do' and 'at' are expected."""
        expected_labels = [
            'word: home',
            'prepgovernor-noun: do-home',
            'prep-noun: at-home',
        ]
        self._check_equals(no_ontology_coref_holmes_manager, "So he did it at home",
                           expected_labels, include_reverse_only=True)

    def test_phraselet_stop_words_governed_suppressed(self):
        """With reverse-only excluded, only 'word: home' is expected."""
        expected_labels = ['word: home']
        self._check_equals(no_ontology_coref_holmes_manager, "So he did it at home",
                           expected_labels, include_reverse_only=False)

    def test_question_word(self):
        """With initial question words processed, 'head-WHsubj: open-who' is expected."""
        expected_labels = [
            'head-WHsubj: open-who',
            'predicate-patient: open-door',
            'word: door',
        ]
        self._check_equals(no_ontology_coref_holmes_manager, "Who opened the door?",
                           expected_labels, process_initial_question_words=True)

    def test_question_word_control(self):
        """Without question-word processing no label for 'who' is expected."""
        expected_labels = ['predicate-patient: open-door', 'word: door']
        self._check_equals(no_ontology_coref_holmes_manager, "Who opened the door?",
                           expected_labels, process_initial_question_words=False)

    def test_coref_and_phraselet_labels(self):
        """Call ``add_phraselets_to_dict`` directly and compare the produced label set.

        Only the set of labels is compared here; their number is not checked.
        """
        manager = no_ontology_coref_holmes_manager
        manager.remove_all_search_phrases()
        parsed_doc = manager.semantic_analyzer.parse(
            "I saw a dog. He was chasing a cat and a cat")
        helper = manager.semantic_matching_helper
        produced = {}
        manager.linguistic_object_factory.add_phraselets_to_dict(
            parsed_doc,
            phraselet_labels_to_phraselet_infos=produced,
            replace_with_hypernym_ancestors=False,
            match_all_words=False,
            include_reverse_only=False,
            ignore_relation_phraselets=False,
            stop_lemmas=helper.topic_matching_phraselet_stop_lemmas,
            stop_tags=helper.topic_matching_phraselet_stop_tags,
            reverse_only_parent_lemmas=helper.topic_matching_reverse_only_parent_lemmas,
            words_to_corpus_frequencies=None,
            maximum_corpus_frequency=None,
            process_initial_question_words=False)
        expected_labels = {
            'predicate-patient: see-dog',
            'predicate-actor: chase-dog',
            'predicate-patient: chase-cat',
            'word: dog',
            'word: cat',
        }
        self.assertEqual(set(produced), expected_labels)

    def test_only_verb(self):
        """A lone verb is expected to yield just its word label."""
        expected_labels = ['word: jump']
        self._check_equals(ontology_holmes_manager, "jump", expected_labels)

    def test_only_preposition(self):
        """A lone preposition is expected to yield just its word label."""
        expected_labels = ['word: in']
        self._check_equals(ontology_holmes_manager, "in", expected_labels)

    def test_match_all_words(self):
        """With match_all_words set, word labels for the verbs are expected as well."""
        expected_labels = [
            'predicate-actor: chase-dog',
            'predicate-patient: chase-cat',
            'predicate-patient: see-dog',
            'word: dog',
            'word: cat',
            'word: see',
            'word: chase',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "I saw a dog. He was chasing a cat and a cat",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=True)

    def test_entity_defined_multiword_not_match_all_words(self):
        """Without match_all_words the full name is expected as a single word label."""
        expected_labels = [
            'predicate-actor: come-richard paul hudson',
            'word: richard paul hudson',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "Richard Paul Hudson came",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=False)

    def test_entity_defined_multiword_not_match_all_words_with_adjective(self):
        """Without match_all_words the adjective is expected to attach to the full name."""
        expected_labels = [
            'governor-adjective: richard paul hudson-big',
            'word: richard paul hudson',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "The big Richard Paul Hudson",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=False)

    def test_ontology_defined_multiword_not_match_all_words_with_adjective(self):
        """Without match_all_words 'mimi momo' is expected as a single word label."""
        expected_labels = [
            'governor-adjective: mimi momo-big',
            'word: mimi momo',
        ]
        self._check_equals(ontology_holmes_manager,
                           "The big Mimi Momo",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=False)

    def test_entity_defined_multiword_match_all_words(self):
        """With match_all_words each part of the name is expected as its own word label."""
        expected_labels = [
            'predicate-actor: come-richard paul hudson',
            'word: richard',
            'word: paul',
            'word: hudson',
            'word: come',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "Richard Paul Hudson came",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=True)

    def test_entity_defined_multiword_match_all_words_with_adjective(self):
        """With match_all_words the name parts and the adjective each get a word label."""
        expected_labels = [
            'governor-adjective: richard paul hudson-big',
            'word: richard',
            'word: paul',
            'word: hudson',
            'word: big',
        ]
        self._check_equals(no_ontology_coref_holmes_manager,
                           "The big Richard Paul Hudson",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=True)

    
    def test_ontology_defined_multiword_match_all_words_with_adjective(self):
        """With match_all_words 'mimi', 'momo' and 'big' each get their own word label."""
        expected_labels = [
            'governor-adjective: mimi momo-big',
            'word: mimi',
            'word: momo',
            'word: big',
        ]
        self._check_equals(ontology_holmes_manager,
                           "The big Mimi Momo",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=True)

    
    def test_ontology_and_entity_defined_multiword_match_all_words_with_adjective(self):
        """With match_all_words 'pranaya jones' governs the relations; words are split."""
        expected_labels = [
            'governor-adjective: pranaya jones-big',
            'noun-noun: pranaya jones-richard',
            'word: pranaya',
            'word: jones',
            'word: richard',
            'word: big',
        ]
        self._check_equals(ontology_holmes_manager,
                           "The big Richard Pranaya Jones",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=True)

    def test_ontology_and_entity_defined_multiword_not_match_all_words_with_adjective(self):
        """Without match_all_words 'pranaya jones' is expected as a single word label."""
        expected_labels = [
            'governor-adjective: pranaya jones-big',
            'noun-noun: pranaya jones-richard',
            'word: pranaya jones',
            'word: richard',
        ]
        self._check_equals(ontology_holmes_manager,
                           "The big Richard Pranaya Jones",
                           expected_labels,
                           replace_with_hypernym_ancestors=False,
                           match_all_words=False)

    def test_matching_reprs(self):
        """Check the direct and derivation matching reprs of word and relation phraselets.

        Two word phraselets and two relation phraselets are inspected: one of each
        where no derivation reprs are expected (``None``) and one of each where
        the derivation reprs are expected to differ from the direct reprs.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "The sun shone. They had an anonymous discussion.")

        word_phraselet_1 = phraselets['word: sun']
        self.assertEqual(word_phraselet_1.parent_direct_matching_reprs, ['sun'])
        self.assertIsNone(word_phraselet_1.parent_derivation_matching_reprs)
        self.assertIsNone(word_phraselet_1.child_direct_matching_reprs)
        self.assertIsNone(word_phraselet_1.child_derivation_matching_reprs)

        word_phraselet_2 = phraselets['word: discuss']
        self.assertEqual(word_phraselet_2.parent_direct_matching_reprs, ['discussion'])
        self.assertEqual(word_phraselet_2.parent_derivation_matching_reprs, ['discuss'])
        self.assertIsNone(word_phraselet_2.child_direct_matching_reprs)
        self.assertIsNone(word_phraselet_2.child_derivation_matching_reprs)

        relation_phraselet_1 = phraselets['predicate-actor: shine-sun']
        self.assertEqual(relation_phraselet_1.parent_direct_matching_reprs, ['shine'])
        self.assertIsNone(relation_phraselet_1.parent_derivation_matching_reprs)
        self.assertEqual(relation_phraselet_1.child_direct_matching_reprs, ['sun'])
        self.assertIsNone(relation_phraselet_1.child_derivation_matching_reprs)

        relation_phraselet_2 = phraselets['governor-adjective: discuss-anonymity']
        self.assertEqual(relation_phraselet_2.parent_direct_matching_reprs, ['discussion'])
        self.assertEqual(relation_phraselet_2.parent_derivation_matching_reprs, ['discuss'])
        self.assertEqual(relation_phraselet_2.child_direct_matching_reprs, ['anonymous'])
        self.assertEqual(relation_phraselet_2.child_derivation_matching_reprs, ['anonymity'])

    def test_noun_lemmas_preferred_noun_lemma_first(self):
        """Noun occurs before the verb: 'allowance' is expected as the stored lemma.

        Both sentences are expected to share the labels built on 'allow'; no label
        built on 'allowance' may exist.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They wanted allowances. They wanted to allow it.")
        self.assertNotIn('word: allowance', phraselets)
        self.assertNotIn('predicate-patient: want-allowance', phraselets)

        word_phraselet = phraselets['word: allow']
        self.assertEqual(word_phraselet.parent_lemma, 'allowance')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'allow')

        relation_phraselet = phraselets['predicate-patient: want-allow']
        self.assertEqual(relation_phraselet.child_lemma, 'allowance')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'allow')

    def test_noun_lemmas_preferred_noun_lemma_second(self):
        """Noun occurs after the verb: 'allowance' is still expected as the stored lemma.

        Both sentences are expected to share the labels built on 'allow'; no label
        built on 'allowance' may exist.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They wanted to allow it. They wanted allowances.")
        self.assertNotIn('word: allowance', phraselets)
        self.assertNotIn('predicate-patient: want-allowance', phraselets)

        word_phraselet = phraselets['word: allow']
        self.assertEqual(word_phraselet.parent_lemma, 'allowance')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'allow')

        relation_phraselet = phraselets['predicate-patient: want-allow']
        self.assertEqual(relation_phraselet.child_lemma, 'allowance')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'allow')

    def test_noun_lemmas_preferred_control_1(self):
        """Control with only the verb: lemma and derived lemma are both expected as 'allow'."""
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They wanted to allow it.")
        self.assertNotIn('word: allowance', phraselets)
        self.assertNotIn('predicate-patient: want-allowance', phraselets)

        word_phraselet = phraselets['word: allow']
        self.assertEqual(word_phraselet.parent_lemma, 'allow')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'allow')

        relation_phraselet = phraselets['predicate-patient: want-allow']
        self.assertEqual(relation_phraselet.child_lemma, 'allow')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'allow')

    def test_noun_lemmas_preferred_control_2(self):
        """Control with only the noun: lemma 'allowance', derived lemma 'allow' expected."""
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They wanted allowances.")
        self.assertNotIn('word: allowance', phraselets)
        self.assertNotIn('predicate-patient: want-allowance', phraselets)

        word_phraselet = phraselets['word: allow']
        self.assertEqual(word_phraselet.parent_lemma, 'allowance')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'allow')

        relation_phraselet = phraselets['predicate-patient: want-allow']
        self.assertEqual(relation_phraselet.child_lemma, 'allowance')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'allow')

    def test_shorter_lemmas_preferred_shorter_lemma_first(self):
        """Shorter spelling first: 'behavior' is expected as the stored lemma.

        Both spellings are expected to share the labels built on 'behave'; no label
        built on either spelling itself may exist.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They discussed behavior. They discussed behaviour.")
        for absent_label in ('word: behaviour',
                             'word: behavior',
                             'predicate-patient: discuss-behaviour',
                             'predicate-patient: discuss-behavior'):
            self.assertNotIn(absent_label, phraselets)

        word_phraselet = phraselets['word: behave']
        self.assertEqual(word_phraselet.parent_lemma, 'behavior')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'behave')

        relation_phraselet = phraselets['predicate-patient: discuss-behave']
        self.assertEqual(relation_phraselet.child_lemma, 'behavior')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'behave')

    def test_shorter_lemmas_preferred_adm_false_control(self):
        """Control with the adm-false manager: both spellings keep their own labels.

        No label built on 'behave' is expected, and for 'behavior' the lemma and
        derived lemma are expected to be identical.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager_adm_false,
            "They discussed behavior. They discussed behaviour.")
        self.assertIn('word: behaviour', phraselets)
        self.assertIn('word: behavior', phraselets)
        self.assertNotIn('word: behave', phraselets)
        self.assertIn('predicate-patient: discuss-behaviour', phraselets)
        self.assertIn('predicate-patient: discuss-behavior', phraselets)
        self.assertNotIn('predicate-patient: discuss-behave', phraselets)

        word_phraselet = phraselets['word: behavior']
        self.assertEqual(word_phraselet.parent_lemma, 'behavior')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'behavior')

        relation_phraselet = phraselets['predicate-patient: discuss-behavior']
        self.assertEqual(relation_phraselet.child_lemma, 'behavior')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'behavior')

    def test_shorter_lemmas_preferred_shorter_lemma_second(self):
        """Shorter spelling second: 'behavior' is still expected as the stored lemma.

        Both spellings are expected to share the labels built on 'behave'; no label
        built on either spelling itself may exist.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They discussed behaviour. They discussed behavior.")
        for absent_label in ('word: behaviour',
                             'word: behavior',
                             'predicate-patient: discuss-behaviour',
                             'predicate-patient: discuss-behavior'):
            self.assertNotIn(absent_label, phraselets)

        word_phraselet = phraselets['word: behave']
        self.assertEqual(word_phraselet.parent_lemma, 'behavior')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'behave')

        relation_phraselet = phraselets['predicate-patient: discuss-behave']
        self.assertEqual(relation_phraselet.child_lemma, 'behavior')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'behave')

    def test_shorter_lemmas_preferred_control(self):
        """Control with 'behaviour' and the verb only: 'behaviour' is the expected lemma."""
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(
            no_ontology_coref_holmes_manager,
            "They discussed behaviour. They behaved")
        for absent_label in ('word: behaviour',
                             'word: behavior',
                             'predicate-patient: discuss-behaviour',
                             'predicate-patient: discuss-behavior'):
            self.assertNotIn(absent_label, phraselets)

        word_phraselet = phraselets['word: behave']
        self.assertEqual(word_phraselet.parent_lemma, 'behaviour')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'behave')

        relation_phraselet = phraselets['predicate-patient: discuss-behave']
        self.assertEqual(relation_phraselet.child_lemma, 'behaviour')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'behave')

    
    def test_reverse_derived_lemmas_in_ontology_one_lemma_1(self):
        """Ontology manager: 'moodily' is expected to be labelled as 'moodiness'.

        No label built on 'moody' may exist; the stored lemma is expected to stay
        'moodily' while the derived lemma is 'moodiness'.
        """
        # Local deliberately not named 'dict' so the builtin is not shadowed.
        phraselets = self._get_phraselet_dict(ontology_holmes_manager, "He ate moodily")
        self.assertNotIn('word: moody', phraselets)
        self.assertNotIn('governor-adjective: eat-moody', phraselets)

        word_phraselet = phraselets['word: moodiness']
        self.assertEqual(word_phraselet.parent_lemma, 'moodily')
        self.assertEqual(word_phraselet.parent_derived_lemma, 'moodiness')

        relation_phraselet = phraselets['governor-adjective: eat-moodiness']
        self.assertEqual(relation_phraselet.child_lemma, 'moodily')
        self.assertEqual(relation_phraselet.child_derived_lemma, 'moodiness')

    
    def test_reverse_derived_lemmas_in_ontology_one_lemma_2(self):
        # 'offended' is expected to be labelled with the derived lemma
        # 'offence'. Two further sentences are then merged into the same
        # phraselet dictionary to check how the stored parent lemma of the
        # existing 'word: offence' entry is updated.
        manager = ontology_holmes_manager
        phraselets = self._get_phraselet_dict(manager, "He offended the cat")

        def merge_phraselets_from(text):
            # Parse *text* and add its phraselets to the dictionary built
            # above, using the manager's topic-matching stop settings.
            parsed = manager.semantic_analyzer.parse(text)
            factory = manager.linguistic_object_factory
            helper = manager.semantic_matching_helper
            factory.add_phraselets_to_dict(
                parsed,
                phraselet_labels_to_phraselet_infos=phraselets,
                replace_with_hypernym_ancestors=False,
                match_all_words=True,
                ignore_relation_phraselets=False,
                include_reverse_only=True,
                stop_lemmas=helper.topic_matching_phraselet_stop_lemmas,
                stop_tags=helper.topic_matching_phraselet_stop_tags,
                reverse_only_parent_lemmas=(
                    helper.topic_matching_reverse_only_parent_lemmas),
                words_to_corpus_frequencies=None,
                maximum_corpus_frequency=None,
                process_initial_question_words=False)

        for absent_label in ('word: offend', 'predicate-patient: offend-cat'):
            self.assertNotIn(absent_label, phraselets)

        word_info = phraselets['word: offence']
        self.assertEqual(word_info.parent_lemma, 'offend')
        self.assertEqual(word_info.parent_derived_lemma, 'offence')
        relation_info = phraselets['predicate-patient: offence-cat']
        self.assertEqual(relation_info.parent_lemma, 'offend')
        self.assertEqual(relation_info.parent_derived_lemma, 'offence')

        # After adding the 'offense' spelling, the stored parent lemma is
        # expected to switch from 'offend' to 'offense'.
        merge_phraselets_from('He took offense')
        word_info = phraselets['word: offence']
        self.assertEqual(word_info.parent_lemma, 'offense')
        self.assertEqual(word_info.parent_derived_lemma, 'offence')

        # Adding the 'offence' spelling afterwards is expected to leave the
        # stored parent lemma at 'offense'.
        merge_phraselets_from('He took offence')
        word_info = phraselets['word: offence']
        self.assertEqual(word_info.parent_lemma, 'offense')
        self.assertEqual(word_info.parent_derived_lemma, 'offence')

    
    def test_reverse_derived_lemmas_in_ontology_multiword_match_all_words(self):
        # The individual words 'waste' and 'horse' are expected as word
        # phraselets, while the relation phraselet uses the multiword
        # 'wastage horse'.
        # NOTE(review): match_all_words is not passed here, so this relies on
        # the default of _get_phraselet_dict -- presumably True; confirm.
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager, "He used a waste horse")

        for present_label in ('word: waste', 'word: horse'):
            self.assertIn(present_label, phraselets)

        relation_info = phraselets['predicate-patient: use-wastage horse']
        self.assertEqual(relation_info.child_lemma, 'wastage horse')
        self.assertEqual(relation_info.child_derived_lemma, 'wastage horse')


    def test_reverse_derived_lemmas_in_ontology_multiword_not_match_all_words(self):
        # With match_all_words=False the multiword is expected to appear only
        # as 'wastage horse', in both the word and the relation phraselet;
        # no label may use 'waste horse'.
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "He used a waste horse",
            match_all_words=False)

        for absent_label in ('word: waste horse',
                             'predicate-patient: use-waste horse'):
            self.assertNotIn(absent_label, phraselets)

        word_info = phraselets['word: wastage horse']
        self.assertEqual(word_info.parent_lemma, 'wastage horse')
        self.assertEqual(word_info.parent_derived_lemma, 'wastage horse')

        relation_info = phraselets['predicate-patient: use-wastage horse']
        self.assertEqual(relation_info.child_lemma, 'wastage horse')
        self.assertEqual(relation_info.child_derived_lemma, 'wastage horse')


    def test_frequency_factors_small(self):
        # Corpus frequencies of 1 and 2 against a maximum of 5: every word
        # and relation phraselet is expected to have frequency factor 1.0.
        corpus_frequencies = {'dog': 1, 'chasing': 1, 'cat': 2}
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "The dog chased the cat",
            words_to_corpus_frequencies=corpus_frequencies,
            maximum_corpus_frequency=5)

        # Factors are compared through their string representation.
        expected_factors = (
            ('word: dog', '1.0'),
            ('word: cat', '1.0'),
            ('word: chasing', '1.0'),
            ('predicate-actor: chasing-dog', '1.0'),
            ('predicate-patient: chasing-cat', '1.0'),
        )
        for label, expected in expected_factors:
            phraselet_info = phraselets[label]
            self.assertEqual(str(phraselet_info.frequency_factor), expected)

    
    def test_frequency_factors_small_with_small_mcf(self):
        # Same frequencies as the 'small' case but with a maximum corpus
        # frequency of only 2: all factors are still expected to be 1.0.
        corpus_frequencies = {'dog': 1, 'chasing': 1, 'cat': 2}
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "The dog chased the cat",
            words_to_corpus_frequencies=corpus_frequencies,
            maximum_corpus_frequency=2)

        checked_labels = (
            'word: dog',
            'word: cat',
            'word: chasing',
            'predicate-actor: chasing-dog',
            'predicate-patient: chasing-cat',
        )
        for label in checked_labels:
            phraselet_info = phraselets[label]
            self.assertEqual(str(phraselet_info.frequency_factor), '1.0')

    
    def test_frequency_factors_large(self):
        # Frequencies 3, 4 and 5 against a maximum of 5: the expected factors
        # fall below 1.0 and shrink as the frequency grows; relation
        # phraselets are expected to have smaller factors than their words.
        corpus_frequencies = {'dog': 3, 'chasing': 4, 'cat': 5}
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "The dog chased the cat",
            words_to_corpus_frequencies=corpus_frequencies,
            maximum_corpus_frequency=5)

        # Exact string representations of the expected float factors.
        expected_factors = (
            ('word: dog', '0.5693234419266069'),
            ('word: cat', '0.1386468838532139'),
            ('word: chasing', '0.31739380551401464'),
            ('predicate-actor: chasing-dog', '0.18069973380142287'),
            ('predicate-patient: chasing-cat', '0.044005662088831145'),
        )
        for label, expected in expected_factors:
            phraselet_info = phraselets[label]
            self.assertEqual(str(phraselet_info.frequency_factor), expected)

    
    def test_frequency_factors_large_with_ontology_match(self):
        # 'dog' has frequency 2 but 'puppy' has 4; the expected factor for
        # 'word: dog' equals the one expected for 'chasing' (frequency 4).
        # NOTE(review): presumably 'puppy' is related to 'dog' in the test
        # ontology and its frequency is taken into account -- confirm.
        corpus_frequencies = {'dog': 2, 'puppy': 4, 'chasing': 4, 'cat': 5}
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "The dog chased the cat",
            words_to_corpus_frequencies=corpus_frequencies,
            maximum_corpus_frequency=5)

        expected_factors = (
            ('word: dog', '0.31739380551401464'),
            ('word: cat', '0.1386468838532139'),
            ('word: chasing', '0.31739380551401464'),
            ('predicate-actor: chasing-dog', '0.10073882777866815'),
            ('predicate-patient: chasing-cat', '0.044005662088831145'),
        )
        for label, expected in expected_factors:
            phraselet_info = phraselets[label]
            self.assertEqual(str(phraselet_info.frequency_factor), expected)

    
    def test_frequency_factors_very_large(self):
        # Frequencies 97-99 against a maximum of 100: all expected factors
        # are very small, with the relation phraselets expected in the
        # 1e-05 range.
        corpus_frequencies = {'dog': 97, 'chasing': 98, 'cat': 99}
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager,
            "The dog chased the cat",
            words_to_corpus_frequencies=corpus_frequencies,
            maximum_corpus_frequency=100)

        expected_factors = (
            ('word: dog', '0.008864383480215898'),
            ('word: cat', '0.0043869621537525605'),
            ('word: chasing', '0.00661413286687762'),
            ('predicate-actor: chasing-dog', '5.863021012110299e-05'),
            ('predicate-patient: chasing-cat', '2.9015950566883042e-05'),
        )
        for label, expected in expected_factors:
            phraselet_info = phraselets[label]
            self.assertEqual(str(phraselet_info.frequency_factor), expected)

    def test_ent_types(self):
        # 'Richard' is expected to carry the entity type 'PERSON' on
        # whichever side of a phraselet it occupies; the other words are
        # expected to have an empty entity type.
        phraselets = self._get_phraselet_dict(
            ontology_holmes_manager, "The big Richard came home.")

        # (phraselet label, attribute name, expected entity type)
        expectations = (
            ('word: richard', 'parent_ent_type', 'PERSON'),
            ('predicate-actor: come-richard', 'parent_ent_type', ''),
            ('predicate-actor: come-richard', 'child_ent_type', 'PERSON'),
            ('governor-adjective: richard-big', 'parent_ent_type', 'PERSON'),
            ('governor-adjective: richard-big', 'child_ent_type', ''),
        )
        for label, attribute, expected in expectations:
            phraselet_info = phraselets[label]
            self.assertEqual(getattr(phraselet_info, attribute), expected)

    def test_parent_lemma_replacement(self):
        # Two documents yield the same 'governor-adjective: discuss-loud'
        # label. After the second document is added to the same dictionary,
        # the stored parent lemma is expected to change from 'discuss' to
        # 'discussion'.
        manager = ontology_holmes_manager
        manager.remove_all_search_phrases()
        shared_label = 'governor-adjective: discuss-loud'
        collected = None

        def register_and_collect(text, document_label):
            # Register *text* under *document_label*, then add the phraselets
            # of the registered document to the shared dictionary.
            manager.parse_and_register_document(text, document_label)
            registered_doc = manager.get_document(document_label)
            manager.linguistic_object_factory.add_phraselets_to_dict(
                registered_doc,
                phraselet_labels_to_phraselet_infos=collected,
                replace_with_hypernym_ancestors=False,
                match_all_words=False,
                ignore_relation_phraselets=False,
                include_reverse_only=False,
                stop_lemmas=[],
                stop_tags=[],
                reverse_only_parent_lemmas=[],
                words_to_corpus_frequencies=None,
                maximum_corpus_frequency=None,
                process_initial_question_words=False)

        manager.parse_and_register_document("They discussed loudly", '1')
        first_doc = manager.get_document('1')
        collected = {}
        manager.linguistic_object_factory.add_phraselets_to_dict(
            first_doc,
            phraselet_labels_to_phraselet_infos=collected,
            replace_with_hypernym_ancestors=False,
            match_all_words=False,
            ignore_relation_phraselets=False,
            include_reverse_only=False,
            stop_lemmas=[],
            stop_tags=[],
            reverse_only_parent_lemmas=[],
            words_to_corpus_frequencies=None,
            maximum_corpus_frequency=None,
            process_initial_question_words=False)
        self.assertEqual(collected[shared_label].parent_lemma, 'discuss')

        register_and_collect("A loud discussion", '2')
        self.assertEqual(collected[shared_label].parent_lemma, 'discussion')
        
    def test_child_lemma_replacement(self):
        # Two documents yield the same 'predicate-patient: start-discuss'
        # label. After the second document is added to the same dictionary,
        # the stored child lemma is expected to change from 'discuss' to
        # 'discussion'.
        manager = ontology_holmes_manager
        manager.remove_all_search_phrases()
        shared_label = 'predicate-patient: start-discuss'
        collected = {}

        def register_and_collect(text, document_label):
            # Register *text* under *document_label*, then add the phraselets
            # of the registered document to the shared dictionary.
            manager.parse_and_register_document(text, document_label)
            registered_doc = manager.get_document(document_label)
            manager.linguistic_object_factory.add_phraselets_to_dict(
                registered_doc,
                phraselet_labels_to_phraselet_infos=collected,
                replace_with_hypernym_ancestors=False,
                match_all_words=False,
                ignore_relation_phraselets=False,
                include_reverse_only=False,
                stop_lemmas=[],
                stop_tags=[],
                reverse_only_parent_lemmas=[],
                words_to_corpus_frequencies=None,
                maximum_corpus_frequency=None,
                process_initial_question_words=False)

        register_and_collect("They started to discuss", '3')
        self.assertEqual(collected[shared_label].child_lemma, 'discuss')

        register_and_collect("They started a discussion", '4')
        self.assertEqual(collected[shared_label].child_lemma, 'discussion')