-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWordSubstitutor.py
81 lines (65 loc) · 3.24 KB
/
WordSubstitutor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The WordSubstitutor class takes in a Word and provides a method that returns
a substitute word of similar meaning and higher complexity. Only verbs,
non-comparator adjectives, adverbs, and non-pronoun nouns are considered
for substitution.
"""
from Word import Word
from DataMuseQuerier import DataMuseQuerier
# Maybe change to a Sentence class input later or a string of words
class WordSubstitutor:
    """Replace a Word with a more complex synonym of the same part of speech.

    Synonym candidates and word frequencies come from a DataMuseQuerier.
    """

    # Parts of speech that are eligible for substitution.
    _SUBSTITUTABLE_POS = ('adj', 'adv', 'v', 'n')
    # Candidates whose "score" entry is below this value are ignored.
    _MIN_SYNONYM_SCORE = 20000
    # Prefix of the tag that carries a candidate's frequency, e.g. "f:12.5".
    # NOTE(review): assumes DataMuse-style frequency tags -- confirm against
    # what DataMuseQuerier actually requests.
    _FREQUENCY_TAG_PREFIX = 'f:'

    def __init__(self):
        """Create the querier used to look up synonyms and frequencies."""
        self.__synonym_querier = DataMuseQuerier()

    @classmethod
    def _extract_frequency(cls, tags):
        """Return the frequency encoded in tags, or None if there is none.

        The tag is located by its prefix rather than by position, so the
        lookup does not depend on the frequency tag being the last one.
        """
        prefix = cls._FREQUENCY_TAG_PREFIX
        for tag in tags:
            if not tag.startswith(prefix):
                continue
            try:
                return float(tag[len(prefix):])
            except ValueError:
                # Malformed frequency value: treat it as unknown.
                return None
        return None

    # ADD IN AN ARGUMENT FOR CHECKING STOP WORDS LATER
    def get_best_synonym(self, word, left_context="", right_context=""):
        """Return the most complex eligible synonym of word, or word itself.

        Every result returned by the querier is examined. A candidate is
        eligible when it carries the same part-of-speech tag as word, its
        score is at least 20000, it is a single word (contains no space) and
        it has a readable frequency tag. The eligible candidate with the
        highest complexity score replaces word only if that score is
        strictly greater than word's own complexity score.

        Side effect: when word has a substitutable part of speech, its
        frequency is set from the querier before scores are compared.

        Args:
            word: Word object whose synonyms the user wants to find
            left_context: optionally specified string as left context
            right_context: optionally specified string as right context
        Returns:
            Word object of the most complex synonym, or the original word
            when its part of speech is not substitutable or no candidate
            is more complex.
        """
        part_of_speech = word.get_part_of_speech()
        # Only look to substitute adj, adv, verbs, and non-pronoun nouns
        if part_of_speech not in self._SUBSTITUTABLE_POS:
            return word

        # Query results from API = synonym list, with part of speech tags
        synonym_list = self.__synonym_querier.get_synonym_query_results(
            word, left_context, right_context)
        # Find and set our word's frequency
        word.set_frequency(self.__synonym_querier.get_frequency(word))

        best_synonym = None
        max_synonym_score = 0
        for syn in synonym_list:
            # Entries without tags cannot be matched on part of speech.
            tags = syn.get('tags') or ()
            if part_of_speech not in tags:
                continue
            # Only consider candidates with a high enough "score"
            if syn.get('score', 0) < self._MIN_SYNONYM_SCORE:
                continue
            # Only consider single words, not multi-word phrases
            if ' ' in syn['word']:
                continue
            freq = self._extract_frequency(tags)
            if freq is None:
                # Without a frequency the candidate is skipped rather than
                # letting a missing tag abort the whole lookup.
                continue
            synonym = Word(syn['word'], part_of_speech, freq)
            syn_score = synonym.compute_complexity_score()
            if syn_score > max_synonym_score:
                best_synonym = synonym
                max_synonym_score = syn_score

        # Substitute only when a candidate was found and it is more complex
        # than the original word; never hand back an empty placeholder.
        if (best_synonym is not None
                and max_synonym_score > word.compute_complexity_score()):
            return best_synonym
        return word
# Example code:
# test_word = Word("dog", "n")
# word_sub = WordSubstitutor()
# test_word = word_sub.get_best_synonym(test_word)
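# print(test_word.get_word())
# --> prints the selected single-word synonym, or 'dog' if none is more complex
#     (multi-word phrases such as 'canis familiaris' are skipped)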