forked from hossein-amirkhani/VocabLevel
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVocabLevel.py
112 lines (100 loc) · 3.88 KB
/
VocabLevel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import os
import codecs
from collections import Counter
import operator
import enchant
from nltk.metrics import edit_distance
import csv
class SpellCorrection(object):
    """Spelling corrector backed by an enchant dictionary.

    A word already present in the dictionary is returned unchanged.
    Otherwise the dictionary's top suggestion is returned, but only when
    it lies within `max_dist` edit operations of the input; suggestions
    further away than that are rejected and None is returned.
    """

    def __init__(self, dict_name='en-US', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        """Return `word` if correctly spelled, its closest acceptable
        suggestion, or None when no suggestion is within `max_dist`."""
        if self.spell_dict.check(word):
            return word
        candidates = self.spell_dict.suggest(word)
        if not candidates:
            return None
        best = candidates[0]
        return best if edit_distance(word, best) <= self.max_dist else None
### To Do: we can consider words with different POS as different words
### To Do: we can check whether the user really knows the answer or not (for example by multiple choices questions)

# Load the cached word list if a previous run produced one; otherwise build it
# from the subtitle files and cache it.  Note: rows read back from the CSV are
# [word, count-as-string] lists, while freshly built entries are (word, int)
# tuples -- only element [0] is used below, so both shapes work.
if os.path.exists('word_list.csv'):
    with open('word_list.csv', 'r') as f:
        # Filter out blank rows that csv.writer may have emitted on some
        # platforms (e.g. Windows without newline handling).
        sorted_words = [row for row in csv.reader(f) if row]
else:
    # extract words
    # a. Remove proper nouns
    # b. Keep only alphabetic words
    # c. Correct misspellings
    # d. Lemmatize words
    # e. Lower-case words
    words = []
    for subdir, _, files in os.walk('Subtitles/'):
        for file in files:
            # 'with' guarantees the handle is closed; the original leaked
            # one open file per subtitle processed.
            with codecs.open(os.path.join(subdir, file), 'r', 'latin-1') as f:
                tagged_sent = pos_tag(word_tokenize(f.read()))
            words.extend([word for word, pos in tagged_sent if pos != 'NNP'])
    replacer = SpellCorrection()
    lemmatizer = WordNetLemmatizer()
    filtered = []
    for w in words:
        if w.isalpha():
            modif_word = replacer.replace(w)
            if modif_word and modif_word.isalpha():
                mw = modif_word.lower()
                # Lemmatize as 'verb' and 'noun' and hold whichever makes the
                # word different.  (Needs more investigation for adjectives and
                # adverbs; better to use the real POS of the word.)
                mwv = lemmatizer.lemmatize(mw, pos='v')  # verb lemma
                if mwv != mw:
                    filtered.append(mwv)
                else:
                    filtered.append(lemmatizer.lemmatize(mw))  # noun lemma
    # count words and sort by descending frequency
    sorted_words = sorted(Counter(filtered).items(), key=operator.itemgetter(1), reverse=True)
    # cache the word list for future runs
    with open('word_list.csv', 'w') as f:
        csv.writer(f).writerows(sorted_words)
# Interact with the user to estimate his/her vocabulary level.
# A skewed binary search over the frequency-sorted word list: answering "n"
# lowers the upper bound, "y" raises the lower bound.  While the remaining
# range is large (>= 100 words) the steps are asymmetric (10% / 90%), since
# not knowing a frequent word is stronger evidence than knowing one.
#
# Fixes vs. the original: the Python-2-only debug statement
# `print end, pos, start` (a SyntaxError under Python 3) was removed,
# `raw_input()` became `input()`, `/` became `//` so the bounds stay integer
# indices (matching the original Python 2 integer division), and the empty
# word-list case no longer raises NameError / ZeroDivisionError.
start = 0
end = len(sorted_words) - 1
pos = -1  # index of the last word the user knows; -1 if none
while start <= end:
    pos = (start + end) // 2
    print('Do you know the meaning of "%s" (y,n)?' % sorted_words[pos][0])
    if input() == 'n':
        if end - start < 100:
            end = pos - 1
        else:
            # large range: drop roughly the top 10% of the remaining words
            end = start + 9 * (end - start) // 10 - 1
        pos -= 1  # the word at pos is unknown, so step back one
    else:
        if end - start < 100:
            start = pos + 1
        else:
            # large range: skip roughly the bottom 10% of the remaining words
            start = start + (end - start) // 10 + 1
# pos: index of the last known word; everything after it is assumed unknown
total = len(sorted_words)
pos += 1  # convert last-known index into a count of known words
print("************************RESULTS***********************")
print("You approximately know %d words out of %d (%%%.1f)"
      % (pos, total, pos * 100 / total if total else 0.0))
print("******************************************************")
# We can give the separate lists of words that the user knows and does not know