#-------------------------------------------------------------------------------
# Name:        SimilarWords
# Purpose:     finds the label with phrases most similar to a
#              phrase interpreted by stt
#
# Author:      SGreenstein
#
# Created:     30/11/2013
# Copyright:   (c) SGreenstein 2013
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import csv
import stt_google
from collections import Counter


class SimilarWords:
    def __init__(self, fname, penalty=3):
        """Returns an instance of SimilarWords for finding the best label for a phrase

        Keyword arguments:
        fname -- string, specifies csv file of training data
        penalty -- int, penalty for having the same words in multiple labels
                   (default min(3, number of labels - 1))
        """
        self._instances = {} #dictionary. Key: label, value: Counter of word frequencies
        self._bigram_instances = {} #dictionary. Key: label, value: Counter of bigram frequencies
        self._train(fname, penalty)

    def _train(self, fname, penalty):
        """Reads a file of training data and creates self._instances for future use

        Keyword arguments:
        fname -- string, filename of csv file with training data
                 training data is of the following format:
                 label1, hypothesis number one, hypothesis number two,
                 label1, hypothesis one,
                 label2, hypothesis one, et cetera
        penalty -- int, penalty for having the same words in multiple labels
        """
        #open file
        csvfile = open(fname, 'rb')
        reader = csv.reader(csvfile)
        instances = self._instances
        bi_inst = self._bigram_instances
        num_training_instances = Counter()
        #read file into correct format, counting weighted word frequencies
        for row in reader:
            label = row.pop(0)
            #keep track of how many examples of this label there are.
            #Used later for normalization so that labels with more
            #training examples aren't selected more often
            num_training_instances[label] += 1
            if(label in instances):
                word_freqs = instances[label]
                bi_freqs = bi_inst[label]
            else:
                word_freqs = Counter()
                bi_freqs = Counter()
            #if Google guessed fewer than 5 hypotheses, pad with copies of the first
            for i in range(len(row), 5):
                row.append(row[0])
            for index, phrase in enumerate(row):
                last_word = ''
                for word in phrase.split():
                    #weight by the order Google guessed it in
                    #i.e. first guesses weighted more
                    word_freqs[word] += 1 / float(index + 1)
                    if(last_word != ''):
                        bi_freqs[last_word + ' ' + word] += 1 / float(index + 1)
                    last_word = word
            instances[label] = word_freqs
            bi_inst[label] = bi_freqs
        csvfile.close()
        #reduce weight of words common to many labels
        #find total frequencies of words
        avg_word_freqs = Counter()
        avg_bi_freqs = Counter()
        penalty = min(len(instances) - 1, penalty)
        for label in instances:
            word_freqs = instances[label]
            bi_freqs = bi_inst[label]
            for word in word_freqs:
                #normalize by number of training instances
                word_freqs[word] /= float(num_training_instances[label])
            for bi in bi_freqs:
                bi_freqs[bi] /= float(num_training_instances[label])
            avg_word_freqs += word_freqs
            avg_bi_freqs += bi_freqs
            instances[label] = word_freqs
            bi_inst[label] = bi_freqs
        #convert from total to a multiple of the average
        for word in avg_word_freqs:
            avg_word_freqs[word] *= (penalty / float(len(instances)))
        for bi in avg_bi_freqs:
            avg_bi_freqs[bi] *= (penalty / float(len(bi_inst)))
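        #hypothetical worked example: with 4 labels and the default penalty of 3,
        #a word whose normalized frequencies sum to 1.0 across all labels gets
        #1.0 * 3/4 = 0.75 subtracted from its count in each label below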
        #subtract the average frequency of each word
        for label in instances:
            instances[label] -= avg_word_freqs
            bi_inst[label] -= avg_bi_freqs
        self._instances = instances
        self._bigram_instances = bi_inst

    def classify(self, hypotheses, threshold=0.2, confirm_cushion=0.5, bigram_weight=1.5):
        """Returns the best label based on word frequencies,
        or the empty string if confidence doesn't exceed the threshold.
        The second return value is a boolean indicating whether the result
        needs to be confirmed.

        Keyword arguments:
        hypotheses -- result from Google stt to classify
        threshold -- similarity threshold necessary to return a label (default 0.2)
        confirm_cushion -- the result needs confirmation if the best similarity is not
                           this fraction higher than the second-best similarity (default 0.5)
        bigram_weight -- relative to unigrams, how much bigrams should matter (default 1.5)
        """
        bestsimilarity = 0
        secbestsim = 0
        bestlabel = '' #in case no label is ever selected
        ## avgsim = 0 #avg similarity
        ## avgbisim = 0 #avg bigram similarity
        #calculate each label's similarity with the interpreted text
        for label in self._instances:
            word_freqs = self._instances[label]
            bi_freqs = self._bigram_instances[label]
            similarity = 0
            bisimilarity = 0 #similarity based on bigrams
            matched_words = ""
            matched_bigrams = ""
            for index, hypothesis in enumerate(hypotheses):
                phrase = hypothesis['utterance']
                lastword = ''
                for word in phrase.split():
                    #if words match, increase similarity score
                    #weight by the order Google guessed them in
                    similarity += word_freqs[word] / float(index + 1)
                    bisimilarity += bi_freqs[lastword + ' ' + word] / float(index + 1)
                    #record matching words and bigrams for debug printing
                    if(word_freqs[word] / float(index + 1) > 0):
                        matched_words += "\t" + word + ' %.2f\n' % word_freqs[word]
                    if(bi_freqs[lastword + ' ' + word] / float(index + 1) > 0):
                        matched_bigrams += "\t" + lastword + ' ' + word + ' %.2f\n' % bi_freqs[lastword + ' ' + word]
                    lastword = word
            ## avgsim += similarity
            ## avgbisim += bisimilarity
            combinedsim = 10 * (similarity + bisimilarity * bigram_weight) / (1 + bigram_weight)
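            #a weighted average of unigram and bigram similarity, scaled by 10;
            #with the default bigram_weight of 1.5 this works out to
            #4 * similarity + 6 * bisimilarity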
            if combinedsim >= bestsimilarity:
                secbestsim = bestsimilarity
                bestsimilarity = combinedsim
                bestlabel = label
                print label, 'is the best-----------------------------------'
            elif combinedsim > secbestsim:
                #a label can be the runner-up without beating the best
                secbestsim = combinedsim
            print label, '\t%.2f' % similarity
            print label, '\t%.2f' % bisimilarity
            if(matched_words):
                print matched_words
            if(matched_bigrams):
                print matched_bigrams
        ## avgsim /= len(self._instances)
        ## avgbisim /= len(self._bigram_instances)
        print ''
        ## print "Best sim:", bestsimilarity, "Avg sim:", avgsim
        print "Best sim:", bestsimilarity, "2nd best sim:", secbestsim
        if(bestlabel == 'none'):
            #matched the special 'none' label
            return '', False
        elif(bestsimilarity >= threshold):
            #if the best similarity isn't enough higher than the second best, confirm
            should_confirm = (bestsimilarity < secbestsim * (1 + confirm_cushion))
            return bestlabel, should_confirm
        else:
            #nothing matched with sufficient confidence
            return '', False
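

# A minimal usage sketch, not part of the original module: 'commands.csv' and the
# hypothesis list below are hypothetical stand-ins for real training data and for
# the structure stt_google returns (a list of dicts keyed by 'utterance').
if __name__ == '__main__':
    classifier = SimilarWords('commands.csv')
    hypotheses = [{'utterance': 'turn on the lights'},
                  {'utterance': 'turn on the light'}]
    label, needs_confirmation = classifier.classify(hypotheses)
    if label:
        print label, '(needs confirmation)' if needs_confirmation else ''
    else:
        print 'no confident match'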