-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrhyme.py
112 lines (97 loc) · 3.8 KB
/
rhyme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from __future__ import with_statement
import numpy as np
# Pronunciation table: maps a dictionary word to its list of phoneme symbols.
# It is filled once, at import time, from 'cmudict.0.7a' in the current
# working directory.  get_phonetic() looks words up in upper case.
phoneticDict = {}
with open('cmudict.0.7a') as rhymelist:
    for line in rhymelist:
        # Lines beginning with ';;;' carry no pronunciation and are skipped.
        if line.startswith(';;;'):
            continue
        # Split on the first run of whitespace only: the word comes first and
        # the rest of the line is its pronunciation.  Splitting on every
        # single space produced one field per phoneme, which broke the
        # two-name unpacking for any entry with more than one phoneme.
        fields = line.split(None, 1)
        if len(fields) != 2:
            # Blank line, or a word with nothing after it: nothing to store.
            continue
        word, phon = fields
        phoneticDict[word] = phon.split()
def get_phonetic(text):
    """Return the phoneme list for a whitespace-separated phrase.

    Each word is upper-cased and looked up in ``phoneticDict``.  A word that
    is not in the table falls back to its upper-cased letters, one list
    element per character.  The per-word results are concatenated in order.
    """
    phonemes = []
    for token in text.split():
        key = token.upper()
        if key in phoneticDict:
            phonemes += phoneticDict[key]
        else:
            # Unknown word: spell it out letter by letter.
            phonemes += list(key)
    return phonemes
def edit_distance(list1, list2):
    """Return the edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1; equal elements cost
    nothing.  Both arguments may be strings or lists (anything supporting
    ``len()`` and integer indexing); elements are compared with ``==``.

    The result is the bottom-right cell of the dynamic-programming table,
    i.e. a NumPy integer scalar.
    """
    m = len(list1)
    n = len(list2)
    data = np.zeros((m + 1, n + 1), 'i')
    # Turning an empty prefix into a prefix of length k takes k edits.
    data[0, :] = np.arange(n + 1)
    data[:, 0] = np.arange(m + 1)
    # range() rather than xrange() so the loop runs on Python 2 and 3 alike.
    for a in range(1, m + 1):
        for b in range(1, n + 1):
            if list1[a - 1] == list2[b - 1]:
                # Matching elements extend the diagonal at no cost.
                data[a, b] = data[a - 1, b - 1]
            else:
                # Cheapest of deletion, insertion and substitution.
                data[a, b] = 1 + min(data[a - 1, b],
                                     data[a, b - 1],
                                     data[a - 1, b - 1])
    return data[m, n]
def longest_match(list1, list2):
    """Return the length of the longest contiguous run shared by both inputs.

    Both arguments may be strings or lists; elements are compared with
    ``==``.  ``data[a, b]`` holds the length of the common run that ends at
    ``list1[a-1]`` and ``list2[b-1]``; the answer is the largest cell, returned
    as a NumPy integer scalar (0 when nothing matches or an input is empty).
    """
    m = len(list1)
    n = len(list2)
    data = np.zeros((m + 1, n + 1), 'i')
    # range() rather than xrange() so the loop runs on Python 2 and 3 alike.
    for a in range(1, m + 1):
        for b in range(1, n + 1):
            # Cells start at zero, so only matches need to be written.
            if list1[a - 1] == list2[b - 1]:
                data[a, b] = 1 + data[a - 1, b - 1]
    return np.max(data)
def prefix_match(list1, list2):
    """Return how many leading elements the two sequences have in common.

    Both arguments may be strings or lists.  Elements are compared pairwise
    from the front and counting stops at the first difference, so the work is
    linear in the shorter length instead of re-comparing ever shorter slices.
    Returns 0 when the first elements differ or either input is empty.
    """
    count = 0
    for left, right in zip(list1, list2):
        if left != right:
            break
        count += 1
    return count
def suffix_match(list1, list2):
    """Return how many trailing elements the two sequences have in common.

    Both arguments may be strings or lists.  Elements are compared pairwise
    from the end and counting stops at the first difference, so the work is
    linear in the shorter length instead of re-comparing ever shorter slices.
    Returns 0 when the last elements differ or either input is empty.
    """
    count = 0
    for left, right in zip(reversed(list1), reversed(list2)):
        if left != right:
            break
        count += 1
    return count
def scaled_edit_distance(list1, list2):
    """Return an edit-distance similarity scaled by the shorter input.

    Computed as ``1 - edit_distance / min(len(list1), len(list2))``, so
    identical sequences score 1.0.  Because the divisor is the shorter
    length, the score can fall below zero when the inputs differ a lot.

    Returns 0.0 when either input is empty; there is no length to scale by,
    and dividing would raise ZeroDivisionError.
    """
    shorter = min(len(list1), len(list2))
    if shorter == 0:
        return 0.0
    # was max
    return 1 - float(edit_distance(list1, list2)) / shorter
def scaled_suffix_match(list1, list2):
    """Return the shared-suffix length as a fraction of the shorter input.

    The value is ``suffix_match / min(len(list1), len(list2))``.

    Returns 0.0 when either input is empty; there is no length to scale by,
    and dividing would raise ZeroDivisionError.
    """
    shorter = min(len(list1), len(list2))
    if shorter == 0:
        return 0.0
    return float(suffix_match(list1, list2)) / shorter
def scaled_prefix_match(list1, list2):
    """Return the shared-prefix length as a fraction of the shorter input.

    The value is ``prefix_match / min(len(list1), len(list2))``.

    Returns 0.0 when either input is empty; there is no length to scale by,
    and dividing would raise ZeroDivisionError.
    """
    shorter = min(len(list1), len(list2))
    if shorter == 0:
        return 0.0
    return float(prefix_match(list1, list2)) / shorter
def scaled_longest_match(list1, list2):
    """Return the longest shared run as a fraction of the shorter input.

    The value is ``longest_match / min(len(list1), len(list2))``.

    Returns 0.0 when either input is empty; there is no length to scale by,
    and dividing would raise ZeroDivisionError.
    """
    shorter = min(len(list1), len(list2))
    if shorter == 0:
        return 0.0
    # was max
    return float(longest_match(list1, list2)) / shorter
def combined_score(list1, list2):
    """Return the mean of the four scaled similarity measures.

    The measures are, in order: scaled edit distance, scaled suffix match,
    scaled prefix match and scaled longest match.  Each is called with the
    same two sequences and the four results are averaged.
    """
    measures = (
        scaled_edit_distance,
        scaled_suffix_match,
        scaled_prefix_match,
        scaled_longest_match,
    )
    total = sum(measure(list1, list2) for measure in measures)
    return total / 4
def _sounds_like_score(word1, word2):
    """Score two phrases by spelling and by pronunciation; keep the better.

    The spelling score compares the raw strings with spaces removed.  The
    pronunciation score compares the phoneme lists from ``get_phonetic``.
    The larger of the two combined scores is returned.
    """
    spelling_score = combined_score(word1.replace(' ', ''),
                                    word2.replace(' ', ''))
    phonetic_score = combined_score(get_phonetic(word1),
                                    get_phonetic(word2))
    if phonetic_score > spelling_score:
        return phonetic_score
    return spelling_score
def sounds_like_score(word1, word2):
    """Return how alike two phrases sound; higher means more alike.

    Two candidates are computed and the larger one is returned:

    * ``word1`` scored against the whole of ``word2``;
    * the average of ``word1`` scored against each whitespace-separated
      word of ``word2``.

    Returns 0.0 when either argument is empty or contains only whitespace.
    Such input has nothing to compare and used to end in ZeroDivisionError.
    """
    subwords = word2.split()
    if not subwords or not word1.split():
        return 0.0
    whole_score = _sounds_like_score(word1, word2)
    subscores = [_sounds_like_score(word1, subword) for subword in subwords]
    return max(whole_score, sum(subscores) / len(subscores))
def test(cutoff=0.35):
    """Smoke-test the scoring functions against hand-picked word pairs.

    Pairs expected to sound alike must score above ``cutoff``; pairs expected
    to sound different must score below it.  One check also expects the
    pronunciation table to map 'COW' to ['K', 'AW1'].  Raises AssertionError
    on the first expectation that fails.
    """
    # The parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(combined_score('love', 'of'))
    assert get_phonetic('cow') == ['K', 'AW1']
    assert sounds_like_score('ham', 'spam') > cutoff
    assert sounds_like_score('research', 're search') > cutoff
    assert sounds_like_score('spam', 'eggs') < cutoff
    assert sounds_like_score('cow', 'lojbanistan') < cutoff
    assert sounds_like_score('feet', 'eat') > cutoff
    assert sounds_like_score('sister', 'brother') < cutoff
    assert sounds_like_score('mother', 'other') > cutoff
    assert sounds_like_score('a', 'b') < cutoff
    assert sounds_like_score('fish', 'chips') < cutoff
    assert sounds_like_score('fish', 'swish') > cutoff
    assert sounds_like_score('behind', 'not') < cutoff
    assert sounds_like_score('name', 'nomenclature') < cutoff
    assert sounds_like_score('clothing', 'covering') < cutoff
    assert sounds_like_score('heat', 'feat meat') > cutoff
    assert sounds_like_score('love', 'above') > cutoff
    assert sounds_like_score('love', 'of') > cutoff
    assert sounds_like_score('love', 'of another') < cutoff


# Run the smoke test when the module is executed as a script.
if __name__ == '__main__':
    test()