-
Notifications
You must be signed in to change notification settings - Fork 0
/
ngram_util.py
86 lines (64 loc) · 1.87 KB
/
ngram_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
"""
Created on Sun May 6 14:02:16 2018
@author: Panangam
"""
import json
import re
# bigram
# Read the raw bigram counts (a JSON sequence of [bigram, count] pairs) and
# turn each count into a relative frequency: count / total of all counts.
with open('bigram_to_pair/bigrams.json') as bigramfile:
    bigramfreq_list = json.load(bigramfile)
# A later pair with the same bigram overwrites an earlier one.
bigramfreq = {gram: count for gram, count in bigramfreq_list}
freqsum = sum(bigramfreq.values())
# Only existing keys are reassigned, so the dict size never changes while
# it is being iterated.
for bigram, freq in bigramfreq.items():
    bigramfreq[bigram] = freq / freqsum
# trigram
# Build ``trigramfreq``: lower-cased trigram -> relative frequency
# (count / total of all counts read from the file).
# Each line is read as: three trigram characters, one separator character,
# then an integer count.
trigramfreq = {}
freqsum = 0
with open('english_trigrams.txt') as trigramfile:
    for line in trigramfile:
        # Strip surrounding whitespace rather than unconditionally dropping
        # the last character: a final line without a trailing newline would
        # otherwise lose the last digit of its count.
        count_text = line[4:].strip()
        if not count_text:
            # Blank or truncated line: there is no count to parse.
            continue
        count = int(count_text)
        trigramfreq[line[:3].lower()] = count
        freqsum += count
# Only existing keys are reassigned, so the dict size never changes while
# it is being iterated.
for trigram, freq in trigramfreq.items():
    trigramfreq[trigram] = freq / freqsum
def getBigramProb(phrase):
    """Return an averaged bigram-frequency score for ``phrase``.

    The score is 0 when the phrase contains a digit or when any adjacent
    character pair is rejected by ``bigramFilter``.  Otherwise every
    adjacent pair in which neither character is a space or a period adds
    its value from ``bigramfreq``.  A pair that is missing from the table
    adds 0 instead of raising ``KeyError``, mirroring the membership check
    in ``getTrigramProb``.

    The accumulated sum is divided by (number of scored pairs + 1), so the
    division is always defined, even for phrases with no scorable pair.
    """
    if re.search('[0-9]', phrase):
        return 0

    total = 0
    scored = 0
    for k in range(len(phrase) - 1):
        pair = phrase[k:k + 2]
        if not bigramFilter(pair):
            return 0
        # Pairs touching a space or a period are not scored.
        if ' ' in pair or '.' in pair:
            continue
        total += bigramfreq.get(pair, 0)
        scored += 1

    # The divisor is deliberately left as count + 1, as in the original.
    return total / (scored + 1)
def getTrigramProb(phrase):
    """Return the mean trigram frequency of ``phrase``.

    The score is 0 when the phrase contains a digit or when any
    three-character window is rejected by ``trigramFilter``.  Windows
    containing a period or a space are skipped.  Every remaining window is
    counted; it adds its ``trigramfreq`` value when present in the table
    and nothing otherwise.  The sum is divided by the number of counted
    windows, or returned as-is when no window was counted.
    """
    if re.search('[0-9]', phrase):
        return 0

    total = 0
    counted = 0
    for start in range(len(phrase) - 2):
        gram = phrase[start:start + 3]
        if not trigramFilter(gram):
            return 0
        if '.' in gram or ' ' in gram:
            continue
        if gram in trigramfreq:
            total += trigramfreq[gram]
        counted += 1

    if counted == 0:
        return total
    return total / counted
def bigramFilter(t):
    """Return False for a pair that starts with a period not followed by a
    space; return True for every other pair.

    ``t`` is indexed at positions 0 and 1 (position 1 only when position 0
    is a period).
    """
    return t[0] != '.' or t[1] == ' '
def trigramFilter(t):
    """Return True when the three-character window ``t`` is acceptable.

    A window is rejected (False) when any of these holds:
      * it starts and ends with a period;
      * it starts with a period that is not followed by a space;
      * its middle character is a period not followed by a space;
      * it contains ' .', '..' or ' '.
    """
    if t[0] == '.' and t[2] == '.':
        return False
    if t[0] == '.' and t[1] != ' ':
        return False
    if t[1] == '.' and t[2] != ' ':
        return False
    # NOTE(review): as written, the single-space test below already covers
    # the ' .' test and rejects every window containing a space -- confirm
    # whether a double space was intended here.
    if ' .' in t or '..' in t or ' ' in t:
        return False
    return True
if __name__ == '__main__':
    # Ad-hoc smoke output when the module is run directly: dump the
    # normalised bigram table, then a few quick slicing/filter checks.
    sample = 'abcd'
    print(bigramfreq)
    print(sample[0:2])
    print(len(sample))
    print(trigramFilter('a. '))