-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunigram.py
40 lines (34 loc) · 1.22 KB
/
unigram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import random
import pickle
import os
class Unigram:
    """Unigram sampler over the whitespace-separated tokens of a text file.

    On construction the file is read into ``self.source`` and a "bag of
    words" (every token, duplicates included, in source order) is built.
    The bag is cached on disk as a pickle so later constructions can skip
    tokenization.

    NOTE: the cache is keyed only by its path, not by the input file. If a
    cache written for a different source file exists at ``cache_path``, it
    is loaded as-is. Pass a distinct ``cache_path`` per corpus, or ``None``
    to disable caching.
    """

    # Default cache location (relative to the current working directory).
    DEFAULT_CACHE_PATH = 'unigram_word_map.dat'

    # Compiled once; raw strings avoid invalid-escape warnings for "\s".
    _WHITESPACE_RUN = re.compile(r"\s+")
    _ONLY_WHITESPACE = re.compile(r"^\s+$")

    def __init__(self, file, cache_path=DEFAULT_CACHE_PATH):
        """Read *file* and build (or load from cache) the bag of words.

        Args:
            file: Path of the text file to read.
            cache_path: Path of the pickle cache for the bag of words.
                Defaults to ``'unigram_word_map.dat'``. ``None`` disables
                both reading and writing the cache.
        """
        with open(file) as f:
            self.source = f.read()
        self.cache_path = cache_path
        self.bag_of_words = []
        self.tokens = []
        self._generate_bag_of_words()

    def random_token(self):
        """Return one token drawn uniformly at random from the bag of words.

        Raises:
            ValueError: If the bag of words is empty.
        """
        # randrange(0) raises ValueError, the same exception type the
        # previous randint(0, -1) produced for an empty bag.
        return self.bag_of_words[random.randrange(len(self.bag_of_words))]

    def _process_token(self, token):
        """Return *token* with any newline / carriage-return characters removed."""
        return token.replace("\n", "").replace("\r", "")

    def _valid_token(self, token):
        """Return False for empty or whitespace-only tokens, True otherwise."""
        if token == "":
            return False
        if self._ONLY_WHITESPACE.match(token):
            return False
        return True

    def _tokenize_source(self):
        """Split ``self.source`` on whitespace runs and store the valid tokens.

        Returns:
            The list stored in ``self.tokens``.
        """
        pieces = self._WHITESPACE_RUN.split(self.source)
        self.tokens = [
            self._process_token(piece)
            for piece in pieces
            if self._valid_token(piece)
        ]
        return self.tokens

    def _generate_bag_of_words(self):
        """Populate ``self.bag_of_words`` from the cache or from the source.

        If the cache file exists it is unpickled and used directly (the
        source is not tokenized, so ``self.tokens`` stays empty). Otherwise
        the source is tokenized, the tokens are appended to the bag, and
        the bag is written to the cache.
        """
        cache_path = self.cache_path
        if cache_path is not None and os.path.exists(cache_path):
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # cache_path at files this program wrote itself.
            # Pickle streams are bytes, so the file must be opened in
            # binary mode.
            with open(cache_path, 'rb') as cache:
                self.bag_of_words = pickle.load(cache)
            return

        # A single extend replaces the old pop(0) loop, which was quadratic
        # and emptied self.tokens as a side effect.
        self.bag_of_words.extend(self._tokenize_source())

        if cache_path is not None:
            with open(cache_path, 'wb') as cache:
                pickle.dump(self.bag_of_words, cache)