-
Notifications
You must be signed in to change notification settings - Fork 2
/
sentence.py
88 lines (73 loc) · 2.68 KB
/
sentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class Sentence:
    """A mutable sequence of tokens with optional tracking of removals.

    The live tokens are kept in ``self._sen`` (always a ``list``).  When a
    removal method is called with ``backup=True`` a second list,
    ``self._backup_sen``, is created lazily as a snapshot of the sentence at
    that moment.  Tokens removed with ``backup=True`` stay in the snapshot at
    their position but are wrapped in a 1-tuple, e.g. ``"dog"`` becomes
    ``("dog",)``, so callers can tell removed tokens from surviving ones.

    Limitations of the backup mechanism:
      * tokens must not themselves be plain tuples, otherwise they cannot
        be told apart from removal markers;
      * edits made through ``__setitem__``/``__delitem__`` or removals made
        with ``backup=False`` after the snapshot exists are not mirrored
        into the snapshot and will desynchronise it.
    """

    def __init__(self, tokens):
        """Store a private list copy of *tokens* (any iterable)."""
        self._sen = list(tokens)

    def __len__(self):
        """Return the number of live tokens."""
        return len(self._sen)

    def __iter__(self):
        """Iterate over the live tokens in order."""
        return iter(self._sen)

    def __getitem__(self, key):
        """Return the token (or list slice) at *key*."""
        return self._sen[key]

    def __setitem__(self, key, value):
        """Replace the token (or slice) at *key*; the backup is not updated."""
        self._sen[key] = value

    def __delitem__(self, key):
        """Delete the token (or slice) at *key*; the backup is not updated."""
        del self._sen[key]

    def __contains__(self, item):
        """Return True if *item* equals one of the live tokens."""
        return item in self._sen

    def __str__(self):
        """Return the live tokens joined by single spaces (tokens must be strings)."""
        return " ".join(self._sen)

    def ngram_positions(self, ngram):
        """Return every start index at which *ngram* occurs in the sentence.

        Overlapping occurrences are all reported (``["a", "a"]`` in
        ``a a a`` yields ``[0, 1]``).  An empty *ngram* matches nowhere and
        yields ``[]``.
        """
        ngram = list(ngram)
        size = len(ngram)
        if size == 0:
            # Nothing to look for; the original code raised IndexError here.
            return []
        tokens = self._sen
        # A match cannot start later than len(tokens) - size, so no bounds
        # handling is needed; range() is simply empty when the n-gram is
        # longer than the sentence.
        return [
            start
            for start in range(len(tokens) - size + 1)
            if tokens[start:start + size] == ngram
        ]

    def init_backup(self):
        """Create the backup snapshot from the current tokens if none exists yet."""
        if "_backup_sen" not in self.__dict__:
            self._backup_sen = list(self._sen)

    def _live_backup_indices(self):
        """Return the backup indices of tokens not yet marked as removed.

        Element ``k`` of the result is the backup index of live token ``k``,
        provided the sentence and the backup are in sync.
        """
        # Removal markers are always created as plain 1-tuples, so an exact
        # type check is used on purpose: tuple subclasses such as namedtuple
        # tokens are still treated as live tokens.
        return [
            index
            for index, tok in enumerate(self._backup_sen)
            if type(tok) is not tuple
        ]

    def remove_ngram(self, ngram, backup=False):
        """Remove all non-overlapping occurrences of *ngram*, left to right.

        With ``backup=True`` every removed token is additionally wrapped in
        a 1-tuple inside the backup snapshot (created on first use).
        Overlapping candidates are resolved greedily like ``str.replace``:
        once a match is taken, matches starting inside it are skipped.
        """
        ngram = list(ngram)
        size = len(ngram)

        # Keep only matches that do not overlap an already accepted one.
        starts = []
        next_free = 0
        for pos in self.ngram_positions(ngram):
            if pos >= next_free:
                starts.append(pos)
                next_free = pos + size

        if backup:
            self.init_backup()
            # Computed once, before anything is deleted: all start positions
            # are expressed in pre-removal coordinates.
            live = self._live_backup_indices()
            for start in starts:
                for offset in range(size):
                    # Map each token individually; previously removed tokens
                    # may sit between n-gram members in the backup.
                    backup_index = live[start + offset]
                    self._backup_sen[backup_index] = (
                        self._backup_sen[backup_index],
                    )

        # Delete from the right so earlier start positions stay valid.
        for start in reversed(starts):
            del self._sen[start:start + size]

    def remove_toks(self, toks, backup=False):
        """Remove every token that is contained in *toks*.

        *toks* must support repeated ``in`` tests (list, set, tuple, ...).
        With ``backup=True`` each removed token is wrapped in a 1-tuple
        inside the backup snapshot (created on first use).
        """
        if backup:
            self.init_backup()
            # Translate current positions to backup positions; they differ
            # as soon as anything has been removed before.
            live = self._live_backup_indices()
            for index, tok in enumerate(self._sen):
                if tok in toks:
                    self._backup_sen[live[index]] = (tok,)
        # Build a real list: filter() returns a one-shot iterator on
        # Python 3, which would break len(), indexing and slicing.
        self._sen = [tok for tok in self._sen if tok not in toks]

    def get_tokens(self, backup=False):
        """Return the internal token list, or the backup list if *backup*.

        The returned object is the internal list itself, not a copy.  The
        backup list contains removed tokens wrapped in 1-tuples and is
        created on first request.
        """
        if backup:
            self.init_backup()
            return self._backup_sen
        return self._sen