import re
from collections import OrderedDict
from typing import Union, Pattern, Iterator
from nltk import sent_tokenize, word_tokenize
from tps.utils import split_to_tokens
from tps.modules.ssml.elements import Pause
from tps.symbols import separator, shields

char_map = OrderedDict({
    ". ": Pause.eos,
    "? ": Pause.eos,
    "! ": Pause.eos,
    ": ": Pause.colon,
    "; ": Pause.semicolon,
    ", ": Pause.comma,
    " ": Pause.space
})

_spaced_punctuation = re.compile(r" [{}]".format("".join([char for char in char_map if char != " "])))

class Processor:

    def __init__(self, max_unit_length: int=None, name="Processor"):
        """
        Base class for all text processors.

        :param max_unit_length: Optional[int]
            If not None, the passed text will be split into units whose weight does not exceed max_unit_length.
            See Processor.__call__ and Processor.split_to_units
        :param name: str
        """
        self.max_unit_length = max_unit_length
        self.name = name

    def __call__(self, sentence: str, **kwargs) -> str:
        """
        Processes the passed sentence. Exactly one sentence should be passed for correct behavior.
        If the self.max_unit_length attribute is not None, the sentence will be split into units;
        each unit is processed independently, and the results are joined at the end.

        :param sentence: str
        :param kwargs: dict
            See tps.Handler.generate_text
        :return: str
        """
        if self.max_unit_length is not None:
            parts = self.split_to_units(sentence, self.max_unit_length)
        else:
            parts = (sentence,)

        parts = [self.process(part, **kwargs) for part in parts]

        return " ".join(parts)

    def __str__(self):
        return "<{}: max unit length {}>".format(self.name, self.max_unit_length)

    def process(self, string: str, **kwargs) -> str:
        """
        Must be implemented in descendant classes. Processes the passed string.

        :param string: str
        :param kwargs: dict
            See tps.Handler.generate_text
        :return: str
        """
        raise NotImplementedError

    def process_text(self, text: Union[str, list], keep_delimiters: bool=False, **kwargs) -> Union[str, list]:
        """
        Processes arbitrary text: first splits it into sentences, where possible,
        then applies the Processor.__call__ method to each sentence.
        Wraps the Processor.generate_text method, converting the iterator into a list of values.

        :param text: Union[str, list]
            See Processor.generate_text
        :param keep_delimiters: bool
            See Processor.generate_text
        :param kwargs:
            See tps.Handler.generate_text
        :return: Union[str, list]
            Returns the text as a list of processed sentences (with Pause tokens, if keep_delimiters == True)
            or as a single processed string.
            Cases:
                * list - if a list was passed;
                * list - if a string was passed and keep_delimiters == True;
                * str - if a string was passed and keep_delimiters == False;
        """
        return_string = isinstance(text, str) and not keep_delimiters
        processed = list(self.generate_text(text, keep_delimiters, **kwargs))

        return " ".join(processed) if return_string else processed

    def generate_text(self, text: Union[str, list], keep_delimiters: bool=False,
                      **kwargs) -> Iterator[Union[str, Pause]]:
        """
        Produces a generator of processed sentences or units (with Pause tokens, if keep_delimiters == True).

        :param text: Union[str, list]
            Text that needs to be processed.
            Cases:
                * str - just an ordinary string;
                * list - it is assumed that the user submits text that has already been split into sentences
                    (with or without delimiters), for example:
                    [
                        text_part_0,
                        <Pause.eos: 500ms>,
                        text_part_1
                    ]
        :param keep_delimiters: bool
            If True, the final list will contain sentences and Pause tokens between them.
        :param kwargs:
            See tps.Handler.generate_text
        :return: Iterator[Union[str, tps.modules.ssml.Pause]]
        """
        if isinstance(text, str):
            sentences = self.split_to_sentences(text, keep_delimiters)
        elif isinstance(text, list):
            sentences = text
        else:
            raise TypeError

        for sentence in sentences:
            if not isinstance(sentence, Pause):
                processed = self(sentence, **kwargs)
                yield processed
            elif keep_delimiters:
                yield sentence
            else:
                continue
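
    # Minimal consumption sketch (proc is a hypothetical concrete descendant):
    #   for item in proc.generate_text(text, keep_delimiters=True):
    #       if isinstance(item, Pause):
    #           ...  # e.g. schedule a pause of the corresponding duration
    #       else:
    #           ...  # feed the processed sentence downstream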

    def _calc_weight(self, text):
        """
        Calculates the weight of each unit. For example, we do not want to take shield symbols
        into account when counting how many chars there are in the text.

        :param text: str
        :return: int
        """
        _text = text
        for symb in shields:
            _text = _text.replace(symb, "")

        _text = Processor.split_to_tokens(_text)
        weight = sum(len(s.split(separator)) if separator in s else len(s) for s in _text)

        return weight
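
    # Weight intuition (illustrative; assumes separator is the intra-word symbol separator
    # from tps.symbols): shield symbols are ignored entirely, and a token containing the
    # separator is counted by its number of separated symbols rather than raw characters.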

    def _distribute_parts(self, parts, delimiter):
        """
        Auxiliary function for Processor.split_to_units.

        :param parts: list
        :param delimiter: str
        :return: list
        """
        _delimiter = "" if delimiter == " " else delimiter.replace(" ", "")
        parts_grouped = [
            delimiter.join(parts[:len(parts) // 2]) + _delimiter,
            delimiter.join(parts[len(parts) // 2:])
        ]

        return parts_grouped
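
    # E.g. parts == ["a", "b", "c", "d"] with delimiter == ", " yields ["a, b,", "c, d"]:
    # the halves are re-joined and the first half keeps the stripped delimiter,
    # so no punctuation is lost.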

    def split_to_units(self, text: str, max_unit_length: int, keep_delimiter: bool=False) -> list:
        """
        Splits the specified text into units whose weight is at most max_unit_length.

        :param text: str
        :param max_unit_length: int
        :param keep_delimiter: bool
            If True, the final list will contain units and Pause tokens between them.
        :return: list
        """
        if self._calc_weight(text) <= max_unit_length:
            return [text]

        for delimiter in char_map:
            found = text.find(delimiter)
            if found != -1 and found != len(text) - 1:
                break

        if found != -1:
            parts = [p.strip() for p in text.split(delimiter)]
        else:
            parts = [text[:len(text) // 2], text[len(text) // 2:]]

        _parts_grouped = self._distribute_parts(parts, delimiter)

        if keep_delimiter and len(_parts_grouped) > 1:
            _parts_grouped.insert(1, char_map[delimiter]())

        parts_grouped = []
        for part in _parts_grouped:
            if isinstance(part, Pause) or self._calc_weight(part) <= max_unit_length:
                parts_grouped.append(part)
            else:
                parts_grouped.extend(self.split_to_units(part, max_unit_length, keep_delimiter))

        return parts_grouped
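
    # Splitting sketch: with a small max_unit_length, "One, two. Three." is first cut
    # at the strongest delimiter present (". " before ", ", per char_map order), and
    # each half is split again recursively until every unit's weight fits the limit.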

    @staticmethod
    def split_to_sentences(text: str, keep_delimiters: bool=False, language: str="russian") -> list:
        """
        Splits the specified text into sentences using the nltk library.

        :param text: str
        :param keep_delimiters: bool
            If True, the final list will contain sentences and Pause tokens between them.
        :param language: str
            The model name in the nltk Punkt corpus
        :return: list
        """
        parts = sent_tokenize(text, language)
        if keep_delimiters:
            for i in range(1, len(parts)):
                parts.insert(i * 2 - 1, Pause.eos())

        return parts
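
    # Illustrative call (actual splits depend on the nltk Punkt model for the given language):
    #   Processor.split_to_sentences("First. Second.", keep_delimiters=True, language="english")
    #   -> ["First.", Pause.eos(), "Second."]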

    @staticmethod
    def split_to_words(text: str) -> list:
        """
        Splits the specified text into words using the nltk library.

        :param text: str
        :return: list
        """
        return word_tokenize(text)

    @staticmethod
    def join_words(words: list) -> str:
        """
        Reverses the self.split_to_words method.

        :param words: list
            List of words obtained from the self.split_to_words method.
        :return: str
        """
        words = " ".join(words)
        words = _spaced_punctuation.sub(lambda elem: elem.group(0)[-1], words)

        return words
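
    # Round-trip sketch: join_words(["Hello", ",", "world", "."]) first yields
    # "Hello , world ." and the _spaced_punctuation regex then deletes the space
    # before each punctuation mark, restoring "Hello, world.".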

    @staticmethod
    def split_to_tokens(text: str, punct_re: Pattern=None) -> list:
        """
        Splits the specified text into words, treating whitespaces as independent elements.
        Unlike the self.split_to_words method, it cannot recognize complex cases such as 'e.g.',
        but it works a little faster.

        :param text: str
        :param punct_re: Pattern
        :return: list

        Example:
        --------
        >>> proc = Processor()
        >>> text = "splitting sentence, e.g. this one."
        >>> proc.split_to_words(text)
        ['splitting', 'sentence', ',', 'e.g', '.', 'this', 'one', '.']
        >>> proc.split_to_tokens(text)
        ['splitting', ' ', 'sentence', ',', ' ', 'e', '.', 'g', '.', ' ', 'this', ' ', 'one', '.']
        """
        return split_to_tokens(text, punct_re)

    @staticmethod
    def join_tokens(tokens: list) -> str:
        """
        Reverses the self.split_to_tokens method.

        :param tokens: list
            List of tokens obtained from the self.split_to_tokens method.
        :return: str
        """
        return "".join(tokens)