-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathipadic.py
40 lines (29 loc) · 1012 Bytes
/
ipadic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from fugashi import GenericTagger
import ipadic
import textspan
import tokenizers
class IpadicTagger(GenericTagger):
    """
    fugashi ``GenericTagger`` constructed with ``ipadic.MECAB_ARGS``.
    """

    def __init__(self):
        # Forward the argument string exposed by the ipadic package
        # unchanged to the GenericTagger constructor.
        mecab_args = ipadic.MECAB_ARGS
        super().__init__(mecab_args)
class IpadicPreTokenizer:
    """
    Custom pre-tokenizer that splits text with an IpadicTagger.

    Note that, since this PreTokenizer is not serializable,
    the model and this pre-tokenizer have to be loaded separately.
    """

    def __init__(self):
        # One tagger per instance; it is reused for every string to split.
        self.tagger = IpadicTagger()

    @classmethod
    def make(cls):
        """Create an instance and wrap it with ``PreTokenizer.custom``."""
        instance = cls()
        return tokenizers.pre_tokenizers.PreTokenizer.custom(instance)

    def pre_tokenize(self, pretok):
        """Call ``pretok.split`` with ``_pre_tokenize`` as the callback.

        Returns nothing; the result of ``pretok.split`` is not used.
        """
        pretok.split(self._pre_tokenize)

    def _pre_tokenize(self, _id, ns):
        """Cut ``ns`` into one slice per span of each tagger token.

        Reads ``ns.normalized``, collects the ``surface`` of every node
        returned by ``self.tagger.parseToNodeList``, asks
        ``textspan.get_original_spans`` where those surfaces lie in the
        text, and returns ``ns[start:end]`` for every span, in order.
        ``_id`` is accepted but not used.
        """
        text = ns.normalized

        surfaces = []
        for node in self.tagger.parseToNodeList(text):
            surfaces.append(node.surface)

        # Each token maps to a group of (start, end) spans; flatten the
        # groups while slicing the normalized string.
        pieces = []
        for span_group in textspan.get_original_spans(surfaces, text):
            for start, end in span_group:
                pieces.append(ns[start:end])
        return pieces