tokenizer.py
import re


class Tokenizer:
    """Interface: tokenizer() returns (tokens, raw_tokens); restore() rebuilds text from tokens."""

    def tokenizer(self, text):
        raise NotImplementedError

    def restore(self, tokens):
        raise NotImplementedError

    def __call__(self, *args, **kwargs):
        return self.tokenizer(*args, **kwargs)


class BaseTokenizer(Tokenizer):
    def tokenizer(self, text):
        return text.split(" "), text.split(" ")

    def restore(self, tokens):
        return ' '.join(tokens)


class CharTokenizer(Tokenizer):
    def tokenizer(self, text):
        if text.startswith('@'):
            return [text], [text]
        return list(text), list(text)

    def restore(self, tokens):
        return ''.join(tokens)


class ChnTokenizer(Tokenizer):
    def __init__(self):
        import jionlp as jio
        self.jio = jio

    def tokenizer(self, text):
        if text.startswith('@'):
            return [text], [text]
        # Remove punctuation and normalize whitespace
        text = self.jio.remove_exception_char(text)
        text = text.lower()
        text = text.replace(';', '')
        text = re.sub(r"\s+", " ", text).strip()
        text = re.sub("“|”", " ", text)
        tag_token = '$'  # Placeholder for runs of non-Chinese characters
        tag_re = re.compile(r'\[sub]|\[/sub]|\[pre]|[a-zA-Z0-9.]+')
        number_re = re.compile(r'^[0-9.]+$')
        raw_tag = tag_re.findall(text)
        text = tag_re.sub(tag_token, text)
        tokens = list(text)
        raw_tokens = []
        tag_idx = 0
        for token_idx, token in enumerate(tokens):
            if token == tag_token:
                # Put the original non-Chinese run back in place of the placeholder.
                # if raw_tag[tag_idx].isalpha():
                #     tokens[token_idx] = '[eng]'
                # elif raw_tag[tag_idx].isdigit() or number_re.match(raw_tag[tag_idx]):
                #     tokens[token_idx] = '[num]'
                # elif raw_tag[tag_idx] == '[pre]':
                #     tokens[token_idx] = '[pre]'
                # else:
                #     tokens[token_idx] = '[eng_num]'
                tokens[token_idx] = raw_tag[tag_idx]
                raw_tokens.append(raw_tag[tag_idx])
                tag_idx += 1
            else:
                raw_tokens.append(token)
        return tokens, raw_tokens

    def restore(self, tokens):
        return ''.join(tokens)


def load_tokenizer(tokenizer_name):
    """
    Load a tokenizer by name.

    Each tokenizer returns a tuple (tokens, raw_tokens), where
    raw_tokens preserves the original text.

    :param tokenizer_name: one of "base", "chn", or "char"
    :return: Tokenizer
    """
    if tokenizer_name == "base":
        return BaseTokenizer()
    elif tokenizer_name == "chn":
        return ChnTokenizer()
    elif tokenizer_name == "char":
        return CharTokenizer()
    else:
        raise ValueError("Tokenizer {} not found".format(tokenizer_name))
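

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of the (tokens, raw_tokens) contract using the base and
# char tokenizers; ChnTokenizer is omitted here because it requires the
# optional jionlp dependency.
if __name__ == "__main__":
    base = load_tokenizer("base")
    tokens, raw_tokens = base("hello world")  # __call__ forwards to tokenizer()
    print(tokens)                 # ['hello', 'world']
    print(base.restore(tokens))   # hello world

    char = load_tokenizer("char")
    tokens, raw_tokens = char("abc")
    print(tokens)                 # ['a', 'b', 'c']
    print(char.restore(tokens))   # abc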