build_pretrain_dataset.py

"""Build a Japanese Wikipedia pretraining corpus: download the dump, clean
each article's text, and write one sentence per line, with a blank line
between articles."""

import argparse
import gzip
import json
import os
import re
import unicodedata
from urllib.request import urlretrieve

import fugashi
from tqdm import tqdm

from corpus.jp_wiki.config import Config

config = Config()
class MeCabSentenceSplitter(object):
    """Splits raw text into sentences using MeCab's token-level POS tags."""

    def __init__(self, mecab_dict_path=None):
        if mecab_dict_path is not None:
            self.mecab = fugashi.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = fugashi.Tagger()

    def __call__(self, text):
        sentences = []
        start = 0
        end = 0
        for line in self.mecab.parse(text).split("\n"):
            if line == "EOS":
                # Flush any trailing text that has no closing punctuation.
                if len(text[start:]) > 0:
                    sentences.append(text[start:])
                break
            token, token_info = line.split("\t", maxsplit=1)
            # Track the token's end offset in the original string so sentence
            # boundaries preserve the original spacing.
            end = text.index(token, end) + len(token)
            # A token tagged as a sentence-final period (記号/句点) closes a sentence.
            if "記号" in token_info and "句点" in token_info:
                sentences.append(text[start:end])
                start = end
        return sentences
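
# Illustrative check of the splitter (not part of the pipeline). Exact feature
# strings vary by MeCab dictionary, but both IPADIC (記号,句点) and UniDic
# (補助記号,句点) tag "。" with substrings matching the check above:
#
#   splitter = MeCabSentenceSplitter()
#   splitter("これはペンです。それは本です。")
#   # -> ["これはペンです。", "それは本です。"]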
def download_data():
    if not os.path.exists(config.raw_data_path):
        print(f'Downloading {config.download_link} to {config.raw_data_path}')
        urlretrieve(config.download_link, config.raw_data_path)
        print(f'Successfully downloaded {config.raw_data_path}')
def preprocess_text(text, title=None):
    text = unicodedata.normalize("NFKC", text)
    # remove invisible characters
    text = "".join(c for c in text if c.isprintable())
    # remove citation markers and wiki templates
    text = re.sub(r"\[\d+?\]", "", text)
    text = re.sub(r"\[要.+?\]", "", text)
    text = re.sub(r"\{\{+[^{}]+?\}\}+", "", text)
    # remove navigation breadcrumbs leading up to the article title
    if title is not None:
        text = re.sub(r"^.+? \> " + re.escape(title), "", text)
    # remove footnotes
    text = re.sub(r" \^ .+", "", text)
    # remove annotations such as [要出典] (citation needed) or [リンク切れ] (dead link)
    text = re.sub(r"\[(要出典|リンク切れ|.+?\?)\]", "", text)
    # collapse all whitespace (including newlines) to single spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text
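
# Illustrative before/after (not part of the pipeline), following only the
# substitutions above: the template, citation marker, and annotation are all
# stripped:
#
#   preprocess_text("地球{{要出典範囲|は丸い}}[1][要出典]。")
#   # -> "地球。"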
def filter_text(text):
    # filter out text containing leaked LaTeX/MathJax equation markup
    # (note the escaped backslash; a bare "\d" raises a SyntaxWarning)
    if "\\displaystyle" in text:
        return False
    return True
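
# Illustrative filter behavior (assumption: Wikipedia math markup leaks the
# literal string "\displaystyle" into extracted text):
#
#   filter_text("x = {\displaystyle {\frac {1}{2}}}")  # -> False
#   filter_text("普通の文です。")                       # -> True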
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mecab_dict_path", type=str)
    parser.add_argument("--min_length", type=int, default=1,
                        help="keep only sentences with at least N characters")
    parser.add_argument("--max_length", type=int, default=1024,
                        help="keep only sentences with at most N characters")
    args = parser.parse_args()

    if not os.path.exists(config.raw_data_dir):
        os.makedirs(config.raw_data_dir)
    download_data()

    sent_splitter = MeCabSentenceSplitter(args.mecab_dict_path)
    with gzip.open(config.raw_data_path, "rt") as input_file, \
            open(config.extracted_data_path, "w") as output_file:
        # The dump is JSON Lines: one article object per line.
        for line in tqdm(input_file):
            json_item = json.loads(line)
            text = json_item.get("text")
            if text is None:
                continue
            title = json_item.get("title")
            text = preprocess_text(text, title=title)
            is_processed = False
            for sentence in sent_splitter(text):
                sentence = sentence.strip()
                if len(sentence) < args.min_length:
                    continue
                if len(sentence) > args.max_length:
                    continue
                if not filter_text(sentence):
                    continue
                # preprocess_text collapsed all whitespace, so these always hold
                assert "\n" not in sentence
                assert sentence != ""
                print(sentence, file=output_file)
                is_processed = True
            # A blank line separates consecutive articles in the output corpus.
            if is_processed:
                print("", file=output_file)