tokenize_dataset.py
"""Stream an OSCAR split, sentence-split and tokenize it with spaCy, and
write one whitespace-joined tokenized sentence per output line."""
import re
from itertools import islice
from pathlib import Path

import spacy
import typer
from datasets import load_dataset


def main(
    lang: str,
    oscar_dataset: str,
    max_texts: int,
    output_file: Path,
    n_process: int = 8,
    batch_size: int = 100,
):
    # Korean: use the plain rule-based tokenizer instead of the default
    # mecab-backed one; Chinese: use the pkuseg word segmenter.
    if lang == "ko":
        nlp = spacy.blank(
            "ko", config={"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
        )
    elif lang == "zh":
        nlp = spacy.blank("zh", config={"nlp": {"tokenizer": {"segmenter": "pkuseg"}}})
        nlp.tokenizer.initialize(pkuseg_model="spacy_ontonotes")
    else:
        nlp = spacy.blank(lang)
    nlp.add_pipe("sentencizer")
    nlp.max_length = 10**8

    # Stream the dataset so the full corpus never has to fit in memory.
    dataset = load_dataset("oscar", oscar_dataset, split="train", streaming=True)
    with open(output_file, "w") as output_fileh:
        # Collapse all whitespace runs to single spaces before tokenizing.
        texts = (
            re.sub(r"\s+", " ", line["text"].strip())
            for line in islice(iter(dataset), max_texts)
        )
        for doc in nlp.pipe(texts, n_process=n_process, batch_size=batch_size):
            # Write one whitespace-tokenized sentence per output line.
            for sent in doc.sents:
                output_fileh.write(" ".join([t.text for t in sent]) + "\n")


if __name__ == "__main__":
    typer.run(main)
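

# Example invocation (a sketch: the config name "unshuffled_deduplicated_en" is an
# assumed OSCAR subset name, and the text count and output path are placeholders;
# typer exposes the positional arguments and turns keyword defaults into options):
#
#   python tokenize_dataset.py en unshuffled_deduplicated_en 1000000 sentences.en.txt \
#       --n-process 8 --batch-size 100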