-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add `spacy.PlainTextCorpusReader.v1` (#12122)
* Add `spacy.PlainTextCorpusReader.v1`

  This is a corpus reader that reads plain text corpora with the following format:

  - UTF-8 encoding.
  - One line per document.
  - Blank lines are ignored.

  It is useful for applications where we deal with very large corpora, such as distillation, and don't want to deal with the space overhead of serialized formats. Additionally, many large corpora already use such a text format, keeping the necessary preprocessing to a minimum.

* Update spacy/training/corpus.py

  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* docs: add version to `PlainTextCorpus`
* Add docstring to registry function
* Add plain text corpus tests
* Only strip newline/carriage return
* Add return type to `_string_to_tmp_file` helper
* Use a temporary directory in place of file name

  Different OS auto delete/sharing semantics are just wonky.

* This will be new in 3.5.1 (rather than 4)
* Test improvements from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
- Loading branch information
1 parent
a37117a
commit 8d69874
Showing
4 changed files
with
215 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from typing import IO, Generator, Iterable, List, TextIO, Tuple | ||
from contextlib import contextmanager | ||
from pathlib import Path | ||
import pytest | ||
import tempfile | ||
|
||
from spacy.lang.en import English | ||
from spacy.training import Example, PlainTextCorpus | ||
from spacy.util import make_tempdir | ||
|
||
# Plain-text corpus fixture: each non-blank line is one document. | ||
# Intentional newlines to check that they are skipped, i.e. that blank lines | ||
# do not produce documents of their own. | ||
PLAIN_TEXT_DOC = """ | ||
This is a doc. It contains two sentences. | ||
This is another doc. | ||
A third doc. | ||
""" | ||
|
||
# Expected token texts for PLAIN_TEXT_DOC: one inner list per document line, | ||
# in the same order as the lines appear. test_plain_text_reader filters this | ||
# list by length and compares it against what the corpus reader yields. | ||
PLAIN_TEXT_DOC_TOKENIZED = [ | ||
[ | ||
"This", | ||
"is", | ||
"a", | ||
"doc", | ||
".", | ||
"It", | ||
"contains", | ||
"two", | ||
"sentences", | ||
".", | ||
], | ||
["This", "is", "another", "doc", "."], | ||
["A", "third", "doc", "."], | ||
] | ||
|
||
|
||
@pytest.mark.parametrize("min_length", [0, 5])
@pytest.mark.parametrize("max_length", [0, 5])
def test_plain_text_reader(min_length, max_length):
    """Check PlainTextCorpus output against the fixture under length filters.

    The fixture text is written to a temporary file, read back through
    ``PlainTextCorpus`` with the parametrized ``min_length``/``max_length``,
    and the token texts of both the reference and predicted docs are compared
    with the subset of ``PLAIN_TEXT_DOC_TOKENIZED`` that passes the same
    length limits.
    """

    def _within_limits(tokens):
        # This test treats max_length == 0 as "no upper bound".
        if len(tokens) < min_length:
            return False
        return max_length == 0 or len(tokens) <= max_length

    expected = [
        tokens for tokens in PLAIN_TEXT_DOC_TOKENIZED if _within_limits(tokens)
    ]

    pipeline = English()
    with _string_to_tmp_file(PLAIN_TEXT_DOC) as corpus_path:
        reader = PlainTextCorpus(
            corpus_path, min_length=min_length, max_length=max_length
        )
        # Consume the examples while the temporary file still exists.
        reference, predicted = _examples_to_tokens(reader(pipeline))

        assert reference == expected
        assert predicted == expected
|
||
|
||
@contextmanager
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
    """Write ``s`` to a UTF-8 text file in a temporary directory and yield its path.

    The file is called ``string.txt`` and is created inside the directory
    provided by ``make_tempdir``. The path is yielded after the file has been
    written and closed, while the ``make_tempdir`` context is still open
    (presumably the directory is removed when that context exits).
    """
    with make_tempdir() as tmp_dir:
        target = Path(tmp_dir) / "string.txt"
        # write_text opens in text-write mode, writes, and closes the file.
        target.write_text(s, encoding="utf-8")
        yield target
|
||
|
||
def _examples_to_tokens(
    examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
    """Collect the token texts of each example's reference and predicted docs.

    Returns a ``(reference, predicted)`` pair of lists; entry ``i`` of each
    list holds the ``.text`` of every token in the corresponding doc of the
    ``i``-th example. ``examples`` is iterated exactly once, so one-shot
    iterables such as generators are supported.
    """
    # Single pass: gather both sides per example, then split into two lists.
    token_pairs = [
        (
            [token.text for token in example.reference],
            [token.text for token in example.predicted],
        )
        for example in examples
    ]
    reference = [ref_tokens for ref_tokens, _ in token_pairs]
    predicted = [pred_tokens for _, pred_tokens in token_pairs]
    return reference, predicted
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters