Skip to content

Commit

Permalink
Add spacy.PlainTextCorpusReader.v1 (#12122)
Browse files Browse the repository at this point in the history
* Add `spacy.PlainTextCorpusReader.v1`

This is a corpus reader that reads plain text corpora with the following
format:

- UTF-8 encoding
- One line per document.
- Blank lines are ignored.

It is useful for applications where we deal with very large corpora,
such as distillation, and don't want to deal with the space overhead of
serialized formats. Additionally, many large corpora already use such
a text format, keeping the necessary preprocessing to a minimum.

* Update spacy/training/corpus.py

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* docs: add version to `PlainTextCorpus`

* Add docstring to registry function

* Add plain text corpus tests

* Only strip newline/carriage return

* Add return type _string_to_tmp_file helper

* Use a temporary directory in place of file name

Different OS auto delete/sharing semantics are just wonky.

* This will be new in 3.5.1 (rather than 4)

* Test improvements from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
  • Loading branch information
danieldk and adrianeboyd authored Jan 26, 2023
1 parent a37117a commit 8d69874
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 1 deletion.
78 changes: 78 additions & 0 deletions spacy/tests/training/test_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import IO, Generator, Iterable, List, TextIO, Tuple
from contextlib import contextmanager
from pathlib import Path
import pytest
import tempfile

from spacy.lang.en import English
from spacy.training import Example, PlainTextCorpus
from spacy.util import make_tempdir

# Intentional newlines to check that they are skipped.
PLAIN_TEXT_DOC = """
This is a doc. It contains two sentences.
This is another doc.
A third doc.
"""

PLAIN_TEXT_DOC_TOKENIZED = [
[
"This",
"is",
"a",
"doc",
".",
"It",
"contains",
"two",
"sentences",
".",
],
["This", "is", "another", "doc", "."],
["A", "third", "doc", "."],
]


@pytest.mark.parametrize("min_length", [0, 5])
@pytest.mark.parametrize("max_length", [0, 5])
def test_plain_text_reader(min_length, max_length):
nlp = English()
with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
corpus = PlainTextCorpus(
file_path, min_length=min_length, max_length=max_length
)

check = [
doc
for doc in PLAIN_TEXT_DOC_TOKENIZED
if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
]
reference, predicted = _examples_to_tokens(corpus(nlp))

assert reference == check
assert predicted == check


@contextmanager
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
with make_tempdir() as d:
file_path = Path(d) / "string.txt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(s)
yield file_path


def _examples_to_tokens(
examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
reference = []
predicted = []

for eg in examples:
reference.append([t.text for t in eg.reference])
predicted.append([t.text for t in eg.predicted])

return reference, predicted
2 changes: 1 addition & 1 deletion spacy/training/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .corpus import Corpus, JsonlCorpus # noqa: F401
from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
Expand Down
71 changes: 71 additions & 0 deletions spacy/training/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False):
return srsly.read_json(path)


@util.registry.readers("spacy.PlainTextCorpus.v1")
def create_plain_text_reader(
path: Optional[Path],
min_length: int = 0,
max_length: int = 0,
) -> Callable[["Language"], Iterable[Doc]]:
"""Iterate Example objects from a file or directory of plain text
UTF-8 files with one line per doc.
path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
will be skipped. Defaults to 0, which indicates no limit.
max_length (int): Maximum document length (in tokens). Longer documents will
be skipped. Defaults to 0, which indicates no limit.
DOCS: https://spacy.io/api/corpus#plaintextcorpus
"""
if path is None:
raise ValueError(Errors.E913)
return PlainTextCorpus(path, min_length=min_length, max_length=max_length)


def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(file_type):
Expand Down Expand Up @@ -257,3 +279,52 @@ def __call__(self, nlp: "Language") -> Iterator[Example]:
# We don't *need* an example here, but it seems nice to
# make it match the Corpus signature.
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))


class PlainTextCorpus:
"""Iterate Example objects from a file or directory of plain text
UTF-8 files with one line per doc.
path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
will be skipped. Defaults to 0, which indicates no limit.
max_length (int): Maximum document length (in tokens). Longer documents will
be skipped. Defaults to 0, which indicates no limit.
DOCS: https://spacy.io/api/corpus#plaintextcorpus
"""

file_type = "txt"

def __init__(
self,
path: Optional[Union[str, Path]],
*,
min_length: int = 0,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.min_length = min_length
self.max_length = max_length

def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
nlp (Language): The current nlp object.
YIELDS (Example): The example objects.
DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
"""
for loc in walk_corpus(self.path, ".txt"):
with open(loc, encoding="utf-8") as f:
for text in f:
text = text.rstrip("\r\n")
if len(text):
doc = nlp.make_doc(text)
if self.min_length >= 1 and len(doc) < self.min_length:
continue
elif self.max_length >= 1 and len(doc) > self.max_length:
continue
# We don't *need* an example here, but it seems nice to
# make it match the Corpus signature.
yield Example(doc, doc.copy())
65 changes: 65 additions & 0 deletions website/docs/api/corpus.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,68 @@ Yield examples from the data.
| ---------- | -------------------------------------- |
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |
## PlainTextCorpus {id="plaintextcorpus",tag="class",version="3.5.1"}
Iterate over documents from a plain text file. Can be used to read the raw text
corpus for language model
[pretraining](/usage/embeddings-transformers#pretraining). The expected file
format is:
- UTF-8 encoding
- One document per line
- Blank lines are ignored.
```text {title="Example"}
Can I ask where you work now and what you do, and if you enjoy it?
They may just pull out of the Seattle market completely, at least until they have autonomous vehicles.
My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in.
```
### PlainTextCorpus.\_\_init\_\_ {id="plaintextcorpus-init",tag="method"}
Initialize the reader.
> #### Example
>
> ```python
> from spacy.training import PlainTextCorpus
>
> corpus = PlainTextCorpus("./data/docs.txt")
> ```
>
> ```ini
> ### Example config
> [corpora.pretrain]
> @readers = "spacy.PlainTextCorpus.v1"
> path = "corpus/raw_text.txt"
> min_length = 0
> max_length = 0
> ```
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
| `path` | The directory or filename to read from. Expects newline-delimited documents in UTF8 format. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
### PlainTextCorpus.\_\_call\_\_ {id="plaintextcorpus-call",tag="method"}
Yield examples from the data.
> #### Example
>
> ```python
> from spacy.training import PlainTextCorpus
> import spacy
>
> corpus = PlainTextCorpus("./docs.txt")
> nlp = spacy.blank("en")
> data = corpus(nlp)
> ```
| Name | Description |
| ---------- | -------------------------------------- |
| `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~ |

0 comments on commit 8d69874

Please sign in to comment.