From 354eff1e2b590bf12b195a1a7f4a04c7372ea48a Mon Sep 17 00:00:00 2001
From: Matt Robinson
Date: Thu, 23 Feb 2023 12:19:13 -0500
Subject: [PATCH] build(deps): automatically download `nltk` models when required (#246)

* code for downloading nltk packages
* don't run nltk make command in ci
* test for model downloads
* remove nltk install from docs
* update changelog and bump version
---
 .github/workflows/ci.yml               |  1 -
 CHANGELOG.md                           |  4 ++++
 Makefile                               |  1 -
 README.md                              |  4 ----
 docs/source/installing.rst             | 15 ---------------
 test_unstructured/nlp/test_tokenize.py | 19 +++++++++++++++++++
 unstructured/__version__.py            |  2 +-
 unstructured/nlp/tokenize.py           | 18 ++++++++++++++++++
 8 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2681aefc50..dd59cd6ad9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -103,7 +103,6 @@ jobs:
       - name: Test
         run: |
           source .venv/bin/activate
-          make install-nltk-models
           make install-detectron2
           sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
           make test
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1b360a8f7..338ddb528a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.14
+
+* Automatically install `nltk` models in the `tokenize` module.
+
 ## 0.4.13
 
 * Fixes unstructured-ingest cli.
diff --git a/Makefile b/Makefile
index e92aa25b61..4fb4e34d19 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,6 @@ install-huggingface:
 install-nltk-models:
 	python -c "import nltk; nltk.download('punkt')"
 	python -c "import nltk; nltk.download('averaged_perceptron_tagger')"
-	python -c "import nltk; nltk.download('words')"
 
 .PHONY: install-test
 install-test:
diff --git a/README.md b/README.md
index acb03118bc..13bed4b08e 100644
--- a/README.md
+++ b/README.md
@@ -62,10 +62,6 @@ installation.
   - `poppler-utils` (images and PDFs)
   - `tesseract-ocr` (images and PDFs)
   - `libreoffice` (MS Office docs)
-- Run the following to install NLTK dependencies. `unstructured` will handle this automatically
-  soon.
-  - `python -c "import nltk; nltk.download('punkt')"`
-  - `python -c "import nltk; nltk.download('averaged_perceptron_tagger')"`
 - If you are parsing PDFs, run the following to install the `detectron2` model, which
   `unstructured` uses for layout detection:
   - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
diff --git a/docs/source/installing.rst b/docs/source/installing.rst
index 8867b7184b..e96d3e8da0 100644
--- a/docs/source/installing.rst
+++ b/docs/source/installing.rst
@@ -16,10 +16,6 @@ installation.
 * ``tesseract-ocr`` (images and PDFs)
 * ``libreoffice`` (MS Office docs)
 
-* Run the following to install NLTK dependencies. ``unstructured`` will handle this automatically soon.
-  * ``python -c "import nltk; nltk.download('punkt')"``
-  * ``python -c "import nltk; nltk.download('averaged_perceptron_tagger')"``
-
 * If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
   * ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"``
 
@@ -141,17 +137,6 @@ If you are on Windows using ``conda``, run:
 
 	$ conda install -c conda-forge libmagic
-
-=================
-NLTK Dependencies
-=================
-
-The `NLTK <https://www.nltk.org/>`_ library is used for word and sentence tokenziation and
-part of speech (POS) tagging.
-Tokenization and POS tagging help to identify sections of
-narrative text within a document and are used across parsing families. The ``make install``
-command downloads the ``punkt`` and ``averaged_perceptron_tagger`` depdenencies from ``nltk``.
-If they are not already installed, you can install them with ``make install-nltk``.
 
 ======================
 XML/HTML Depenedencies
 ======================
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
index 42ccde3cda..11cee6e014 100644
--- a/test_unstructured/nlp/test_tokenize.py
+++ b/test_unstructured/nlp/test_tokenize.py
@@ -1,10 +1,29 @@
 from typing import List, Tuple
+from unittest.mock import patch
+
+import nltk
 
 import unstructured.nlp.tokenize as tokenize
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
+def test_nltk_packages_download_if_not_present():
+    with patch.object(nltk, "find", side_effect=LookupError):
+        with patch.object(nltk, "download") as mock_download:
+            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")
+
+    mock_download.assert_called_with("fake_package")
+
+
+def test_nltk_packages_do_not_download_if_present():
+    with patch.object(nltk, "find"):
+        with patch.object(nltk, "download") as mock_download:
+            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")
+
+    mock_download.assert_not_called()
+
+
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
     pos_tags: List[Tuple[str, str]] = list()
     for token in tokens:
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 9110917885..081b35f439 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.13" # pragma: no cover
+__version__ = "0.4.14" # pragma: no cover
diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
index 40f2ad34cc..64a9bd169f 100644
--- a/unstructured/nlp/tokenize.py
+++ b/unstructured/nlp/tokenize.py
@@ -7,6 +7,7 @@
 else:
     from typing import Final
 
+import nltk
 from nltk import (
     pos_tag as _pos_tag,
     sent_tokenize as _sent_tokenize,
@@ -16,6 +17,23 @@
 
 CACHE_MAX_SIZE: Final[int] = 128
 
+def _download_nltk_package_if_not_present(package_name: str, package_category: str):
+    """If the required nltk package is not present, download it."""
+    try:
+        nltk.find(f"{package_category}/{package_name}")
+    except LookupError:
+        nltk.download(package_name)
+
+
+NLTK_PACKAGES = [
+    ("tokenizers", "punkt"),
+    ("taggers", "averaged_perceptron_tagger"),
+]
+
+for package_category, package_name in NLTK_PACKAGES:
+    _download_nltk_package_if_not_present(package_name, package_category)
+
+
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
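
For reference, here is a minimal standalone sketch of the check-then-download pattern this patch adds to unstructured/nlp/tokenize.py. It is written against nltk.data.find, the documented NLTK lookup helper (the patch itself uses the top-level nltk.find alias); the helper name ensure_nltk_package is hypothetical and not part of the patch.

    import nltk

    def ensure_nltk_package(package_name: str, package_category: str) -> None:
        """Download an NLTK package only if nltk cannot find it on disk."""
        try:
            # nltk.data.find raises LookupError when the resource is absent.
            nltk.data.find(f"{package_category}/{package_name}")
        except LookupError:
            nltk.download(package_name)

    # The same (category, name) pairs the patch fetches at import time.
    ensure_nltk_package("punkt", "tokenizers")
    ensure_nltk_package("averaged_perceptron_tagger", "taggers")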
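
And a sketch of the intended end-user effect, assuming a fresh environment where the models have never been downloaded: importing the tokenize module now fetches punkt and averaged_perceptron_tagger on demand, so the wrapper below works with no manual nltk.download(...) step.

    # The import runs _download_nltk_package_if_not_present for each entry
    # in NLTK_PACKAGES before any tokenization happens.
    from unstructured.nlp.tokenize import sent_tokenize

    print(sent_tokenize("The fox jumped over the fence. Then it ran away."))
    # ['The fox jumped over the fence.', 'Then it ran away.']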