build(deps): automatically download nltk models when required (#246)
* code for downloading nltk packages

* don't run nltk make command in ci

* test for model downloads

* remove nltk install from docs

* update changelog and bump version
MthwRobinson authored Feb 23, 2023
1 parent 83f0454 commit 354eff1
Showing 8 changed files with 42 additions and 22 deletions.
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
@@ -103,7 +103,6 @@ jobs:
- name: Test
run: |
source .venv/bin/activate
make install-nltk-models
make install-detectron2
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
make test
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
## 0.4.14

* Automatically install `nltk` models in the `tokenize` module.

## 0.4.13

* Fixes unstructured-ingest cli.
1 change: 0 additions & 1 deletion Makefile
@@ -36,7 +36,6 @@ install-huggingface:
install-nltk-models:
python -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download('averaged_perceptron_tagger')"
python -c "import nltk; nltk.download('words')"

.PHONY: install-test
install-test:
4 changes: 0 additions & 4 deletions README.md
@@ -62,10 +62,6 @@ installation.
- `poppler-utils` (images and PDFs)
- `tesseract-ocr` (images and PDFs)
- `libreoffice` (MS Office docs)
- Run the following to install NLTK dependencies. `unstructured` will handle this automatically
soon.
- `python -c "import nltk; nltk.download('punkt')"`
- `python -c "import nltk; nltk.download('averaged_perceptron_tagger')"`
- If you are parsing PDFs, run the following to install the `detectron2` model, which
`unstructured` uses for layout detection:
- `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
15 changes: 0 additions & 15 deletions docs/source/installing.rst
@@ -16,10 +16,6 @@ installation.
* ``tesseract-ocr`` (images and PDFs)
* ``libreoffice`` (MS Office docs)

* Run the following to install NLTK dependencies. ``unstructured`` will handle this automatically soon.
* ``python -c "import nltk; nltk.download('punkt')"``
* ``python -c "import nltk; nltk.download('averaged_perceptron_tagger')"``

* If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"``

@@ -141,17 +137,6 @@ If you are on Windows using ``conda``, run:
$ conda install -c conda-forge libmagic
=================
NLTK Dependencies
=================

The `NLTK <https://www.nltk.org/>`_ library is used for word and sentence tokenization and
part of speech (POS) tagging. Tokenization and POS tagging help to identify sections of
narrative text within a document and are used across parsing families. The ``make install``
command downloads the ``punkt`` and ``averaged_perceptron_tagger`` dependencies from ``nltk``.
If they are not already installed, you can install them with ``make install-nltk-models``.

======================
XML/HTML Dependencies
======================
19 changes: 19 additions & 0 deletions test_unstructured/nlp/test_tokenize.py
@@ -1,10 +1,29 @@
from typing import List, Tuple
from unittest.mock import patch

import nltk

import unstructured.nlp.tokenize as tokenize

from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize


def test_nltk_packages_download_if_not_present():
with patch.object(nltk, "find", side_effect=LookupError):
with patch.object(nltk, "download") as mock_download:
tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

mock_download.assert_called_with("fake_package")


def test_nltk_packages_do_not_download_if_present():
with patch.object(nltk, "find"):
with patch.object(nltk, "download") as mock_download:
tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")

mock_download.assert_not_called()


def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
pos_tags: List[Tuple[str, str]] = list()
for token in tokens:
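The new tests patch `nltk.find` so that a `LookupError` simulates a missing package, then assert on whether `nltk.download` was invoked. The same pattern can be exercised without `nltk` installed; this is a hedged, stand-alone sketch in which `FakeNltk` and `download_if_missing` are hypothetical stand-ins, not names from the project:

```python
# Sketch of the test pattern above, using a stand-in object so it runs
# without nltk installed. FakeNltk / download_if_missing are hypothetical.
from unittest.mock import patch


class FakeNltk:
    """Stand-in exposing only the two calls the helper touches."""

    def find(self, path: str) -> None:
        pass  # pretend the resource exists

    def download(self, name: str) -> None:
        pass  # pretend to fetch the resource


fake_nltk = FakeNltk()


def download_if_missing(package_name: str, package_category: str) -> None:
    """Mirror of the helper under test: download only on LookupError."""
    try:
        fake_nltk.find(f"{package_category}/{package_name}")
    except LookupError:
        fake_nltk.download(package_name)


# Missing package: find raises LookupError, so download is called.
with patch.object(fake_nltk, "find", side_effect=LookupError):
    with patch.object(fake_nltk, "download") as mock_download:
        download_if_missing("punkt", "tokenizers")
mock_download.assert_called_with("punkt")

# Present package: find succeeds, so download is never called.
with patch.object(fake_nltk, "find"):
    with patch.object(fake_nltk, "download") as mock_download:
        download_if_missing("punkt", "tokenizers")
mock_download.assert_not_called()
```

`patch.object` restores the instance attribute on exit, so each `with` block is an isolated scenario, just as each test function is in the diff.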
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.4.13" # pragma: no cover
__version__ = "0.4.14" # pragma: no cover
18 changes: 18 additions & 0 deletions unstructured/nlp/tokenize.py
@@ -7,6 +7,7 @@
else:
from typing import Final

import nltk
from nltk import (
pos_tag as _pos_tag,
sent_tokenize as _sent_tokenize,
@@ -16,6 +17,23 @@
CACHE_MAX_SIZE: Final[int] = 128


def _download_nltk_package_if_not_present(package_name: str, package_category: str):
"""If the required nltk package is not present, download it."""
try:
nltk.find(f"{package_category}/{package_name}")
except LookupError:
nltk.download(package_name)


NLTK_PACKAGES = [
("tokenizers", "punkt"),
("taggers", "averaged_perceptron_tagger"),
]

for package_category, package_name in NLTK_PACKAGES:
_download_nltk_package_if_not_present(package_name, package_category)


@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
"""A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
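The hunk also shows the wrapper style used in the rest of `tokenize.py`: each NLTK call is wrapped in `functools.lru_cache` so that repeated tokenization of the same text is served from memory. Here is a minimal self-contained sketch of that caching pattern; the whitespace tokenizer is a stand-in for NLTK's tokenizers, while `CACHE_MAX_SIZE` takes its value from the diff above:

```python
# Sketch of the module's caching pattern: memoize a tokenizer with
# functools.lru_cache. A whitespace split stands in for nltk's tokenizer.
from functools import lru_cache
from typing import Final, List

CACHE_MAX_SIZE: Final[int] = 128  # value from the diff above


@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
    """Tokenize on whitespace; results are memoized per input string."""
    return text.split()


tokens = word_tokenize("hello unstructured world")
# A second call with the same text is answered from the cache.
word_tokenize("hello unstructured world")
info = word_tokenize.cache_info()  # hits=1, misses=1
```

Because strings are hashable and immutable they make safe cache keys, which is why the wrappers in the module accept `text: str` rather than token lists.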
