From 354eff1e2b590bf12b195a1a7f4a04c7372ea48a Mon Sep 17 00:00:00 2001
From: Matt Robinson
Date: Thu, 23 Feb 2023 12:19:13 -0500
Subject: [PATCH] build(deps): automatically download `nltk` models when required (#246)

* code for downloading nltk packages
* don't run nltk make command in ci
* test for model downloads
* remove nltk install from docs
* update changelog and bump version
---
 .github/workflows/ci.yml               |  1 -
 CHANGELOG.md                           |  4 ++++
 Makefile                               |  1 -
 README.md                              |  4 ----
 docs/source/installing.rst             | 15 ---------------
 test_unstructured/nlp/test_tokenize.py | 19 +++++++++++++++++++
 unstructured/__version__.py            |  2 +-
 unstructured/nlp/tokenize.py           | 18 ++++++++++++++++++
 8 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2681aefc50..dd59cd6ad9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -103,7 +103,6 @@ jobs:
       - name: Test
         run: |
           source .venv/bin/activate
-          make install-nltk-models
           make install-detectron2
           sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice
           make test
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e1b360a8f7..338ddb528a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.14
+
+* Automatically install `nltk` models in the `tokenize` module.
+
 ## 0.4.13
 
 * Fixes unstructured-ingest cli.
diff --git a/Makefile b/Makefile
index e92aa25b61..4fb4e34d19 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,6 @@ install-huggingface:
 install-nltk-models:
 	python -c "import nltk; nltk.download('punkt')"
 	python -c "import nltk; nltk.download('averaged_perceptron_tagger')"
-	python -c "import nltk; nltk.download('words')"
 
 .PHONY: install-test
 install-test:
diff --git a/README.md b/README.md
index acb03118bc..13bed4b08e 100644
--- a/README.md
+++ b/README.md
@@ -62,10 +62,6 @@ installation.
   - `poppler-utils` (images and PDFs)
   - `tesseract-ocr` (images and PDFs)
   - `libreoffice` (MS Office docs)
-- Run the following to install NLTK dependencies. `unstructured` will handle this automatically
-  soon.
-  - `python -c "import nltk; nltk.download('punkt')"`
-  - `python -c "import nltk; nltk.download('averaged_perceptron_tagger')"`
 - If you are parsing PDFs, run the following to install the `detectron2` model, which
   `unstructured` uses for layout detection:
   - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`
diff --git a/docs/source/installing.rst b/docs/source/installing.rst
index 8867b7184b..e96d3e8da0 100644
--- a/docs/source/installing.rst
+++ b/docs/source/installing.rst
@@ -16,10 +16,6 @@ installation.
 * ``tesseract-ocr`` (images and PDFs)
 * ``libreoffice`` (MS Office docs)
 
-* Run the following to install NLTK dependencies. ``unstructured`` will handle this automatically soon.
-  * ``python -c "import nltk; nltk.download('punkt')"``
-  * ``python -c "import nltk; nltk.download('averaged_perceptron_tagger')"``
-
 * If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
   * ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"``
 
@@ -141,17 +137,6 @@ If you are on Windows using ``conda``, run:
 
 	$ conda install -c conda-forge libmagic
-
-=================
-NLTK Dependencies
-=================
-
-The `NLTK <https://www.nltk.org/>`_ library is used for word and sentence tokenziation and
-part of speech (POS) tagging.
-Tokenization and POS tagging help to identify sections of
-narrative text within a document and are used across parsing families. The ``make install``
-command downloads the ``punkt`` and ``averaged_perceptron_tagger`` depdenencies from ``nltk``.
-If they are not already installed, you can install them with ``make install-nltk``.
 
 ======================
 XML/HTML Depenedencies
 ======================
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
index 42ccde3cda..11cee6e014 100644
--- a/test_unstructured/nlp/test_tokenize.py
+++ b/test_unstructured/nlp/test_tokenize.py
@@ -1,10 +1,29 @@
 from typing import List, Tuple
+from unittest.mock import patch
+
+import nltk
 
 import unstructured.nlp.tokenize as tokenize
 
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 
 
+def test_nltk_packages_download_if_not_present():
+    with patch.object(nltk, "find", side_effect=LookupError):
+        with patch.object(nltk, "download") as mock_download:
+            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")
+
+    mock_download.assert_called_with("fake_package")
+
+
+def test_nltk_packages_do_not_download_if_present():
+    with patch.object(nltk, "find"):
+        with patch.object(nltk, "download") as mock_download:
+            tokenize._download_nltk_package_if_not_present("fake_package", "tokenizers")
+
+    mock_download.assert_not_called()
+
+
 def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
     pos_tags: List[Tuple[str, str]] = list()
     for token in tokens:
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 9110917885..081b35f439 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.13" # pragma: no cover
+__version__ = "0.4.14" # pragma: no cover
diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
index 40f2ad34cc..64a9bd169f 100644
--- a/unstructured/nlp/tokenize.py
+++ b/unstructured/nlp/tokenize.py
@@ -7,6 +7,7 @@
 else:
     from typing import Final
 
+import nltk
 from nltk import (
     pos_tag as _pos_tag,
     sent_tokenize as _sent_tokenize,
@@ -16,6 +17,23 @@
 
 CACHE_MAX_SIZE: Final[int] = 128
 
+def _download_nltk_package_if_not_present(package_name: str, package_category: str):
+    """If the required nltk package is not present, download it."""
+    try:
+        nltk.find(f"{package_category}/{package_name}")
+    except LookupError:
+        nltk.download(package_name)
+
+
+NLTK_PACKAGES = [
+    ("tokenizers", "punkt"),
+    ("taggers", "averaged_perceptron_tagger"),
+]
+
+for package_category, package_name in NLTK_PACKAGES:
+    _download_nltk_package_if_not_present(package_name, package_category)
+
+
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
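
For reference, here is a minimal standalone sketch of the check-then-download pattern this patch adds to unstructured/nlp/tokenize.py. It is written against nltk.data.find, the documented NLTK lookup helper (the patch itself uses the top-level nltk.find alias); the helper name ensure_nltk_package is hypothetical and not part of the patch.

    import nltk

    def ensure_nltk_package(package_name: str, package_category: str) -> None:
        """Download an NLTK package only if nltk cannot find it on disk."""
        try:
            # nltk.data.find raises LookupError when the resource is absent.
            nltk.data.find(f"{package_category}/{package_name}")
        except LookupError:
            nltk.download(package_name)

    # The same (category, name) pairs the patch fetches at import time.
    ensure_nltk_package("punkt", "tokenizers")
    ensure_nltk_package("averaged_perceptron_tagger", "taggers")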
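
And a sketch of the intended end-user effect, assuming a fresh environment where the models have never been downloaded: importing the tokenize module now fetches punkt and averaged_perceptron_tagger on demand, so the wrapper below works with no manual nltk.download(...) step.

    # The import runs _download_nltk_package_if_not_present for each entry
    # in NLTK_PACKAGES before any tokenization happens.
    from unstructured.nlp.tokenize import sent_tokenize

    print(sent_tokenize("The fox jumped over the fence. Then it ran away."))
    # ['The fox jumped over the fence.', 'Then it ran away.']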