From c7942ad06353211b76313271c1bd06ed426b8f25 Mon Sep 17 00:00:00 2001
From: christinestraub
Date: Mon, 6 Jan 2025 11:14:37 -0800
Subject: [PATCH] feat:update download_nltk_packages()

---
 unstructured/nlp/tokenize.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
index 80fc273bec..cb04f80aef 100644
--- a/unstructured/nlp/tokenize.py
+++ b/unstructured/nlp/tokenize.py
@@ -16,8 +16,13 @@
 
 
 def download_nltk_packages():
-    nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-    nltk.download("punkt_tab", quiet=True)
+    required_packages = [
+        ("punkt_tab", "tokenizers"),
+        ("averaged_perceptron_tagger_eng", "taggers"),
+    ]
+    for package_name, category in required_packages:
+        if not check_for_nltk_package(package_name, category):
+            nltk.download(package_name, quiet=True)
 
 
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
@@ -46,6 +51,7 @@ def validate_nltk_assets():
 
 
 # Validate NLTK assets at import time
+download_nltk_packages()
 validate_nltk_assets()
 
 