diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py
index 80fc273bec..cb04f80aef 100644
--- a/unstructured/nlp/tokenize.py
+++ b/unstructured/nlp/tokenize.py
@@ -16,8 +16,13 @@ def download_nltk_packages():
-    nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-    nltk.download("punkt_tab", quiet=True)
+    required_packages = [
+        ("punkt_tab", "tokenizers"),
+        ("averaged_perceptron_tagger_eng", "taggers"),
+    ]
+    for package_name, category in required_packages:
+        if not check_for_nltk_package(package_name, category):
+            nltk.download(package_name, quiet=True)
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
@@ -46,6 +51,7 @@ def validate_nltk_assets():
 # Validate NLTK assets at import time
+download_nltk_packages()
 validate_nltk_assets()