Skip to content

Commit

Permalink
feat:update download_nltk_packages()
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub committed Jan 6, 2025
1 parent 8b2f950 commit c7942ad
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions unstructured/nlp/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,13 @@


def download_nltk_packages():
    """Download the NLTK data packages this module requires, if needed.

    Each required package is paired with the NLTK data category that is
    passed to ``check_for_nltk_package``. A package is downloaded (quietly)
    only when that check returns a falsy value -- presumably meaning the
    package is not yet available locally; the helper's body is not visible
    here, so confirm against its implementation.
    """
    # (package name, NLTK data category) pairs.
    required_packages = [
        ("punkt_tab", "tokenizers"),
        ("averaged_perceptron_tagger_eng", "taggers"),
    ]
    for package_name, category in required_packages:
        # Skip the download call when the package check already passes.
        if not check_for_nltk_package(package_name, category):
            nltk.download(package_name, quiet=True)


def check_for_nltk_package(package_name: str, package_category: str) -> bool:
Expand Down Expand Up @@ -46,6 +51,7 @@ def validate_nltk_assets():


# Validate NLTK assets at import time.
# Order matters: the download step runs first so that the validation step
# sees whatever packages it fetched.
download_nltk_packages()
validate_nltk_assets()


Expand Down

0 comments on commit c7942ad

Please sign in to comment.