nltk.data.find argument needs to be a path, not just a filename

gunthercox · Jan 2, 2017 · 305e22b · 305e22b
1 parent 6c44daa
commit 305e22b
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 7 deletions.
diff --git a/chatterbot/chatterbot.py b/chatterbot/chatterbot.py
@@ -77,10 +77,10 @@ def initialize(self):
         from .utils import nltk_download_corpus
 
         # Download required NLTK corpora if they have not already been downloaded
-        nltk_download_corpus('stopwords')
-        nltk_download_corpus('wordnet')
-        nltk_download_corpus('punkt')
-        nltk_download_corpus('vader_lexicon')
+        nltk_download_corpus('corpora/stopwords')
+        nltk_download_corpus('corpora/wordnet')
+        nltk_download_corpus('tokenizers/punkt')
+        nltk_download_corpus('sentiment/vader_lexicon')
 
     def get_response(self, input_item, session_id=None):
         """

diff --git a/chatterbot/utils.py b/chatterbot/utils.py
@@ -146,7 +146,7 @@ def input_function():
     return user_input
 
 
-def nltk_download_corpus(corpus_name):
+def nltk_download_corpus(resource_path):
     """
     Download the specified NLTK corpus file
     unless it has already been downloaded.
@@ -155,13 +155,24 @@ def nltk_download_corpus(corpus_name):
     """
     from nltk.data import find
     from nltk import download
+    from os.path import split
 
     # Download the wordnet data only if it is not already downloaded
-    zip_file = '{}.zip'.format(corpus_name)
+    _, corpus_name = split(resource_path)
+
+    ## From http://www.nltk.org/api/nltk.html ##
+    # When using find() to locate a directory contained in a zipfile,
+    # the resource name must end with the forward slash character.
+    # Otherwise, find() will not locate the directory.
+    ####
+    # Helps when resource_path == 'sentiment/vader_lexicon'
+    if not resource_path.endswith('/'):
+        resource_path = resource_path + '/'
+
     downloaded = False
 
     try:
-        find(zip_file)
+        find(resource_path)
     except LookupError:
         download(corpus_name)
         downloaded = True