piskvorky · mpenkov · May 5, 2019 · Apr 29, 2019 · Apr 30, 2019 · Apr 30, 2019
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -70,7 +70,7 @@
 except ImportError:
     from Queue import Queue  # noqa:F401
 
-from collections import namedtuple, defaultdict
+from collections import namedtuple, defaultdict, Iterable
 from timeit import default_timer
 
 from numpy import zeros, float32 as REAL, empty, ones, \
@@ -794,6 +794,23 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
 
         """
         kwargs = {}
+
+        # Check if both documents and corpus_file are None
+        if corpus_file is None and documents is None:
+            raise TypeError("Either one of corpus_file or documents value must be provided.")
-            raise TypeError("Either one of corpus_file or documents value must be provided.")
+            raise TypeError("Either one of corpus_file or documents value must be provided")
-            raise TypeError("Either one of corpus_file or documents value must be provided.")
+            raise TypeError("Either one of corpus_file or documents value must be provided")
+
+        # Check if both documents and corpus_file are not None
+        if corpus_file is not None and documents is not None:
+            raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.")
-            raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.")
+            raise TypeError("Both corpus_file and documents may not be provided at the same time")
-            raise TypeError("Instead provide value to either of corpus_file or documents parameter but not both.")
+            raise TypeError("Both corpus_file and documents may not be provided at the same time")
+
+        # Check if corpus_file is a path to an existing file
+        if documents is None and not os.path.isfile(corpus_file):
+            raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file)
-            raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file)
+            raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead." % corpus_file)
-            raise TypeError("Parameter corpus_file must be a valid path to a file, got %s instead." % corpus_file)
+            raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead." % corpus_file)
+
+        # Check if documents is iterable
+        if documents is not None and not isinstance(documents, Iterable):
+            raise TypeError("documents must be an iterable of list, got %s instead." % documents)
+
         if corpus_file is not None:
             # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file)
             offsets, start_doctags = self._get_offsets_and_start_doctags_for_corpusfile(corpus_file, self.workers)

diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
@@ -173,6 +173,22 @@ def testLoadOldModel(self):
             sims_to_infer = loaded_model.docvecs.most_similar([doc0_inferred], topn=len(loaded_model.docvecs))
             self.assertTrue(sims_to_infer)
 
+    def testDoc2vecTrainParameters(self):
+        """Check that `train` raises TypeError for invalid `documents` / `corpus_file` arguments."""
+        model = doc2vec.Doc2Vec(vector_size=50)
+        model.build_vocab(documents=list_corpus)
+
+        # a corpus_file that is not a string (here an int) must be rejected
+        self.assertRaises(TypeError, model.train, corpus_file=11111)
+        # a documents value that is not an iterable (here an int) must be rejected
+        self.assertRaises(TypeError, model.train, documents=11111)
+        # passing both documents and corpus_file at the same time must be rejected
+        self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test')
+        # leaving both documents and corpus_file as None must be rejected
+        self.assertRaises(TypeError, model.train, documents=None, corpus_file=None)
+        # passing the `sentences` corpus object as corpus_file instead of a path must be rejected
+        self.assertRaises(TypeError, model.train, corpus_file=sentences)
+
     @unittest.skipIf(os.name == 'nt', "See another test for Windows below")
     def test_get_offsets_and_start_doctags(self):
         # Each line takes 6 bytes (including '\n' character)
@@ -387,7 +403,7 @@ def model_sanity(self, model, keep_training=True):
             tmpf = get_tmpfile('gensim_doc2vec.tst')
             model.save(tmpf)
             loaded = doc2vec.Doc2Vec.load(tmpf)
-            loaded.train(sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs)
+            loaded.train(documents=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs)
 
     def test_training(self):
         """Test doc2vec training."""