piskvorky · piskvorky · Jan 23, 2019 · Jan 21, 2019 · Jan 21, 2019 · Jan 22, 2019
diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
@@ -660,7 +660,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
-            `corpus_file` arguments need to be passed (or none of them).
+            `corpus_file` arguments need to be passed (or none of them, in which case the model is left uninitialized).
         workers : int, optional
             Number of working threads, used for multiprocessing.
         vector_size : int, optional
@@ -754,7 +754,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
             if corpus_file is not None and not isinstance(corpus_file, string_types):
                 raise TypeError("You must pass string as the corpus_file argument.")
             elif isinstance(sentences, GeneratorType):
-                raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
+                raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.")
 
             self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
             self.train(

diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
@@ -482,13 +482,14 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
         ----------
         documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
             Input corpus, can be simply a list of elements, but for larger corpora,consider an iterable that streams
-            the documents directly from disk/network. If you don't supply `documents`, the model is
+            the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
             left uninitialized -- use if you plan to initialize it in some other way.
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
-            You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
-            `corpus_file` arguments need to be passed (or none of them). Documents' tags are assigned automatically
-            and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
+            You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
+            `corpus_file` arguments need to be passed (or none of them, in which case the model is left uninitialized).
+            Documents' tags are assigned automatically and are equal to line number, as in
+            :class:`~gensim.models.doc2vec.TaggedLineDocument`.
         dm : {1,0}, optional
             Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
             Otherwise, `distributed bag of words` (PV-DBOW) is employed.
@@ -616,7 +617,7 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
             if corpus_file is not None and not isinstance(corpus_file, string_types):
                 raise TypeError("You must pass string as the corpus_file argument.")
             elif isinstance(documents, GeneratorType):
-                raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
+                raise TypeError("You can't pass a generator as the documents argument. Try a sequence.")
             self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule)
             self.train(
                 documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count,
@@ -743,8 +744,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
         """Update the model's neural weights.
 
         To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
-        progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
-        raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
+        progress-percentage logging, either `total_examples` (count of documents) or `total_words` (count of
+        raw words in documents) **MUST** be provided. If `documents` is the same corpus
         that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier,
         you can simply use `total_examples=self.corpus_count`.
 
@@ -757,15 +758,15 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
         ----------
         documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
             Can be simply a list of elements, but for larger corpora,consider an iterable that streams
-            the documents directly from disk/network. If you don't supply `documents`, the model is
+            the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
             left uninitialized -- use if you plan to initialize it in some other way.
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
-            You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
+            You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
             `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
             and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
         total_examples : int, optional
-            Count of sentences.
+            Count of documents.
         total_words : int, optional
             Count of raw words in documents.
         epochs : int, optional
@@ -783,7 +784,7 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
             the alpha learning-rate yourself (not recommended).
         word_count : int, optional
             Count of words already trained. Set this to 0 for the usual
-            case of training on all words in sentences.
+            case of training on all words in documents.
         queue_factor : int, optional
             Multiplier for size of queue (number of workers * queue_factor).
         report_delay : float, optional
@@ -906,6 +907,9 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps
             The inferred paragraph vector for the new document.
 
         """
+        if isinstance(doc_words, string_types):
+            raise ValueError("You must pass a list of strings, not a single string.")
+
         alpha = alpha or self.alpha
         min_alpha = min_alpha or self.min_alpha
         epochs = epochs or steps or self.epochs
@@ -1131,7 +1135,7 @@ def estimate_memory(self, vocab_size=None, report=None):
 
     def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False,
                     trim_rule=None, **kwargs):
-        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
+        """Build vocabulary from a sequence of documents (can be a once-only generator stream).
 
         Parameters
         ----------
@@ -1141,11 +1145,11 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p
             See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
-            You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
+            You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
             `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
             and are equal to a line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
         update : bool
-            If true, the new words in `sentences` will be added to model's vocab.
+            If true, the new words in `documents` will be added to model's vocab.
         progress_per : int
             Indicates how many words to process before showing/updating the progress.
         keep_raw_vocab : bool
@@ -1223,7 +1227,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
             len(raw_vocab), sum(itervalues(raw_vocab))
         )
 
-        # Since no sentences are provided, this is to control the corpus_count
+        # Since no documents are provided, this is to control the corpus_count
         self.corpus_count = corpus_count or 0
         self.vocabulary.raw_vocab = raw_vocab
 
@@ -1337,7 +1341,7 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe
             The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
-            You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
+            You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
             `corpus_file` arguments need to be passed (not both of them).
         docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
             The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
@@ -1506,7 +1510,7 @@ def __iter__(self):
 
 
 class TaggedLineDocument(object):
-    """Iterate over a file that contains sentences: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
+    """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
 
     Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
     automatically from the document line number (each document gets a unique integer tag).

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -290,7 +290,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
-            `corpus_file` arguments need to be passed (or none of them).
+            `corpus_file` arguments need to be passed (or none of them, in which case the model is left uninitialized).
         min_count : int, optional
             The model ignores all words with total frequency lower than this.
         size : int, optional

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
@@ -677,7 +677,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind
         corpus_file : str, optional
             Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
-            `corpus_file` arguments need to be passed (or none of them).
+            `corpus_file` arguments need to be passed (or none of them, in which case the model is left uninitialized).
         size : int, optional
             Dimensionality of the word vectors.
         window : int, optional