From e0bfb3f7ea5aca8b82e00ac6164e8027f4cfd497 Mon Sep 17 00:00:00 2001 From: Dmitry Persiyanov Date: Tue, 8 Jan 2019 06:09:27 +0300 Subject: [PATCH] Update `Doc2Vec` documentation: how tags are assigned in `corpus_file` mode (#2320) * add clarification regarding tags of documents in corpus_file mode for Doc2Vec * based on -> equal to --- gensim/models/doc2vec.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index a4fb34d1fa..3b5b9c960c 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -487,7 +487,8 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo corpus_file : str, optional Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (or none of them). + `corpus_file` arguments need to be passed (or none of them). Documents' tags are assigned automatically + and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. dm : {1,0}, optional Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. @@ -761,7 +762,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor corpus_file : str, optional Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). + `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically + and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. total_examples : int, optional Count of sentences. 
total_words : int, optional @@ -1140,7 +1142,8 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p corpus_file : str, optional Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). + `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically + and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. update : bool If true, the new words in `sentences` will be added to model's vocab. progress_per : int