Skip to content

Commit

Permalink
Minor corrections and improvements in Any*Vec docstrings (#2345)
Browse files Browse the repository at this point in the history
  • Loading branch information
tobycheese authored and menshikh-iv committed Jan 22, 2019
1 parent 5c5b97b commit 9819ce8
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 21 deletions.
4 changes: 2 additions & 2 deletions gensim/models/base_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
workers : int, optional
Number of working threads, used for multiprocessing.
vector_size : int, optional
Expand Down Expand Up @@ -754,7 +754,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
if corpus_file is not None and not isinstance(corpus_file, string_types):
raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(sentences, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.")

self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
Expand Down
35 changes: 18 additions & 17 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,14 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
----------
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Input corpus, can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
left uninitialized -- use if you plan to initialize it in some other way.
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them). Documents' tags are assigned automatically
and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
Documents' tags are assigned automatically and are equal to the line number, as in
:class:`~gensim.models.doc2vec.TaggedLineDocument`.
dm : {1,0}, optional
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.
Expand Down Expand Up @@ -616,7 +617,7 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
if corpus_file is not None and not isinstance(corpus_file, string_types):
raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(documents, GeneratorType):
raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
raise TypeError("You can't pass a generator as the documents argument. Try a sequence.")
self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count,
Expand Down Expand Up @@ -743,8 +744,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
"""Update the model's neural weights.
To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
progress-percentage logging, either `total_examples` (count of documents) or `total_words` (count of
raw words in documents) **MUST** be provided. If `documents` is the same corpus
that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier,
you can simply use `total_examples=self.corpus_count`.
Expand All @@ -757,15 +758,15 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
----------
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
left uninitialized -- use if you plan to initialize it in some other way.
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
total_examples : int, optional
Count of sentences.
Count of documents.
total_words : int, optional
Count of raw words in documents.
epochs : int, optional
Expand All @@ -783,7 +784,7 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
the alpha learning-rate yourself (not recommended).
word_count : int, optional
Count of words already trained. Set this to 0 for the usual
case of training on all words in sentences.
case of training on all words in documents.
queue_factor : int, optional
Multiplier for size of queue (number of workers * queue_factor).
report_delay : float, optional
Expand Down Expand Up @@ -1131,7 +1132,7 @@ def estimate_memory(self, vocab_size=None, report=None):

def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False,
trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
"""Build vocabulary from a sequence of documents (can be a once-only generator stream).
Parameters
----------
Expand All @@ -1141,11 +1142,11 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
and are equal to a line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
If true, the new words in `documents` will be added to model's vocab.
progress_per : int
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool
Expand Down Expand Up @@ -1223,7 +1224,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
len(raw_vocab), sum(itervalues(raw_vocab))
)

# Since no sentences are provided, this is to control the corpus_count
# Since no documents are provided, this is to control the corpus_count
self.corpus_count = corpus_count or 0
self.vocabulary.raw_vocab = raw_vocab

Expand Down Expand Up @@ -1337,7 +1338,7 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe
The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them).
docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
Expand Down Expand Up @@ -1506,7 +1507,7 @@ def __iter__(self):


class TaggedLineDocument(object):
"""Iterate over a file that contains sentences: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
min_count : int, optional
The model ignores all words with total frequency lower than this.
size : int, optional
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
size : int, optional
Dimensionality of the word vectors.
window : int, optional
Expand Down

0 comments on commit 9819ce8

Please sign in to comment.