Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add warning when string is used as argument to infer_vector() #2347

Merged
merged 5 commits into from
Jan 23, 2019
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions gensim/models/base_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
workers : int, optional
Number of working threads, used for multiprocessing.
vector_size : int, optional
Expand Down Expand Up @@ -754,7 +754,7 @@ def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100,
if corpus_file is not None and not isinstance(corpus_file, string_types):
raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(sentences, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.")

self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
Expand Down
38 changes: 21 additions & 17 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,13 +482,14 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
----------
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Input corpus, can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
left uninitialized -- use if you plan to initialize it in some other way.
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them). Documents' tags are assigned automatically
and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
Documents' tags are assigned automatically and are equal to the line number, as in
:class:`~gensim.models.doc2vec.TaggedLineDocument`.
dm : {1,0}, optional
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.
Expand Down Expand Up @@ -616,7 +617,7 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo
if corpus_file is not None and not isinstance(corpus_file, string_types):
raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(documents, GeneratorType):
raise TypeError("You can't pass a generator as the documents argument. Try an iterator.")
raise TypeError("You can't pass a generator as the documents argument. Try a sequence.")
self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule)
self.train(
documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count,
Expand Down Expand Up @@ -743,8 +744,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
"""Update the model's neural weights.

To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
progress-percentage logging, either `total_examples` (count of documents) or `total_words` (count of
raw words in documents) **MUST** be provided. If `documents` is the same corpus
that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier,
you can simply use `total_examples=self.corpus_count`.

Expand All @@ -757,15 +758,15 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
----------
documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional
Can be simply a list of elements, but for larger corpora, consider an iterable that streams
the documents directly from disk/network. If you don't supply `documents`, the model is
the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is
left uninitialized -- use if you plan to initialize it in some other way.
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
and are equal to the line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
total_examples : int, optional
Count of sentences.
Count of documents.
total_words : int, optional
Count of raw words in documents.
epochs : int, optional
Expand All @@ -783,7 +784,7 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor
the alpha learning-rate yourself (not recommended).
word_count : int, optional
Count of words already trained. Set this to 0 for the usual
case of training on all words in sentences.
case of training on all words in documents.
queue_factor : int, optional
Multiplier for size of queue (number of workers * queue_factor).
report_delay : float, optional
Expand Down Expand Up @@ -906,6 +907,9 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps
The inferred paragraph vector for the new document.

"""
if isinstance(doc_words, string_types):
raise ValueError("You must pass a list of strings, not a single string.")
piskvorky marked this conversation as resolved.
Show resolved Hide resolved

alpha = alpha or self.alpha
min_alpha = min_alpha or self.min_alpha
epochs = epochs or steps or self.epochs
Expand Down Expand Up @@ -1131,7 +1135,7 @@ def estimate_memory(self, vocab_size=None, report=None):

def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False,
trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
"""Build vocabulary from a sequence of documents (can be a once-only generator stream).

Parameters
----------
Expand All @@ -1141,11 +1145,11 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p
See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument`
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically
and are equal to a line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
If true, the new words in `documents` will be added to model's vocab.
progress_per : int
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool
Expand Down Expand Up @@ -1223,7 +1227,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
len(raw_vocab), sum(itervalues(raw_vocab))
)

# Since no sentences are provided, this is to control the corpus_count
# Since no documents are provided, this is to control the corpus_count
self.corpus_count = corpus_count or 0
self.vocabulary.raw_vocab = raw_vocab

Expand Down Expand Up @@ -1337,7 +1341,7 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe
The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster).
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
You may use this argument instead of `documents` to get performance boost. Only one of `documents` or
`corpus_file` arguments need to be passed (not both of them).
docvecs : list of :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
The vector representations of the documents in our corpus. Each of them has a size == `vector_size`.
Expand Down Expand Up @@ -1506,7 +1510,7 @@ def __iter__(self):


class TaggedLineDocument(object):
"""Iterate over a file that contains sentences: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.
"""Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object.

Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
min_count : int, optional
The model ignores all words with total frequency lower than this.
size : int, optional
Expand Down
2 changes: 1 addition & 1 deletion gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
`corpus_file` arguments need to be passed (or none of them).
`corpus_file` arguments need to be passed (or none of them; in that case, the model is left uninitialized).
size : int, optional
Dimensionality of the word vectors.
window : int, optional
Expand Down