From a3b9cde37f62d2dc235b419be4a88c52820c9f11 Mon Sep 17 00:00:00 2001
From: Gordon Mohr
Date: Wed, 8 Feb 2017 21:59:04 -0800
Subject: [PATCH 1/2] require explicit corpus size, epochs for train()

---
 gensim/models/word2vec.py | 46 +++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8a62d02588..91e5e1de16 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -448,7 +448,8 @@ def __init__(
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
             self.build_vocab(sentences, trim_rule=trim_rule)
-            self.train(sentences)
+            self.train(sentences, total_examples=self.corpus_count, epochs=self.iter,
+                       start_alpha=self.alpha, end_alpha=self.min_alpha)
 
     def initialize_word_vectors(self):
         self.wv = KeyedVectors()
@@ -729,16 +730,23 @@ def _raw_word_count(self, job):
         """Return the number of words in a given job."""
         return sum(len(sentence) for sentence in job)
 
-    def train(self, sentences, total_words=None, word_count=0,
-              total_examples=None, queue_factor=2, report_delay=1.0):
+    def train(self, sentences, total_examples=None, total_words=None,
+              epochs=None, start_alpha=None, end_alpha=None,
+              word_count=0,
+              queue_factor=2, report_delay=1.0):
         """
         Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
         For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
 
-        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
-        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
-        sentences are the same as those that were used to initially build the vocabulary.
+        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
+        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
+        raw words in sentences) MUST be provided. (If the corpus is the same as was provided to
+        `build_vocab()`, the count of examples in that corpus will be available in the model's
+        `corpus_count` property.)
+        To avoid common mistakes around the model's ability to do multiple training passes itself, an
+        explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()`
+        is only called once, the model's cached `iter` value should be supplied as the `epochs` value.
""" if (self.model_trimmed_post_training): raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") @@ -770,18 +778,18 @@ def train(self, sentences, total_words=None, word_count=0, "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") if total_words is None and total_examples is None: - if self.corpus_count: - total_examples = self.corpus_count - logger.info("expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples) - else: - raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations") + raise ValueError("you must specify either total_examples or total_words, for proper alpha and progress calculations") + if epochs is None: + raise ValueError("you must specify an explict epochs count") + start_alpha = start_alpha or self.alpha + end_alpha = end_alpha or self.min_alpha job_tally = 0 - if self.iter > 1: - sentences = utils.RepeatCorpusNTimes(sentences, self.iter) - total_words = total_words and total_words * self.iter - total_examples = total_examples and total_examples * self.iter + if epochs > 1: + sentences = utils.RepeatCorpusNTimes(sentences, epochs) + total_words = total_words and total_words * epochs + total_examples = total_examples and total_examples * epochs def worker_loop(): """Train the model, lifting lists of sentences from the job_queue.""" @@ -803,7 +811,7 @@ def job_producer(): """Fill jobs queue using the input `sentences` iterator.""" job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 - next_alpha = self.alpha + next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: logger.warn("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha @@ -826,7 +834,7 @@ def job_producer(): job_queue.put((job_batch, next_alpha)) # update the learning rate for the next job - if self.min_alpha < next_alpha: + if end_alpha < next_alpha: if total_examples: # examples-based decay pushed_examples += len(job_batch) @@ -835,8 +843,8 @@ def job_producer(): # words-based decay pushed_words += self._raw_word_count(job_batch) progress = 1.0 * pushed_words / total_words - next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress - next_alpha = max(self.min_alpha, next_alpha) + next_alpha = start_alpha - (start_alpha - end_alpha) * progress + next_alpha = max(end_alpha, next_alpha) # add the sentence that didn't fit as the first item of a new job job_batch, batch_size = [sentence], sentence_length From 0eb9a6661ca094e089eb3bc35b7c5271c5fb3aae Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Wed, 8 Feb 2017 21:59:55 -0800 Subject: [PATCH 2/2] make all train() calls use explicit count, epochs --- gensim/models/doc2vec.py | 2 +- gensim/test/test_doc2vec.py | 4 ++-- gensim/test/test_word2vec.py | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 80ba234573..7b3b95672c 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -628,7 +628,7 @@ def __init__(self, documents=None, dm_mean=None, self.comment = comment if documents is not None: self.build_vocab(documents, trim_rule=trim_rule) - self.train(documents) + self.train(documents, total_examples=self.corpus_count, epochs=self.iter) @property def dm(self): diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index ad658250f5..57d04ae81a 100644 --- 
+++ b/gensim/test/test_doc2vec.py
@@ -185,7 +185,7 @@ def test_training(self):
         model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1)
         model.build_vocab(corpus)
         self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100))
-        model.train(corpus)
+        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
 
         self.model_sanity(model)
 
@@ -341,7 +341,7 @@ def testTrainWarning(self, l):
         model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, min_count=1, workers=8, size=5)
         model.build_vocab(sentences)
         for epoch in range(10):
-            model.train(sentences)
+            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
             model.alpha -= 0.002
             model.min_alpha = model.alpha
             if epoch == 5:
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 035765e8a0..e1db27d60e 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -104,12 +104,12 @@ def onlineSanity(self, model):
                 others.append(l)
         self.assertTrue(all(['terrorism' not in l for l in others]))
         model.build_vocab(others)
-        model.train(others)
+        model.train(others, total_examples=model.corpus_count, epochs=model.iter)
         self.assertFalse('terrorism' in model.wv.vocab)
         model.build_vocab(terro, update=True)
         self.assertTrue('terrorism' in model.wv.vocab)
         orig0 = np.copy(model.wv.syn0)
-        model.train(terro)
+        model.train(terro, total_examples=len(terro), epochs=model.iter)
         self.assertFalse(np.allclose(model.wv.syn0, orig0))
         sim = model.n_similarity(['war'], ['terrorism'])
         self.assertLess(0., sim)
@@ -349,7 +349,7 @@ def testTraining(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -385,7 +385,7 @@ def testLocking(self):
 
         # lock the vector in slot 0 against change
         model.syn0_lockf[0] = 0.0
 
-        model.train(corpus)
+        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
         self.assertFalse((unlocked1 == model.wv.syn0[1]).all())  # unlocked vector should vary
         self.assertTrue((locked0 == model.wv.syn0[0]).all())  # locked vector should not vary
@@ -414,7 +414,7 @@ def model_sanity(self, model, train=True):
         if train:
             model.build_vocab(list_corpus)
             orig0 = np.copy(model.wv.syn0[0])
-            model.train(list_corpus)
+            model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
             self.assertFalse((orig0 == model.wv.syn0[1]).all())  # vector should vary after training
         sims = model.most_similar('war', topn=len(model.index2word))
         t_rank = [word for word, score in sims].index('terrorism')
@@ -456,7 +456,7 @@ def testTrainingCbow(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -479,7 +479,7 @@ def testTrainingSgNegative(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -502,7 +502,7 @@ def testTrainingCbowNegative(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -521,7 +521,7 @@ def testSimilarities(self):
         # The model is trained using CBOW
         model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
         model.build_vocab(sentences)
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
 
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
@@ -612,7 +612,7 @@ def testTrainWarning(self, l):
         model = word2vec.Word2Vec(min_count=1)
         model.build_vocab(sentences)
         for epoch in range(10):
-            model.train(sentences)
+            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
             model.alpha -= 0.002
             model.min_alpha = model.alpha
             if epoch == 5:
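Below is a minimal usage sketch of the calling pattern these patches enforce. It is illustrative only: the corpus filename and the model parameters are placeholders, not part of the patches; `LineSentence` and `Word2Vec` are the existing gensim classes.

    from gensim.models.word2vec import LineSentence, Word2Vec

    # Hypothetical corpus: a text file with one whitespace-delimited sentence per line.
    sentences = LineSentence('my_corpus.txt')

    # Construct without `sentences`, so vocabulary scanning and training are explicit steps.
    model = Word2Vec(size=100, min_count=5, iter=5)
    model.build_vocab(sentences)  # caches the sentence count as model.corpus_count

    # After these patches, train() no longer guesses the corpus size or number of passes:
    # total_examples (or total_words) and epochs must be given. Reusing the values cached
    # on the model reproduces the old single-call behavior.
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

With this change, a bare `model.train(sentences)` raises a ValueError instead of silently falling back on the count cached from the vocabulary survey and the model's own `iter` passes.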