From a3b9cde37f62d2dc235b419be4a88c52820c9f11 Mon Sep 17 00:00:00 2001
From: Gordon Mohr
Date: Wed, 8 Feb 2017 21:59:04 -0800
Subject: [PATCH 1/2] require explicit corpus size, epochs for train()

---
 gensim/models/word2vec.py | 46 +++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 8a62d02588..91e5e1de16 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -448,7 +448,8 @@ def __init__(
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
             self.build_vocab(sentences, trim_rule=trim_rule)
-            self.train(sentences)
+            self.train(sentences, total_examples=self.corpus_count, epochs=self.iter,
+                       start_alpha=self.alpha, end_alpha=self.min_alpha)
 
     def initialize_word_vectors(self):
         self.wv = KeyedVectors()
@@ -729,16 +730,23 @@ def _raw_word_count(self, job):
         """Return the number of words in a given job."""
         return sum(len(sentence) for sentence in job)
 
-    def train(self, sentences, total_words=None, word_count=0,
-              total_examples=None, queue_factor=2, report_delay=1.0):
+    def train(self, sentences, total_examples=None, total_words=None,
+              epochs=None, start_alpha=None, end_alpha=None,
+              word_count=0,
+              queue_factor=2, report_delay=1.0):
         """
         Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
         For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
 
-        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
-        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
-        sentences are the same as those that were used to initially build the vocabulary.
+        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
+        progress-percentage logging, either total_examples (count of sentences) or total_words (count of
+        raw words in sentences) MUST be provided. (If the corpus is the same as was provided to
+        `build_vocab()`, the count of examples in that corpus will be available in the model's
+        `corpus_count` property.)
+        To avoid common mistakes around the model's ability to do multiple training passes itself, an
+        explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()`
+        is only called once, the model's cached `iter` value should be supplied as the `epochs` value.
""" if (self.model_trimmed_post_training): raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") @@ -770,18 +778,18 @@ def train(self, sentences, total_words=None, word_count=0, "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") if total_words is None and total_examples is None: - if self.corpus_count: - total_examples = self.corpus_count - logger.info("expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples) - else: - raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations") + raise ValueError("you must specify either total_examples or total_words, for proper alpha and progress calculations") + if epochs is None: + raise ValueError("you must specify an explict epochs count") + start_alpha = start_alpha or self.alpha + end_alpha = end_alpha or self.min_alpha job_tally = 0 - if self.iter > 1: - sentences = utils.RepeatCorpusNTimes(sentences, self.iter) - total_words = total_words and total_words * self.iter - total_examples = total_examples and total_examples * self.iter + if epochs > 1: + sentences = utils.RepeatCorpusNTimes(sentences, epochs) + total_words = total_words and total_words * epochs + total_examples = total_examples and total_examples * epochs def worker_loop(): """Train the model, lifting lists of sentences from the job_queue.""" @@ -803,7 +811,7 @@ def job_producer(): """Fill jobs queue using the input `sentences` iterator.""" job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 - next_alpha = self.alpha + next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: logger.warn("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha @@ -826,7 +834,7 @@ def job_producer(): job_queue.put((job_batch, next_alpha)) # update the learning rate for the next job - if self.min_alpha < next_alpha: + if end_alpha < next_alpha: if total_examples: # examples-based decay pushed_examples += len(job_batch) @@ -835,8 +843,8 @@ def job_producer(): # words-based decay pushed_words += self._raw_word_count(job_batch) progress = 1.0 * pushed_words / total_words - next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress - next_alpha = max(self.min_alpha, next_alpha) + next_alpha = start_alpha - (start_alpha - end_alpha) * progress + next_alpha = max(end_alpha, next_alpha) # add the sentence that didn't fit as the first item of a new job job_batch, batch_size = [sentence], sentence_length From 0eb9a6661ca094e089eb3bc35b7c5271c5fb3aae Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Wed, 8 Feb 2017 21:59:55 -0800 Subject: [PATCH 2/2] make all train() calls use explicit count, epochs --- gensim/models/doc2vec.py | 2 +- gensim/test/test_doc2vec.py | 4 ++-- gensim/test/test_word2vec.py | 20 ++++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 80ba234573..7b3b95672c 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -628,7 +628,7 @@ def __init__(self, documents=None, dm_mean=None, self.comment = comment if documents is not None: self.build_vocab(documents, trim_rule=trim_rule) - self.train(documents) + self.train(documents, total_examples=self.corpus_count, epochs=self.iter) @property def dm(self): diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index ad658250f5..57d04ae81a 100644 --- 
+++ b/gensim/test/test_doc2vec.py
@@ -185,7 +185,7 @@ def test_training(self):
         model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1)
         model.build_vocab(corpus)
         self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100))
-        model.train(corpus)
+        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
 
         self.model_sanity(model)
 
@@ -341,7 +341,7 @@ def testTrainWarning(self, l):
         model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, min_count=1, workers=8, size=5)
         model.build_vocab(sentences)
         for epoch in range(10):
-            model.train(sentences)
+            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
             model.alpha -= 0.002
             model.min_alpha = model.alpha
             if epoch == 5:
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 035765e8a0..e1db27d60e 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -104,12 +104,12 @@ def onlineSanity(self, model):
                 others.append(l)
         self.assertTrue(all(['terrorism' not in l for l in others]))
         model.build_vocab(others)
-        model.train(others)
+        model.train(others, total_examples=model.corpus_count, epochs=model.iter)
         self.assertFalse('terrorism' in model.wv.vocab)
         model.build_vocab(terro, update=True)
         self.assertTrue('terrorism' in model.wv.vocab)
         orig0 = np.copy(model.wv.syn0)
-        model.train(terro)
+        model.train(terro, total_examples=len(terro), epochs=model.iter)
         self.assertFalse(np.allclose(model.wv.syn0, orig0))
         sim = model.n_similarity(['war'], ['terrorism'])
         self.assertLess(0., sim)
@@ -349,7 +349,7 @@ def testTraining(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -385,7 +385,7 @@ def testLocking(self):
 
         # lock the vector in slot 0 against change
         model.syn0_lockf[0] = 0.0
 
-        model.train(corpus)
+        model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
         self.assertFalse((unlocked1 == model.wv.syn0[1]).all())  # unlocked vector should vary
         self.assertTrue((locked0 == model.wv.syn0[0]).all())  # locked vector should not vary
@@ -414,7 +414,7 @@ def model_sanity(self, model, train=True):
         if train:
             model.build_vocab(list_corpus)
             orig0 = np.copy(model.wv.syn0[0])
-            model.train(list_corpus)
+            model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
             self.assertFalse((orig0 == model.wv.syn0[1]).all())  # vector should vary after training
         sims = model.most_similar('war', topn=len(model.index2word))
         t_rank = [word for word, score in sims].index('terrorism')
@@ -456,7 +456,7 @@ def testTrainingCbow(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -479,7 +479,7 @@ def testTrainingSgNegative(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -502,7 +502,7 @@ def testTrainingCbowNegative(self):
         self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2))
         self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2))
 
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
         sims = model.most_similar('graph', topn=10)
         # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
 
@@ -521,7 +521,7 @@ def testSimilarities(self):
         # The model is trained using CBOW
         model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2)
         model.build_vocab(sentences)
-        model.train(sentences)
+        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
 
         self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph']))
         self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees'))
@@ -612,7 +612,7 @@ def testTrainWarning(self, l):
         model = word2vec.Word2Vec(min_count=1)
         model.build_vocab(sentences)
         for epoch in range(10):
-            model.train(sentences)
+            model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
             model.alpha -= 0.002
             model.min_alpha = model.alpha
             if epoch == 5:
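Below is a minimal usage sketch of the calling pattern these patches enforce. It is illustrative only: the corpus filename and the model parameters are placeholders, not part of the patches; `LineSentence` and `Word2Vec` are the existing gensim classes.

    from gensim.models.word2vec import LineSentence, Word2Vec

    # Hypothetical corpus: a text file with one whitespace-delimited sentence per line.
    sentences = LineSentence('my_corpus.txt')

    # Construct without `sentences`, so vocabulary scanning and training are explicit steps.
    model = Word2Vec(size=100, min_count=5, iter=5)
    model.build_vocab(sentences)  # caches the sentence count as model.corpus_count

    # After these patches, train() no longer guesses the corpus size or number of passes:
    # total_examples (or total_words) and epochs must be given. Reusing the values cached
    # on the model reproduces the old single-call behavior.
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

With this change, a bare `model.train(sentences)` raises a ValueError instead of silently falling back on the count cached from the vocabulary survey and the model's own `iter` passes.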