Fix computation of Word2Vec loss & add loss value to logging string #2135
base: develop
gensim/models/base_any2vec.py
@@ -124,7 +124,21 @@ def _clear_post_train(self):
        raise NotImplementedError()

    def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
-        """Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
+        """Train a single batch.
+        `
+        Returns
+        -------
+        (int, int, int)
+            effective_word_count: int
+                The number of words processed after ignoring unknown words and sentence length trimming.
+            total_word_count: int
+                The total number of words in this batch.
+            total_samples_used: int
+                The total number of samples used while training on this data. This is the same as the effective
+                word count when using CBOW, but it can differ with Skip-Gram, since a random number of positive
+                examples is used for each average loss for an epoch.
+
+        """
        raise NotImplementedError()

    def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
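To make the CBOW/Skip-Gram distinction in that docstring concrete, here is a small illustrative sketch (not part of the diff; it fixes the window and ignores the random window reduction and vocabulary filtering): Skip-Gram yields one sample per (centre, context) pair, while CBOW yields one per centre word.

    sentence = ["the", "quick", "brown", "fox"]   # toy batch; assume every word is in the vocabulary
    window = 2

    effective_words = len(sentence)               # CBOW: one training sample per surviving centre word
    sg_samples = 0
    for pos in range(len(sentence)):
        start = max(0, pos - window)
        for pos2 in range(start, min(len(sentence), pos + window + 1)):
            if pos2 != pos:                       # skip the centre word itself, as train_batch_sg does
                sg_samples += 1                   # Skip-Gram: one sample per (centre, context) pair

    print(effective_words, sg_samples)            # 4 effective words, 10 skip-gram samples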
@@ -167,12 +181,23 @@ def _worker_loop(self, job_queue, progress_queue):
            for callback in self.callbacks:
                callback.on_batch_begin(self)

-            tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
+            stats_tuple = self._do_train_job(
+                data_iterable, job_parameters, thread_private_mem)

Review comment: I think there is no need to break the line here. We have a hard limit of 120 characters per line in gensim, and this is well within that limit.

+            if len(stats_tuple) == 3:
+                tally, raw_tally, effective_samples = stats_tuple
+            else:
+                # TODO: some models haven't updated their _do_train_job method to return a 3-tuple instead of a
+                # 2-tuple, i.e. one that also contains the number of samples used while processing the batch.
+                # For models that don't tally samples yet, we assume the number of samples equals the effective
+                # word tally. This keeps the output consistent with previous implementations.
+                tally, raw_tally = stats_tuple
+                effective_samples = tally

            for callback in self.callbacks:
                callback.on_batch_end(self)

-            progress_queue.put((len(data_iterable), tally, raw_tally))  # report back progress
+            # report back progress
+            progress_queue.put((len(data_iterable), tally, raw_tally, effective_samples))
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)
@@ -260,6 +285,7 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_

    def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
                            report_delay=1.0):
+

Review comment: please revert
Follow-up: still here

        """Get the progress report for a single training epoch.

        Parameters
@@ -294,7 +320,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
            * Total word count used in training.

        """
-        example_count, trained_word_count, raw_word_count = 0, 0, 0
+        example_count, trained_word_count, raw_word_count, samples_count = 0, 0, 0, 0
        start, next_report = default_timer() - 0.00001, 1.0
        job_tally = 0
        unfinished_worker_count = self.workers
@@ -305,20 +331,20 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
                unfinished_worker_count -= 1
                logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
                continue
-            examples, trained_words, raw_words = report
+            examples, trained_words, raw_words, effective_samples = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_word_count += trained_words  # only words in vocab & sampled
            raw_word_count += raw_words
-
+            samples_count += effective_samples
            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                self._log_progress(
                    job_queue, progress_queue, cur_epoch, example_count, total_examples,
-                    raw_word_count, total_words, trained_word_count, elapsed)
+                    raw_word_count, total_words, trained_word_count, samples_count, elapsed)
                next_report = elapsed + report_delay
        # all done; report the final stats
        elapsed = default_timer() - start
@@ -361,6 +387,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
            * Total word count used in training.

        """
+        self.running_training_loss = 0.
        self._check_input_data_sanity(data_iterable, data_iterables)
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
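A consequence of resetting running_training_loss at the start of every epoch, combined with the normalisation added to _log_progress further down, is that the logged current_loss approximates a mean loss per training sample within the current epoch rather than a sum over the whole run. A minimal sketch of that relationship, using hypothetical variable names:

    def average_epoch_loss(running_training_loss, samples_seen_this_epoch):
        # Hypothetical helper mirroring the logging logic below: the loss accumulated since the
        # epoch started, divided by the samples counted so far; -1 when nothing has been counted yet.
        if samples_seen_this_epoch == 0:
            return -1
        return running_training_loss / samples_seen_this_epoch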
@@ -966,6 +993,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
            total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

+    def get_latest_training_loss(self):
+        raise NotImplementedError("To compute the loss for a model, you must implement get_latest_training_loss")
+
    def _get_job_params(self, cur_epoch):
        """Get the learning rate used in the current epoch.
@@ -1146,7 +1176,7 @@ def load(cls, *args, **kwargs):
        return model

    def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
-                      raw_word_count, total_words, trained_word_count, elapsed):
+                      raw_word_count, total_words, trained_word_count, total_samples, elapsed):
        """Callback used to log progress for long running jobs.

        Parameters
@@ -1172,24 +1202,28 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
        trained_word_count : int
            Number of effective words used in training until now (after ignoring unknown words and trimming
            the sentence length).
+        total_samples : int
+            Number of effective samples used in training until now (differs from total_examples for Skip-Gram).
        elapsed : int
            Elapsed time since the beginning of training in seconds.

        """
        if total_examples:
-            # examples-based progress %
-            logger.info(
-                "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
-                cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
-            )
+            div = total_examples
        else:
-            # words-based progress %
-            logger.info(
-                "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
-                cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
-            )
+            div = total_words

+        msg = "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i"

Review comment: This can be "PROGRESS: at %.2f%% words" as well (not only "examples").
Author reply: You are right. Good catch. I'll fix it.

+        args = (cur_epoch + 1, 100.0 * example_count / div, trained_word_count / elapsed,
+                utils.qsize(job_queue), utils.qsize(progress_queue))
+        if self.compute_loss:
+            if total_samples == 0:
+                loss = -1
+            else:
+                loss = self.get_latest_training_loss() / total_samples
+            msg += ", current_loss %.3f"
+            args += (loss,)
+        logger.info(msg, *args)

    def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
                       trained_word_count, elapsed):
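For orientation, this is roughly how the loss reporting touched by this hunk could be exercised from user code; the corpus and the LossLogger callback are made up for illustration, and the per-sample normalisation of the logged current_loss assumes this patch is applied:

    from gensim.models import Word2Vec
    from gensim.models.callbacks import CallbackAny2Vec

    class LossLogger(CallbackAny2Vec):
        """Toy callback: print the model-reported loss after every epoch."""
        def __init__(self):
            self.epoch = 0

        def on_epoch_end(self, model):
            # get_latest_training_loss() returns the raw running sum; with this patch the logged
            # "current_loss" additionally divides it by the number of samples counted so far.
            print("epoch %d, running loss: %.3f" % (self.epoch, model.get_latest_training_loss()))
            self.epoch += 1

    sentences = [["first", "toy", "sentence"], ["second", "toy", "sentence"]]  # made-up corpus
    model = Word2Vec(sentences, min_count=1, sg=1, compute_loss=True, callbacks=[LossLogger()])

The callback hooks (CallbackAny2Vec, on_epoch_end) and the compute_loss/callbacks parameters already exist in gensim; only the normalised logging is new in this PR.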
gensim/models/word2vec.py
@@ -178,9 +178,12 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
    int
        Number of words in the vocabulary actually used for training (that already existed in the vocabulary
        and were not discarded by negative sampling).
+    int
+        Number of samples used for training. A sample is a positive/negative example.

    """
    result = 0
+    effective_samples = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
@@ -192,12 +195,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                # don't train on the `word` itself
                if pos2 != pos:
+                    effective_samples += 1
                    train_sg_pair(
                        model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
                    )

        result += len(word_vocabs)
-    return result
+    return result, effective_samples

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
    """Update CBOW model by training on a sequence of sentences.
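The sample count returned here is itself random, because train_batch_sg draws a reduced_window for every centre word; that randomness is why the per-sample normalisation matters for Skip-Gram. A rough, illustrative sketch of the effect, using Python's random module as a stand-in for model.random:

    import random

    sentence_len, window = 4, 2
    rng = random.Random(0)                                    # stand-in for model.random

    samples = 0
    for pos in range(sentence_len):
        reduced_window = rng.randint(0, window - 1)           # mirrors model.random.randint(model.window)
        start = max(0, pos - window + reduced_window)
        stop = min(sentence_len, pos + window + 1 - reduced_window)
        samples += sum(1 for pos2 in range(start, stop) if pos2 != pos)

    print(samples)  # anywhere from 6 to 10 for this toy sentence, depending on the draws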
@@ -229,6 +233,9 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
    int
        Number of words in the vocabulary actually used for training (that already existed in the vocabulary
        and were not discarded by negative sampling).
+    int
+        Number of samples used for training. A sample is a positive/negative example. In the case of CBOW
+        this is the same as the effective number of words.

    """
    result = 0
@@ -247,7 +254,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
                l1 /= len(word2_indices)
            train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
        result += len(word_vocabs)
-    return result
+    return result, result

def score_sentence_sg(model, sentence, work=None):
    """Obtain likelihood score for a single sentence in a fitted skip-gram representation.
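A short way to see why the CBOW branch can simply return result twice (sketch, not from the diff): every centre word that survives vocabulary filtering triggers exactly one train_cbow_pair call, so the sample count and the effective word count advance in lockstep.

    effective_words = 0
    samples = 0
    for sentence in [["a", "b", "c"], ["d", "e"]]:    # toy batch; assume every word survives filtering
        for centre_word in sentence:
            samples += 1                              # one (averaged context -> centre word) example per word
        effective_words += len(sentence)

    assert samples == effective_words                 # mirrors `return result, result`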
@@ -771,12 +778,11 @@ def _do_train_job(self, sentences, alpha, inits):

        """
        work, neu1 = inits
-        tally = 0
        if self.sg:
-            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
+            (tally, effective_samples) = train_batch_sg(self, sentences, alpha, work, self.compute_loss)
        else:
-            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
-        return tally, self._raw_word_count(sentences)
+            (tally, effective_samples) = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
+        return tally, self._raw_word_count(sentences), effective_samples

Review comment: Need to update the docstrings everywhere when you change the return type.
Follow-up: @alreadytaikeune Still not done, please check.

    def _clear_post_train(self):
        """Remove all L2-normalized word vectors from the model."""
Review comment: I think we should document the input parameters as well.
Author reply: I agree, though there are no clear constraints on what the inputs actually are. They can be pretty much anything, depending on how the derived class implements _get_thread_working_mem and _get_job_params; it is up to the derived class to figure out what to do with them. I'll figure out something.
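As a point of reference for that discussion (an assumption based on how Word2Vec sets up its worker memory, not something stated in this diff): the inits unpacked as work, neu1 in _do_train_job above are typically two per-thread scratch buffers sized to the hidden layer. A rough sketch with a hypothetical helper name:

    import numpy as np

    def get_thread_working_mem(layer1_size, dtype=np.float32):
        # Hypothetical stand-in for Word2Vec._get_thread_working_mem(): each worker thread gets its
        # own scratch buffers so the training routines never share temporary state across threads.
        work = np.zeros(layer1_size, dtype=dtype)   # gradient/update accumulator
        neu1 = np.zeros(layer1_size, dtype=dtype)   # averaged context vector (used by CBOW)
        return work, neu1

    work, neu1 = get_thread_working_mem(layer1_size=100)

gensim itself allocates these buffers as aligned arrays (matutils.zeros_aligned); plain numpy arrays are used here only to keep the sketch self-contained.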