Testing vector correctness.
olavurmortensen committed Oct 15, 2015
1 parent b0b398a commit 9f95ff2
Showing 2 changed files with 16 additions and 10 deletions.
16 changes: 8 additions & 8 deletions gensim/models/word2vec.py
@@ -740,7 +740,7 @@ def worker_loop():
         MAX_NUM_SENTENCES = 1000  # TODO: should be in word2vec_inner.pyx as well. TODO: consider proper value.
         # fill jobs queue with (sentence, alpha) job tuples
         job_source = enumerate(sentences)
-        while True:
+        while True:  # TODO: use for instead.
             try:
                 sent_idx, sent = job_source.next()
                 if batch_size + len(sent) < MAX_WORDS_IN_BATCH and num_sentences < MAX_NUM_SENTENCES:
@@ -796,19 +796,19 @@ def worker_loop():
                 batch_size = 0
                 job_no += 1

-                if job_no == -1 and self.train_count == 0:
-                    logger.warning(
-                        "train() called with empty iterator (if not intended, "
-                        "be sure to provide a corpus that offers restartable "
-                        "iteration)."
-                    )
-
                 logger.info(
                     "reached end of input; waiting to finish %i outstanding jobs",
                     job_no - done_jobs)
                 for _ in xrange(self.workers):
                     job_queue.put((None, 0, [0]))  # give the workers heads up that they can finish -- no more work!
                 push_done = True
+
+                if job_no == -1 and self.train_count == 0:
+                    logger.warning(
+                        "train() called with empty iterator (if not intended, "
+                        "be sure to provide a corpus that offers restartable "
+                        "iteration)."
+                    )
             try:
                 while done_jobs < job_no or not push_done:
                     examples, trained_words, raw_words = progress_queue.get(push_done)  # only block after all jobs pushed
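
For context, the producer loop changed above groups incoming sentences into batches and only pushes a batch onto the job queue once adding another sentence would exceed MAX_WORDS_IN_BATCH words or MAX_NUM_SENTENCES sentences. Below is a minimal, self-contained sketch of that batching policy only; the function name iter_batches, its default limits, and the demo corpus are hypothetical and not part of gensim's API.

def iter_batches(sentences, max_words=5000, max_sentences=1000):
    """Yield lists of sentences, capped by total word count and sentence count."""
    batch, batch_words = [], 0
    for sent in sentences:
        # start a new batch if adding this sentence would exceed either cap
        if batch and (batch_words + len(sent) >= max_words or len(batch) >= max_sentences):
            yield batch
            batch, batch_words = [], 0
        batch.append(sent)
        batch_words += len(sent)
    if batch:  # flush the final, partial batch at end of input
        yield batch

if __name__ == '__main__':
    toy_corpus = [['some', 'short', 'sentence']] * 10
    # each yielded batch would correspond to one job placed on the queue
    for job_no, batch in enumerate(iter_batches(toy_corpus, max_words=7, max_sentences=4)):
        print('job %d: %d sentences' % (job_no, len(batch)))
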
10 changes: 8 additions & 2 deletions gensim/test/test_word2vec_batching.py
@@ -36,17 +36,23 @@ def __iter__(self):
 num_sents = 10000
 sentences = SentenceGenerator(num_sents=num_sents)

+test_words = ['chance', 'strings', 'spiral']
+
 logging.info('Training model with batching.')
 start = time()
-model2 = Word2Vec(sentences, batch=True)
+model2 = Word2Vec(sentences, batch=True, seed=0)
 logging.info('------------------------------------------------------')
 logging.info('Done training model. Time elapsed: %f seconds.', time() - start)

 logging.info('Training model without batching.')
 start = time()
-model1 = Word2Vec(sentences)
+model1 = Word2Vec(sentences, seed=0)
 logging.info('------------------------------------------------------')
 logging.info('Done training model. Time elapsed: %f seconds.', time() - start)

+diff = {}
+for test_word in test_words:
+    diff[test_word] = model1[test_word] - model2[test_word]
+
 import pdb
 pdb.set_trace()
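
The test stops in pdb so the per-word differences in diff can be inspected by hand. If an automated check were preferred, the comparison could be done with numpy instead; the helper below is only a sketch (report_vector_diff is a hypothetical name), and exact equality between the two models is generally only expected when training is otherwise deterministic (e.g. a single worker thread), even with seed fixed.

import numpy as np

def report_vector_diff(model_a, model_b, words, atol=1e-6):
    """Print the largest absolute per-dimension difference for each word and
    whether the two vectors are approximately equal."""
    for word in words:
        vec_a, vec_b = model_a[word], model_b[word]
        max_abs = float(np.max(np.abs(vec_a - vec_b)))
        print('%-10s max |diff| = %.6g  close: %s'
              % (word, max_abs, np.allclose(vec_a, vec_b, atol=atol)))

# e.g. report_vector_diff(model1, model2, test_words)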
