diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 07082a4a1e..3763dc8024 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -740,7 +740,7 @@ def worker_loop(): MAX_NUM_SENTENCES = 1000 # TODO: should be in word2vec_inner.pyx as well. TODO: consider proper value. # fill jobs queue with (sentence, alpha) job tuples job_source = enumerate(sentences) - while True: + while True: # TODO: use for instead. try: sent_idx, sent = job_source.next() if batch_size + len(sent) < MAX_WORDS_IN_BATCH and num_sentences < MAX_NUM_SENTENCES: @@ -796,19 +796,19 @@ def worker_loop(): batch_size = 0 job_no += 1 - if job_no == -1 and self.train_count == 0: - logger.warning( - "train() called with empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable " - "iteration)." - ) - logger.info( "reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs) for _ in xrange(self.workers): job_queue.put((None, 0, [0])) # give the workers heads up that they can finish -- no more work! push_done = True + + if job_no == -1 and self.train_count == 0: + logger.warning( + "train() called with empty iterator (if not intended, " + "be sure to provide a corpus that offers restartable " + "iteration)." + ) try: while done_jobs < job_no or not push_done: examples, trained_words, raw_words = progress_queue.get(push_done) # only block after all jobs pushed diff --git a/gensim/test/test_word2vec_batching.py b/gensim/test/test_word2vec_batching.py index 930a7a11e0..08dd9e9ec7 100644 --- a/gensim/test/test_word2vec_batching.py +++ b/gensim/test/test_word2vec_batching.py @@ -36,17 +36,23 @@ def __iter__(self): num_sents = 10000 sentences = SentenceGenerator(num_sents=num_sents) + test_words = ['chance', 'strings', 'spiral'] + logging.info('Training model with batching.') start = time() - model2 = Word2Vec(sentences, batch=True) + model2 = Word2Vec(sentences, batch=True, seed=0) logging.info('------------------------------------------------------') logging.info('Done training model. Time elapsed: %f seconds.', time() - start) logging.info('Training model without batching.') start = time() - model1 = Word2Vec(sentences) + model1 = Word2Vec(sentences, seed=0) logging.info('------------------------------------------------------') logging.info('Done training model. Time elapsed: %f seconds.', time() - start) + diff = {} + for test_word in test_words: + diff[test_word] = model1[test_word] - model2[test_word] + import pdb pdb.set_trace()