Testing vector correctness.
olavurmortensen committed Oct 15, 2015
1 parent b0b398a commit 9f95ff2
Showing 2 changed files with 16 additions and 10 deletions.
16 changes: 8 additions & 8 deletions gensim/models/word2vec.py
@@ -740,7 +740,7 @@ def worker_loop():
         MAX_NUM_SENTENCES = 1000  # TODO: should be in word2vec_inner.pyx as well. TODO: consider proper value.
         # fill jobs queue with (sentence, alpha) job tuples
         job_source = enumerate(sentences)
-        while True:
+        while True:  # TODO: use for instead.
             try:
                 sent_idx, sent = job_source.next()
                 if batch_size + len(sent) < MAX_WORDS_IN_BATCH and num_sentences < MAX_NUM_SENTENCES:
@@ -796,19 +796,19 @@ def worker_loop():
                 batch_size = 0
                 job_no += 1

-                if job_no == -1 and self.train_count == 0:
-                    logger.warning(
-                        "train() called with empty iterator (if not intended, "
-                        "be sure to provide a corpus that offers restartable "
-                        "iteration)."
-                    )
-
                 logger.info(
                     "reached end of input; waiting to finish %i outstanding jobs",
                     job_no - done_jobs)
                 for _ in xrange(self.workers):
                     job_queue.put((None, 0, [0]))  # give the workers heads up that they can finish -- no more work!
                 push_done = True
+
+                if job_no == -1 and self.train_count == 0:
+                    logger.warning(
+                        "train() called with empty iterator (if not intended, "
+                        "be sure to provide a corpus that offers restartable "
+                        "iteration)."
+                    )
             try:
                 while done_jobs < job_no or not push_done:
                     examples, trained_words, raw_words = progress_queue.get(push_done)  # only block after all jobs pushed
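
For context, the producer loop changed above groups incoming sentences into batches and only pushes a batch onto the job queue once adding another sentence would exceed MAX_WORDS_IN_BATCH words or MAX_NUM_SENTENCES sentences. Below is a minimal, self-contained sketch of that batching policy only; the function name iter_batches, its default limits, and the demo corpus are hypothetical and not part of gensim's API.

def iter_batches(sentences, max_words=5000, max_sentences=1000):
    """Yield lists of sentences, capped by total word count and sentence count."""
    batch, batch_words = [], 0
    for sent in sentences:
        # start a new batch if adding this sentence would exceed either cap
        if batch and (batch_words + len(sent) >= max_words or len(batch) >= max_sentences):
            yield batch
            batch, batch_words = [], 0
        batch.append(sent)
        batch_words += len(sent)
    if batch:  # flush the final, partial batch at end of input
        yield batch

if __name__ == '__main__':
    toy_corpus = [['some', 'short', 'sentence']] * 10
    # each yielded batch would correspond to one job placed on the queue
    for job_no, batch in enumerate(iter_batches(toy_corpus, max_words=7, max_sentences=4)):
        print('job %d: %d sentences' % (job_no, len(batch)))
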
10 changes: 8 additions & 2 deletions gensim/test/test_word2vec_batching.py
@@ -36,17 +36,23 @@ def __iter__(self):
 num_sents = 10000
 sentences = SentenceGenerator(num_sents=num_sents)

+test_words = ['chance', 'strings', 'spiral']
+
 logging.info('Training model with batching.')
 start = time()
-model2 = Word2Vec(sentences, batch=True)
+model2 = Word2Vec(sentences, batch=True, seed=0)
 logging.info('------------------------------------------------------')
 logging.info('Done training model. Time elapsed: %f seconds.', time() - start)

 logging.info('Training model without batching.')
 start = time()
-model1 = Word2Vec(sentences)
+model1 = Word2Vec(sentences, seed=0)
 logging.info('------------------------------------------------------')
 logging.info('Done training model. Time elapsed: %f seconds.', time() - start)

+diff = {}
+for test_word in test_words:
+    diff[test_word] = model1[test_word] - model2[test_word]
+
 import pdb
 pdb.set_trace()
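
The test stops in pdb so the per-word differences in diff can be inspected by hand. If an automated check were preferred, the comparison could be done with numpy instead; the helper below is only a sketch (report_vector_diff is a hypothetical name), and exact equality between the two models is generally only expected when training is otherwise deterministic (e.g. a single worker thread), even with seed fixed.

import numpy as np

def report_vector_diff(model_a, model_b, words, atol=1e-6):
    """Print the largest absolute per-dimension difference for each word and
    whether the two vectors are approximately equal."""
    for word in words:
        vec_a, vec_b = model_a[word], model_b[word]
        max_abs = float(np.max(np.abs(vec_a - vec_b)))
        print('%-10s max |diff| = %.6g  close: %s'
              % (word, max_abs, np.allclose(vec_a, vec_b, atol=atol)))

# e.g. report_vector_diff(model1, model2, test_words)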
