Fix computation of Word2Vec loss & add loss value to logging string #2135
base: develop
gensim/models/base_any2vec.py
@@ -124,7 +124,21 @@ def _clear_post_train(self):
        raise NotImplementedError()

    def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
-        """Train a single batch. Return 2-tuple `(effective word count, total word count)`."""
+        """Train a single batch.
+        `
+        Returns
+        -------
+        (int, int, int)
+            effective_word_count: int
+                The number of words processed after ignoring unknown words and sentence length trimming.
+            total_word_count: int
+                The total number of words in this batch.
+            total_samples_used: int
+                The total number of samples used while training on this data. This is the same as the effective
+                word count when using CBOW, but it can differ with Skip-Gram, since a random number of positive
+                examples is used for each average loss for an epoch.
+
+        """
        raise NotImplementedError()

    def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs):
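To make the CBOW/Skip-Gram distinction in that docstring concrete, here is a small illustrative sketch (not part of the diff; it fixes the window and ignores the random window reduction and vocabulary filtering): Skip-Gram yields one sample per (centre, context) pair, while CBOW yields one per centre word.

    sentence = ["the", "quick", "brown", "fox"]   # toy batch; assume every word is in the vocabulary
    window = 2

    effective_words = len(sentence)               # CBOW: one training sample per surviving centre word
    sg_samples = 0
    for pos in range(len(sentence)):
        start = max(0, pos - window)
        for pos2 in range(start, min(len(sentence), pos + window + 1)):
            if pos2 != pos:                       # skip the centre word itself, as train_batch_sg does
                sg_samples += 1                   # Skip-Gram: one sample per (centre, context) pair

    print(effective_words, sg_samples)            # 4 effective words, 10 skip-gram samples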
@@ -167,12 +181,23 @@ def _worker_loop(self, job_queue, progress_queue):
            for callback in self.callbacks:
                callback.on_batch_begin(self)

-            tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem)
+            stats_tuple = self._do_train_job(
+                data_iterable, job_parameters, thread_private_mem)

Review comment: I think there is no need to break the line here. We have a hard limit of 120 characters per line in gensim, and this is well within that limit.

+            if len(stats_tuple) == 3:
+                tally, raw_tally, effective_samples = stats_tuple
+            else:
+                # TODO: some models haven't updated their _do_train_job method to return a 3-tuple instead of a
+                # 2-tuple, i.e. one that also contains the number of samples used while processing the batch.
+                # For models that don't tally samples yet, we assume the number of samples equals the effective
+                # word tally. This keeps the output consistent with previous implementations.
+                tally, raw_tally = stats_tuple
+                effective_samples = tally

            for callback in self.callbacks:
                callback.on_batch_end(self)

-            progress_queue.put((len(data_iterable), tally, raw_tally))  # report back progress
+            # report back progress
+            progress_queue.put((len(data_iterable), tally, raw_tally, effective_samples))
            jobs_processed += 1
        logger.debug("worker exiting, processed %i jobs", jobs_processed)
@@ -260,6 +285,7 @@ def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_

    def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None,
                            report_delay=1.0):
+

Review comment: please revert
Follow-up: still here

        """Get the progress report for a single training epoch.

        Parameters
@@ -294,7 +320,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
            * Total word count used in training.

        """
-        example_count, trained_word_count, raw_word_count = 0, 0, 0
+        example_count, trained_word_count, raw_word_count, samples_count = 0, 0, 0, 0
        start, next_report = default_timer() - 0.00001, 1.0
        job_tally = 0
        unfinished_worker_count = self.workers
@@ -305,20 +331,20 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam
                unfinished_worker_count -= 1
                logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
                continue
-            examples, trained_words, raw_words = report
+            examples, trained_words, raw_words, effective_samples = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_word_count += trained_words  # only words in vocab & sampled
            raw_word_count += raw_words
-
+            samples_count += effective_samples
            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                self._log_progress(
                    job_queue, progress_queue, cur_epoch, example_count, total_examples,
-                    raw_word_count, total_words, trained_word_count, elapsed)
+                    raw_word_count, total_words, trained_word_count, samples_count, elapsed)
                next_report = elapsed + report_delay
        # all done; report the final stats
        elapsed = default_timer() - start
@@ -361,6 +387,7 @@ def _train_epoch(self, data_iterable=None, data_iterables=None, cur_epoch=0, tot
            * Total word count used in training.

        """
+        self.running_training_loss = 0.
        self._check_input_data_sanity(data_iterable, data_iterables)
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
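A consequence of resetting running_training_loss at the start of every epoch, combined with the normalisation added to _log_progress further down, is that the logged current_loss approximates a mean loss per training sample within the current epoch rather than a sum over the whole run. A minimal sketch of that relationship, using hypothetical variable names:

    def average_epoch_loss(running_training_loss, samples_seen_this_epoch):
        # Hypothetical helper mirroring the logging logic below: the loss accumulated since the
        # epoch started, divided by the samples counted so far; -1 when nothing has been counted yet.
        if samples_seen_this_epoch == 0:
            return -1
        return running_training_loss / samples_seen_this_epoch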
@@ -966,6 +993,9 @@ def train(self, sentences=None, input_streams=None, total_examples=None, total_w
            total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
            queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

+    def get_latest_training_loss(self):
+        raise NotImplementedError("To compute the loss for a model, you must implement get_latest_training_loss")
+
    def _get_job_params(self, cur_epoch):
        """Get the learning rate used in the current epoch.
@@ -1146,7 +1176,7 @@ def load(cls, *args, **kwargs):
        return model

    def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
-                      raw_word_count, total_words, trained_word_count, elapsed):
+                      raw_word_count, total_words, trained_word_count, total_samples, elapsed):
        """Callback used to log progress for long running jobs.

        Parameters
@@ -1172,24 +1202,28 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
        trained_word_count : int
            Number of effective words used in training until now (after ignoring unknown words and trimming
            the sentence length).
+        total_samples : int
+            Number of effective samples used in training until now (differs from total_examples for Skip-Gram).
        elapsed : int
            Elapsed time since the beginning of training in seconds.

        """
        if total_examples:
-            # examples-based progress %
-            logger.info(
-                "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i",
-                cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
-            )
+            div = total_examples
        else:
-            # words-based progress %
-            logger.info(
-                "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i",
-                cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed,
-                utils.qsize(job_queue), utils.qsize(progress_queue)
-            )
+            div = total_words

+        msg = "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i"

Review comment: This can be "PROGRESS: at %.2f%% words" as well (not only "examples").
Author reply: You are right. Good catch. I'll fix it.

+        args = (cur_epoch + 1, 100.0 * example_count / div, trained_word_count / elapsed,
+                utils.qsize(job_queue), utils.qsize(progress_queue))
+        if self.compute_loss:
+            if total_samples == 0:
+                loss = -1
+            else:
+                loss = self.get_latest_training_loss() / total_samples
+            msg += ", current_loss %.3f"
+            args += (loss,)
+        logger.info(msg, *args)

    def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
                       trained_word_count, elapsed):
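For orientation, this is roughly how the loss reporting touched by this hunk could be exercised from user code; the corpus and the LossLogger callback are made up for illustration, and the per-sample normalisation of the logged current_loss assumes this patch is applied:

    from gensim.models import Word2Vec
    from gensim.models.callbacks import CallbackAny2Vec

    class LossLogger(CallbackAny2Vec):
        """Toy callback: print the model-reported loss after every epoch."""
        def __init__(self):
            self.epoch = 0

        def on_epoch_end(self, model):
            # get_latest_training_loss() returns the raw running sum; with this patch the logged
            # "current_loss" additionally divides it by the number of samples counted so far.
            print("epoch %d, running loss: %.3f" % (self.epoch, model.get_latest_training_loss()))
            self.epoch += 1

    sentences = [["first", "toy", "sentence"], ["second", "toy", "sentence"]]  # made-up corpus
    model = Word2Vec(sentences, min_count=1, sg=1, compute_loss=True, callbacks=[LossLogger()])

The callback hooks (CallbackAny2Vec, on_epoch_end) and the compute_loss/callbacks parameters already exist in gensim; only the normalised logging is new in this PR.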
gensim/models/word2vec.py
@@ -178,9 +178,12 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
    int
        Number of words in the vocabulary actually used for training (that already existed in the vocabulary
        and were not discarded by negative sampling).
+    int
+        Number of samples used for training. A sample is a positive/negative example.

    """
    result = 0
+    effective_samples = 0
    for sentence in sentences:
        word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
                       model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
@@ -192,12 +195,13 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                # don't train on the `word` itself
                if pos2 != pos:
+                    effective_samples += 1
                    train_sg_pair(
                        model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
                    )

        result += len(word_vocabs)
-    return result
+    return result, effective_samples

def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
    """Update CBOW model by training on a sequence of sentences.
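The sample count returned here is itself random, because train_batch_sg draws a reduced_window for every centre word; that randomness is why the per-sample normalisation matters for Skip-Gram. A rough, illustrative sketch of the effect, using Python's random module as a stand-in for model.random:

    import random

    sentence_len, window = 4, 2
    rng = random.Random(0)                                    # stand-in for model.random

    samples = 0
    for pos in range(sentence_len):
        reduced_window = rng.randint(0, window - 1)           # mirrors model.random.randint(model.window)
        start = max(0, pos - window + reduced_window)
        stop = min(sentence_len, pos + window + 1 - reduced_window)
        samples += sum(1 for pos2 in range(start, stop) if pos2 != pos)

    print(samples)  # anywhere from 6 to 10 for this toy sentence, depending on the draws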
@@ -229,6 +233,9 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
    int
        Number of words in the vocabulary actually used for training (that already existed in the vocabulary
        and were not discarded by negative sampling).
+    int
+        Number of samples used for training. A sample is a positive/negative example. In the case of CBOW
+        this is the same as the effective number of words.

    """
    result = 0
@@ -247,7 +254,7 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
                l1 /= len(word2_indices)
            train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
        result += len(word_vocabs)
-    return result
+    return result, result

def score_sentence_sg(model, sentence, work=None):
    """Obtain likelihood score for a single sentence in a fitted skip-gram representation.
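A short way to see why the CBOW branch can simply return result twice (sketch, not from the diff): every centre word that survives vocabulary filtering triggers exactly one train_cbow_pair call, so the sample count and the effective word count advance in lockstep.

    effective_words = 0
    samples = 0
    for sentence in [["a", "b", "c"], ["d", "e"]]:    # toy batch; assume every word survives filtering
        for centre_word in sentence:
            samples += 1                              # one (averaged context -> centre word) example per word
        effective_words += len(sentence)

    assert samples == effective_words                 # mirrors `return result, result`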
@@ -771,12 +778,11 @@ def _do_train_job(self, sentences, alpha, inits):

        """
        work, neu1 = inits
-        tally = 0
        if self.sg:
-            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
+            (tally, effective_samples) = train_batch_sg(self, sentences, alpha, work, self.compute_loss)
        else:
-            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
-        return tally, self._raw_word_count(sentences)
+            (tally, effective_samples) = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
+        return tally, self._raw_word_count(sentences), effective_samples

Review comment: Need to update the docstrings everywhere when you change the return type.
Follow-up: @alreadytaikeune Still not done, please check.

    def _clear_post_train(self):
        """Remove all L2-normalized word vectors from the model."""
Review comment: I think we should document the input parameters as well.
Author reply: I agree, though there are no clear constraints on what the inputs actually are. They can be pretty much anything, depending on how the derived class implements _get_thread_working_mem and _get_job_params; it is up to the derived class to figure out what to do with them. I'll figure out something.
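As a point of reference for that discussion (an assumption based on how Word2Vec sets up its worker memory, not something stated in this diff): the inits unpacked as work, neu1 in _do_train_job above are typically two per-thread scratch buffers sized to the hidden layer. A rough sketch with a hypothetical helper name:

    import numpy as np

    def get_thread_working_mem(layer1_size, dtype=np.float32):
        # Hypothetical stand-in for Word2Vec._get_thread_working_mem(): each worker thread gets its
        # own scratch buffers so the training routines never share temporary state across threads.
        work = np.zeros(layer1_size, dtype=dtype)   # gradient/update accumulator
        neu1 = np.zeros(layer1_size, dtype=dtype)   # averaged context vector (used by CBOW)
        return work, neu1

    work, neu1 = get_thread_working_mem(layer1_size=100)

gensim itself allocates these buffers as aligned arrays (matutils.zeros_aligned); plain numpy arrays are used here only to keep the sketch self-contained.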