
Add evaluate_word_analogies (will replace accuracy) method for gensim.models.KeyedVectors #1935

Merged: 13 commits, Apr 3, 2018
119 changes: 113 additions & 6 deletions gensim/models/keyedvectors.py
@@ -50,7 +50,8 @@

And on analogies::

    >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
    >>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0]
    0.58

and so on.

@@ -850,6 +851,113 @@ def n_similarity(self, ws1, ws2):
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))

    @staticmethod
    def _log_evaluate_word_analogies(section):
Contributor: Maybe better to hide this method (with `_`)?

Contributor Author: What exactly do you mean? Or maybe you can point to some example of such hiding in the existing Gensim code?

Contributor: I mean, why not `_log_evaluate_word_analogies`? I'm asking because this method looks like a helper for `evaluate_word_analogies`, nothing more.

Contributor Author: Done.

        correct, incorrect = len(section['correct']), len(section['incorrect'])
        if correct + incorrect > 0:
            score = correct / (correct + incorrect)
            logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
            return score

    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
        """
        Compute performance of the model on an analogy test set
        (see `Analogy (State of the art) <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>`_).
Contributor: this should be rendered as a link, should look like

    `Analogy (State of the art) <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>`_

Contributor Author: Done

        `analogies` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in
Contributor: This file is also provided in the current repo: https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/questions-words.txt. It is also part of the gensim package, i.e. the path on a local machine can be retrieved as

    from gensim.test.utils import datapath
    datapath("questions-words.txt")

No need to download the source code of the C version just to look at this file.

Contributor Author: Done.

        https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
        for an example.
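
        For illustration, the file begins like this (the first lines of the standard questions-words.txt)::

            : capital-common-countries
            Athens Greece Baghdad Iraq
            Athens Greece Bangkok Thailand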

        The accuracy is reported (printed to the log and returned as a score) for each section separately,
        plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all 4-tuples containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the model vocabulary by descending
        frequency (which is standard in modern word embedding models).

        If `case_insensitive` is True (default), the first `restrict_vocab` words are taken first, and case
        normalization is performed afterwards: all words in the 4-tuples and in the vocabulary are converted
        to uppercase before evaluating the performance. This is useful to handle case mismatch between the
        training tokens and the words in the test set. If there are multiple case variants of a single word,
        the vector for the first occurrence (also the most frequent one, if the vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero accuracies for 4-tuples with out-of-vocabulary words.
        Otherwise (default False), these tuples are skipped entirely and not used in the evaluation.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.
        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
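        # note: `reversed` makes the most frequent case variant win when several words collide
        # after uppercasing, since later items overwrite earlier ones in the dict comprehension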
        oov = 0
        logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies)
        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(analogies)):
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self._log_evaluate_word_analogies(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("Skipping invalid line #%i in %s", line_no, analogies)
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
                        section['incorrect'].append((a, b, c, expected))
                    else:
                        logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                    continue
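                # temporarily swap in the restricted, case-normalized vocab so that
                # most_similar() can look up the (possibly uppercased) input words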
                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                # find the most likely prediction using 3CosAdd (vector offset) method
                # TODO: implement 3CosMul and set-based methods for solving analogies
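                # 3CosAdd ranks candidate words d by cos(d, b - a + c) over unit-normalized
                # vectors; topn=5 leaves headroom to skip candidates that are input words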
                sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for element in sims:
                    predicted = element[0].upper() if case_insensitive else element[0]
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self._log_evaluate_word_analogies(section)

        total = {
            'section': 'Total accuracy',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
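        # sum(..., []) with an empty-list start concatenates the per-section lists
        # into one flat list of (a, b, c, expected) 4-tuples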

        oov_ratio = float(oov) / line_no * 100
        logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
        if not dummy4unknown:
            logger.info(
                'NB: analogies containing OOV words were skipped from evaluation! '
Contributor: Nitpick: please use hanging indents (instead of vertical).

Contributor Author: Done.

                'To change this behavior, use "dummy4unknown=True"'
            )
        analogies_score = self._log_evaluate_word_analogies(total)
        sections.append(total)
        # Return the overall score and the full lists of correct and incorrect analogies
        return analogies_score, sections
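
For reference, a minimal usage sketch of the new method (assuming word vectors are already loaded as `word_vectors`, as in the module docstring above; `datapath` resolves the copy of questions-words.txt bundled with gensim, per the review comment above):

    from gensim.test.utils import datapath

    # evaluate_word_analogies returns (overall_score, per_section_details);
    # the last entry in `sections` is the 'Total accuracy' aggregate
    score, sections = word_vectors.evaluate_word_analogies(datapath('questions-words.txt'))
    print('Overall analogy accuracy: %.2f' % score)
    for section in sections:
        correct, incorrect = len(section['correct']), len(section['incorrect'])
        print('%s: %d correct, %d incorrect' % (section['section'], correct, incorrect))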

    @staticmethod
    def log_accuracy(section):
        correct, incorrect = len(section['correct']), len(section['incorrect'])
@@ -859,6 +967,7 @@ def log_accuracy(section):
                section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
            )

@deprecated("Method will be removed in 4.0.0, use self.evaluate_word_analogies() instead")
Contributor: This is the correct way, all fine 👍

    def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
@@ -881,7 +990,6 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
@@ -898,19 +1006,18 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                    raise ValueError("Missing section header before line #%i in %s" % (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    logger.info("Skipping invalid line #%i in %s", line_no, questions)
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                    logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = {a, b, c}  # input words to be ignored
5 changes: 3 additions & 2 deletions gensim/models/word2vec.py
@@ -74,7 +74,8 @@

And on analogies::

    >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
    >>> model.wv.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0]
    0.58

and so on.

@@ -896,7 +897,7 @@ def reset_from(self, other_model):
    def log_accuracy(section):
        return Word2VecKeyedVectors.log_accuracy(section)

@deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
@deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_analogies() instead")
def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
most_similar = most_similar or Word2VecKeyedVectors.most_similar
return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)