Add evaluate_word_analogies (will replace accuracy) method for gensim.models.KeyedVectors #1935
gensim/models/keyedvectors.py

@@ -50,7 +50,8 @@

 And on analogies::

-    >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
+    >>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0]
+    0.58

 and so on.
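For readers skimming the diff, a minimal sketch of how the updated doctest call is meant to be used. Only `datapath`, `load_word2vec_format` and `evaluate_word_analogies` come from gensim and this PR; the vector file name below is an assumption for illustration:

from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Load any vectors in word2vec text format (file name here is illustrative).
word_vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)

# evaluate_word_analogies returns (overall_score, sections);
# indexing with [0], as in the doctest above, keeps only the overall accuracy.
score, sections = word_vectors.evaluate_word_analogies(datapath('questions-words.txt'))
print('Overall analogy accuracy: %.2f' % score)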
@@ -850,6 +851,137 @@ def n_similarity(self, ws1, ws2):
         v2 = [self[word] for word in ws2]
         return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))

+    @staticmethod
+    def log_evaluate_word_analogies(section):
+        correct, incorrect = len(section['correct']), len(section['incorrect'])
+        if correct + incorrect > 0:
+            score = correct / (correct + incorrect)
+            logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
+            return score
+
+    def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
+        """Compute performance of the model on an analogy test set.
+
+        The accuracy is reported (=printed to log and returned as a score)
+        for each section separately, plus there's one aggregate summary
+        at the end.
+        This method corresponds to the `compute-accuracy` script of the
+        original C word2vec.
+
+        Parameters
+        ----------
+        `analogies` is a filename where lines are 4-tuples of words,
+        split into sections by ": SECTION NAME" lines.
+        See questions-words.txt for an example:
+        `from gensim.test.utils import datapath`
+        `datapath("questions-words.txt")`
+
+        Use `restrict_vocab` to ignore all 4-tuples containing a word
+        not in the first `restrict_vocab` words (default 300,000). This
+        may be meaningful if you've sorted the model vocabulary by
+        descending frequency (which is standard in modern word embedding
+        models).
+
+        Use `case_insensitive` to convert all words in 4-tuples and
+        vocabulary to their uppercase form before evaluating
+        the performance (default True). Useful to handle case-mismatch
+        between training tokens and words in the test set.
+        In case of multiple case variants of a single word, the vector
+        for the first occurrence (also the most frequent if vocabulary
+        is sorted) is taken. If `case_insensitive` is True, the first
+        `restrict_vocab` words are taken first, and then case normalization
+        is performed.
+
+        Use `dummy4unknown=True` to produce zero accuracies for 4-tuples
+        with out-of-vocabulary words. Otherwise (default False), these
+        tuples are skipped entirely and not used in the evaluation.
+
+        References
+        ----------
+        See <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>.
+
+        Returns
+        -------
+        float
+            Overall evaluation score
+        list
+            Full lists of correct and incorrect predictions
+
+        """
+        ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
+        ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
+        oov = 0
+        logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies)
+        sections, section = [], None
+        line_no = 1
+        for line_no, line in enumerate(utils.smart_open(analogies)):
+            line = utils.to_unicode(line)
+            if line.startswith(': '):
+                # a new section starts => store the old section
+                if section:
+                    sections.append(section)
+                    self.log_evaluate_word_analogies(section)
+                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
+            else:
+                if not section:
+                    raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies))
+                try:
+                    if case_insensitive:
+                        a, b, c, expected = [word.upper() for word in line.split()]
+                    else:
+                        a, b, c, expected = [word for word in line.split()]
+                except ValueError:
+                    logger.info("Skipping invalid line #%i in %s", line_no, analogies)
+                    continue
+                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
+                    oov += 1
+                    if dummy4unknown:
+                        logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
+                        section['incorrect'].append((a, b, c, expected))
+                    else:
+                        logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
+                    continue
+                original_vocab = self.vocab
+                self.vocab = ok_vocab
+                ignore = {a, b, c}  # input words to be ignored
+                predicted = None
+                # find the most likely prediction using 3CosAdd (vector offset) method
+                # TODO: implement 3CosMul and set-based methods for solving analogies
+                sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
+                self.vocab = original_vocab
+                for element in sims:
+                    predicted = element[0].upper() if case_insensitive else element[0]
+                    if predicted in ok_vocab and predicted not in ignore:
+                        if predicted != expected:
+                            logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
+                        break
+                if predicted == expected:
+                    section['correct'].append((a, b, c, expected))
+                else:
+                    section['incorrect'].append((a, b, c, expected))
+        if section:
+            # store the last section, too
+            sections.append(section)
+            self.log_evaluate_word_analogies(section)
+
+        total = {
+            'section': 'Total accuracy',
+            'correct': sum((s['correct'] for s in sections), []),
+            'incorrect': sum((s['incorrect'] for s in sections), []),
+        }
+
+        oov_ratio = float(oov) / line_no * 100
+        logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
+        if not dummy4unknown:
+            logger.info(
+                'NB: analogies containing OOV words were skipped from evaluation! '
+                'To change this behavior, use "dummy4unknown=True"'
+            )
+        analogies_score = self.log_evaluate_word_analogies(total)
+        sections.append(total)
+        # Return the overall score and the full lists of correct and incorrect analogies
+        return analogies_score, sections
+
     @staticmethod
     def log_accuracy(section):
         correct, incorrect = len(section['correct']), len(section['incorrect'])
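A short usage sketch of the new method against the file format the docstring describes (": SECTION NAME" headers followed by space-separated 4-tuples). The pretrained-vectors file and the analogy lines below are illustrative assumptions, not part of the PR:

from gensim.models import KeyedVectors

# Hypothetical pretrained vectors in word2vec binary format.
wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Build a tiny analogy file in the expected format: ": SECTION NAME" headers,
# then lines of four words (a b c expected).
with open('my-analogies.txt', 'w') as fout:
    fout.write(': capital-common-countries\n')
    fout.write('Athens Greece Baghdad Iraq\n')
    fout.write('Athens Greece Berlin Germany\n')

score, sections = wv.evaluate_word_analogies(
    'my-analogies.txt',
    restrict_vocab=300000,    # only consider the 300,000 most frequent words
    case_insensitive=True,    # uppercase both vocabulary and test words
    dummy4unknown=False,      # skip quadruplets with OOV words instead of scoring them 0
)

# `sections` holds per-section dicts plus a final 'Total accuracy' entry,
# each with the full 'correct' and 'incorrect' lists.
for section in sections:
    n_correct, n_incorrect = len(section['correct']), len(section['incorrect'])
    if n_correct + n_incorrect > 0:
        print('%s: %.1f%%' % (section['section'], 100.0 * n_correct / (n_correct + n_incorrect)))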
@@ -859,6 +991,7 @@ def log_accuracy(section):
                 section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
             )

+    @deprecated("Method will be removed in 4.0.0, use self.evaluate_word_analogies() instead")
     def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
         """
         Compute accuracy of the model. `questions` is a filename where lines are

Review comment on the @deprecated decorator: This is the correct way, all fine 👍
@@ -881,7 +1014,6 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
         occurrence (also the most frequent if vocabulary is sorted) is taken.

-        This method corresponds to the `compute-accuracy` script of the original C word2vec.

         """
         ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
         ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
@@ -898,19 +1030,18 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
                 section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
             else:
                 if not section:
-                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
+                    raise ValueError("Missing section header before line #%i in %s" % (line_no, questions))
                 try:
                     if case_insensitive:
                         a, b, c, expected = [word.upper() for word in line.split()]
                     else:
                         a, b, c, expected = [word for word in line.split()]
                 except ValueError:
-                    logger.info("skipping invalid line #%i in %s", line_no, questions)
+                    logger.info("Skipping invalid line #%i in %s", line_no, questions)
                     continue
                 if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
-                    logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
+                    logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                     continue

                 original_vocab = self.vocab
                 self.vocab = ok_vocab
                 ignore = {a, b, c}  # input words to be ignored
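Both accuracy and the new evaluate_word_analogies answer each quadruplet with the 3CosAdd (vector offset) scheme via most_similar(positive=[b, c], negative=[a]). Below is a rough, self-contained sketch of what that ranking does with raw vectors; it is an illustrative re-implementation, not the library's internal code, and `wv` is assumed to behave like a gensim KeyedVectors instance:

import numpy as np

def solve_analogy_3cosadd(wv, a, b, c, topn=5):
    """Rank candidate answers d by cos(d, b - a + c), excluding the input words."""
    def unit(v):
        return v / np.linalg.norm(v)

    # vector offset: "a is to b as c is to ?"  =>  target direction b - a + c (on unit vectors)
    target = unit(unit(wv[b]) - unit(wv[a]) + unit(wv[c]))
    scores = {}
    for word in wv.index2word:
        if word in (a, b, c):
            continue  # input words are ignored, mirroring `ignore = {a, b, c}` in the diff
        scores[word] = float(np.dot(unit(wv[word]), target))
    return sorted(scores, key=scores.get, reverse=True)[:topn]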
Review discussion on log_evaluate_word_analogies:

Reviewer: Maybe better to hide this method (with a leading _)?

Author: What exactly do you mean? Or maybe you can point to some example of such hiding in the existing Gensim code?

Reviewer: I mean, why not _log_evaluate_word_analogies? I'm asking because this method looks like a helper for evaluate_word_analogies, nothing more.

Author: Done.