From b0c2baca363f37717db87f9d930042875e3c6040 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Mon, 26 Feb 2018 01:40:48 +0100 Subject: [PATCH 01/13] Increased default restrict_vocab in accuracy The `accuracy` function evaluates the performance of word2vec models on the analogy task. The `restrict_vocab` parameter defines which part of the model vocabulary will be used for evaluation. The previous default was the 30 000 most frequent words (analogy questions containing words beyond this threshold are simply skipped). It indeed makes sense to use some kind of limit here, as the evaluation running time depends on the size of the vocabulary used. However, 30 000 is a very small value, with typical models nowadays featuring hundreds of thousands or even millions of words in their vocabularies. This leads to unrealistic evaluation scores, calculated only on small parts of a test set and a model. Therefore, I suggest increasing the default value of `restrict_vocab` 10-fold, up to 300 000. This will be more in line with the typical vocabulary size of contemporary word embedding models, and will also be consistent with the default value for the `evaluate_word_pairs` function. Note that although the original C word2vec does mention 30 000 as a good threshold value for analogies evaluation, the default behavior of its `compute-accuracy` executable is still not to use any threshold (i.e. it evaluates on the whole vocabulary). 
--- gensim/models/keyedvectors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b35a974f4f..738916dd47 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -859,7 +859,7 @@ def log_accuracy(section): section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect ) - def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): + def accuracy(self, questions, restrict_vocab=300000, most_similar=most_similar, case_insensitive=True): """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. @@ -898,17 +898,17 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} else: if not section: - raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) + raise ValueError("Missing section header before line #%i in %s" % (line_no, questions)) try: if case_insensitive: a, b, c, expected = [word.upper() for word in line.split()] else: a, b, c, expected = [word for word in line.split()] except ValueError: - logger.info("skipping invalid line #%i in %s", line_no, questions) + logger.info("Skipping invalid line #%i in %s", line_no, questions) continue if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) + logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) continue original_vocab = self.vocab From 7af40212f44503a44e71774ad343bc8d5e8c8894 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 21:56:51 +0200 Subject: [PATCH 02/13] New word analogies method New method `evaluate_word_analogies` to solve word analogies. 
Implements more sensible frequency threshold and the `dummy4unknown` parameter. Also, works two times faster than the previous `accuracy` method which is now deprecated. --- gensim/models/keyedvectors.py | 114 +++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 738916dd47..ae8f373b87 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -850,6 +850,117 @@ def n_similarity(self, ws1, ws2): v2 = [self[word] for word in ws2] return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) + @staticmethod + def log_evaluate_word_analogies(section): + correct, incorrect = len(section['correct']), len(section['incorrect']) + if correct + incorrect > 0: + score = correct / (correct + incorrect) + logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) + return score + + def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + """ + Compute performance of the model on an analogy test set + (see https://aclweb.org/aclwiki/Analogy_(State_of_the_art)). + `analogies` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. + See questions-words.txt in + https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip + for an example. + + The accuracy is reported (=printed to log and returned as a score) for each section separately, + plus there's one aggregate summary at the end. + + Use `restrict_vocab` to ignore all 4-tuples containing a word not in the first `restrict_vocab` + words (default 300,000). This may be meaningful if you've sorted the model vocabulary by descending frequency + (which is standard in modern word embedding models). 
+ + If `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then + case normalization is performed. + Use `case_insensitive` to convert all words in 4-tuples and vocabulary to their uppercase form before + evaluating the performance (default True). Useful to handle case-mismatch between training tokens + and words in the test set. In case of multiple case variants of a single word, the vector for the first + occurrence (also the most frequent if vocabulary is sorted) is taken. + + Use `dummy4unknown=True` to produce zero accuracies for 4-tuples with out-of-vocabulary words. + Otherwise (default False), these tuples are skipped entirely and not used in the evaluation. + + This method corresponds to the `compute-accuracy` script of the original C word2vec. + + """ + ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] + ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) + oov = 0 + + logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies) + + sections, section = [], None + for line_no, line in enumerate(utils.smart_open(analogies)): + line = utils.to_unicode(line) + if line.startswith(': '): + # a new section starts => store the old section + if section: + sections.append(section) + self.log_evaluate_word_analogies(section) + section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} + else: + if not section: + raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies)) + try: + if case_insensitive: + a, b, c, expected = [word.upper() for word in line.split()] + else: + a, b, c, expected = [word for word in line.split()] + except ValueError: + logger.info("Skipping invalid line #%i in %s", line_no, analogies) + continue + if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: + oov += 1 + if dummy4unknown: + logger.debug('Zero accuracy for 
line #%d with OOV words: %s', line_no, line.strip()) + section['incorrect'].append((a, b, c, expected)) + else: + logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) + continue + + original_vocab = self.vocab + self.vocab = ok_vocab + ignore = {a, b, c} # input words to be ignored + predicted = None + # find the most likely prediction using 3CosAdd (vector offset) method + # TODO: implement 3CosMul and set-based methods for solving analogies + sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) + self.vocab = original_vocab + for element in sims: + predicted = element[0].upper() if case_insensitive else element[0] + if predicted in ok_vocab and predicted not in ignore: + if predicted != expected: + logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) + break + if predicted == expected: + section['correct'].append((a, b, c, expected)) + else: + section['incorrect'].append((a, b, c, expected)) + if section: + # store the last section, too + sections.append(section) + self.log_evaluate_word_analogies(section) + + total = { + 'section': 'Total accuracy', + 'correct': sum((s['correct'] for s in sections), []), + 'incorrect': sum((s['incorrect'] for s in sections), []), + } + + oov_ratio = float(oov) / line_no * 100 + logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio) + if not dummy4unknown: + logger.info('NB: analogies containing OOV words were skipped from evaluation! 
' + 'To change this behavior, use "dummy4unknown=True"') + analogies_score = self.log_evaluate_word_analogies(total) + sections.append(total) + # Return the overall score and the full lists of correct and incorrect analogies + return analogies_score, sections + @staticmethod def log_accuracy(section): correct, incorrect = len(section['correct']), len(section['incorrect']) @@ -859,7 +970,8 @@ def log_accuracy(section): section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect ) - def accuracy(self, questions, restrict_vocab=300000, most_similar=most_similar, case_insensitive=True): + @deprecated("Method will be removed in 4.0.0, use self.evaluate_word_analogies() instead") + def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): """ Compute accuracy of the model. `questions` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. From 73e8c721533e6748612a6b34e250ee42a2dbedb5 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 22:14:16 +0200 Subject: [PATCH 03/13] Mention new word analogies method in the doc --- gensim/models/keyedvectors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index ae8f373b87..1b2bd215ba 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -50,7 +50,9 @@ And on analogies:: - >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) + >>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0] + 0.58 + and so on. 
From d0f09eaf4d2ad6a0fe41c014826abd08a71dc7cf Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 22:17:40 +0200 Subject: [PATCH 04/13] Refer to new word analogies method in word2vec.py --- gensim/models/word2vec.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f51b4cd25f..7ca85a6340 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -74,7 +74,8 @@ And on analogies:: - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) + >>> model.wv.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0] + 0.58 and so on. @@ -896,7 +897,7 @@ def reset_from(self, other_model): def log_accuracy(section): return Word2VecKeyedVectors.log_accuracy(section) - @deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead") + @deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_analogies() instead") def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True): most_similar = most_similar or Word2VecKeyedVectors.most_similar return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) From bf9c750d2d22e345a000667d920adc5cfb7e5deb Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 23:08:37 +0200 Subject: [PATCH 05/13] Removed redundant spaces --- gensim/models/keyedvectors.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1b2bd215ba..8d2e4762d4 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -51,8 +51,7 @@ And on analogies:: >>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0] - 0.58 - + 0.58 and so on. 
@@ -994,8 +993,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c and question words. In case of multiple case variants of a single word, the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken. - This method corresponds to the `compute-accuracy` script of the original C word2vec. - + This method corresponds to the `compute-accuracy` script of the original C word2vec. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) From d39176c920d654e2161ff1230e10b1916372947d Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 23:39:57 +0200 Subject: [PATCH 06/13] Removed more redundant spaces --- gensim/models/keyedvectors.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 8d2e4762d4..23d4b60063 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -51,7 +51,7 @@ And on analogies:: >>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0] - 0.58 + 0.58 and so on. @@ -886,14 +886,11 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi Otherwise (default False), these tuples are skipped entirely and not used in the evaluation. This method corresponds to the `compute-accuracy` script of the original C word2vec. 
- """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) oov = 0 - logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies) - sections, section = [], None for line_no, line in enumerate(utils.smart_open(analogies)): line = utils.to_unicode(line) @@ -922,7 +919,6 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi else: logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) continue - original_vocab = self.vocab self.vocab = ok_vocab ignore = {a, b, c} # input words to be ignored @@ -1022,7 +1018,6 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) continue - original_vocab = self.vocab self.vocab = ok_vocab ignore = {a, b, c} # input words to be ignored From f2a860512f47f7639e7cd429445d45236b9b35b5 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Sun, 1 Apr 2018 23:56:30 +0200 Subject: [PATCH 07/13] Another round of space-elimination... --- gensim/models/keyedvectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 23d4b60063..9d81d7e762 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -874,7 +874,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi Use `restrict_vocab` to ignore all 4-tuples containing a word not in the first `restrict_vocab` words (default 300,000). This may be meaningful if you've sorted the model vocabulary by descending frequency (which is standard in modern word embedding models). 
- + If `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then case normalization is performed. Use `case_insensitive` to convert all words in 4-tuples and vocabulary to their uppercase form before @@ -957,7 +957,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi sections.append(total) # Return the overall score and the full lists of correct and incorrect analogies return analogies_score, sections - + @staticmethod def log_accuracy(section): correct, incorrect = len(section['correct']), len(section['incorrect']) @@ -989,7 +989,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c and question words. In case of multiple case variants of a single word, the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken. - This method corresponds to the `compute-accuracy` script of the original C word2vec. + This method corresponds to the `compute-accuracy` script of the original C word2vec. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) From f9d175dfe8e69f2eb341b0f5767d62a7a0282704 Mon Sep 17 00:00:00 2001 From: akutuzov Date: Mon, 2 Apr 2018 20:57:05 +0200 Subject: [PATCH 08/13] Code polishing. --- gensim/models/keyedvectors.py | 70 +++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 9d81d7e762..a8ef08aeac 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -860,38 +860,60 @@ def log_evaluate_word_analogies(section): return score def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): - """ - Compute performance of the model on an analogy test set - (see https://aclweb.org/aclwiki/Analogy_(State_of_the_art)). 
- `analogies` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. - See questions-words.txt in - https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip - for an example. + """Compute performance of the model on an analogy test set. - The accuracy is reported (=printed to log and returned as a score) for each section separately, - plus there's one aggregate summary at the end. + The accuracy is reported (=printed to log and returned as a score) + for each section separately, plus there's one aggregate summary + at the end. + This method corresponds to the `compute-accuracy` script of the + original C word2vec. - Use `restrict_vocab` to ignore all 4-tuples containing a word not in the first `restrict_vocab` - words (default 300,000). This may be meaningful if you've sorted the model vocabulary by descending frequency - (which is standard in modern word embedding models). + Parameters + ---------- + `analogies` is a filename where lines are 4-tuples of words, + split into sections by ": SECTION NAME" lines. + See questions-words.txt for an example: + `from gensim.test.utils import datapath + datapath("questions-words.txt")` + + Use `restrict_vocab` to ignore all 4-tuples containing a word + not in the first `restrict_vocab` words (default 300,000). This + may be meaningful if you've sorted the model vocabulary by + descending frequency (which is standard in modern word embedding + models). + + Use `case_insensitive` to convert all words in 4-tuples and + vocabulary to their uppercase form before evaluating + the performance (default True). Useful to handle case-mismatch + between training tokens and words in the test set. + In case of multiple case variants of a single word, the vector + for the first occurrence (also the most frequent if vocabulary + is sorted) is taken. 
If `case_insensitive` is True, the first + `restrict_vocab` words are taken first, and then case normalization + is performed. - If `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then - case normalization is performed. - Use `case_insensitive` to convert all words in 4-tuples and vocabulary to their uppercase form before - evaluating the performance (default True). Useful to handle case-mismatch between training tokens - and words in the test set. In case of multiple case variants of a single word, the vector for the first - occurrence (also the most frequent if vocabulary is sorted) is taken. + Use `dummy4unknown=True` to produce zero accuracies for 4-tuples + with out-of-vocabulary words. Otherwise (default False), these + tuples are skipped entirely and not used in the evaluation. - Use `dummy4unknown=True` to produce zero accuracies for 4-tuples with out-of-vocabulary words. - Otherwise (default False), these tuples are skipped entirely and not used in the evaluation. + References + ------- + See . + + Returns + ------- + float + Overall evaluation score + list + Full lists of correct and incorrect predictions - This method corresponds to the `compute-accuracy` script of the original C word2vec. 
""" ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) oov = 0 logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies) sections, section = [], None + line_no = 1 for line_no, line in enumerate(utils.smart_open(analogies)): line = utils.to_unicode(line) if line.startswith(': '): @@ -951,8 +973,10 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi oov_ratio = float(oov) / line_no * 100 logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio) if not dummy4unknown: - logger.info('NB: analogies containing OOV words were skipped from evaluation! ' - 'To change this behavior, use "dummy4unknown=True"') + logger.info( + 'NB: analogies containing OOV words were skipped from evaluation! ' + 'To change this behavior, use "dummy4unknown=True"' + ) analogies_score = self.log_evaluate_word_analogies(total) sections.append(total) # Return the overall score and the full lists of correct and incorrect analogies From 3040a312ee0a0530bc29c3ba50d8c01e94d985ff Mon Sep 17 00:00:00 2001 From: akutuzov Date: Mon, 2 Apr 2018 21:07:29 +0200 Subject: [PATCH 09/13] Fix for docstring --- gensim/models/keyedvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index a8ef08aeac..f0bdc266eb 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -873,8 +873,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi `analogies` is a filename where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. 
See questions-words.txt for an example: - `from gensim.test.utils import datapath - datapath("questions-words.txt")` + `from gensim.test.utils import datapath` + `datapath("questions-words.txt")` Use `restrict_vocab` to ignore all 4-tuples containing a word not in the first `restrict_vocab` words (default 300,000). This From 175100b215f417e719f98847b3057b23c831ffb1 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Tue, 3 Apr 2018 16:46:32 +0200 Subject: [PATCH 10/13] Hide log method, fix the docstring --- gensim/models/keyedvectors.py | 76 +++++++++++++++++------------------ 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index f0bdc266eb..588a6525de 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -852,7 +852,7 @@ def n_similarity(self, ws1, ws2): return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) @staticmethod - def log_evaluate_word_analogies(section): + def _log_evaluate_word_analogies(section): correct, incorrect = len(section['correct']), len(section['incorrect']) if correct + incorrect > 0: score = correct / (correct + incorrect) @@ -867,45 +867,44 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi at the end. This method corresponds to the `compute-accuracy` script of the original C word2vec. + See also `Analogy (State of the art) `. Parameters ---------- - `analogies` is a filename where lines are 4-tuples of words, - split into sections by ": SECTION NAME" lines. - See questions-words.txt for an example: - `from gensim.test.utils import datapath` - `datapath("questions-words.txt")` - - Use `restrict_vocab` to ignore all 4-tuples containing a word - not in the first `restrict_vocab` words (default 300,000). This - may be meaningful if you've sorted the model vocabulary by - descending frequency (which is standard in modern word embedding - models). 
- - Use `case_insensitive` to convert all words in 4-tuples and - vocabulary to their uppercase form before evaluating - the performance (default True). Useful to handle case-mismatch - between training tokens and words in the test set. - In case of multiple case variants of a single word, the vector - for the first occurrence (also the most frequent if vocabulary - is sorted) is taken. If `case_insensitive` is True, the first - `restrict_vocab` words are taken first, and then case normalization - is performed. - - Use `dummy4unknown=True` to produce zero accuracies for 4-tuples - with out-of-vocabulary words. Otherwise (default False), these - tuples are skipped entirely and not used in the evaluation. + analogies : str + filename where lines are 4-tuples of words, + split into sections by ": SECTION NAME" lines. + See questions-words.txt for an example: + `from gensim.test.utils import datapath` + `datapath("questions-words.txt")` - References - ------- - See . + restrict_vocab : int + ignore all 4-tuples containing a word not in the first + `restrict_vocab` words (default 300,000). This may be + meaningful if you've sorted the model vocabulary by descending + frequency (which is standard in modern word embedding models). + + case_insensitive : bool + convert all words in 4-tuples and vocabulary to their + uppercase form before evaluating the performance + (default True). Useful to handle case-mismatch + between training tokens and words in the test set. + In case of multiple case variants of a single word, the vector + for the first occurrence (also the most frequent if vocabulary + is sorted) is taken. If `case_insensitive` is True, the first + `restrict_vocab` words are taken first, and then case normalization + is performed. + + dummy4unknown : bool + produce zero accuracies for 4-tuples with out-of-vocabulary words. + Otherwise (default False), these tuples are skipped entirely + and not used in the evaluation. 
Returns ------- - float - Overall evaluation score - list - Full lists of correct and incorrect predictions + (float, list) + Overall evaluation score and full lists of correct and + incorrect predictions """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] @@ -913,14 +912,14 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi oov = 0 logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies) sections, section = [], None - line_no = 1 + quadruplets_no = 0 for line_no, line in enumerate(utils.smart_open(analogies)): line = utils.to_unicode(line) if line.startswith(': '): # a new section starts => store the old section if section: sections.append(section) - self.log_evaluate_word_analogies(section) + self._log_evaluate_word_analogies(section) section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} else: if not section: @@ -933,6 +932,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi except ValueError: logger.info("Skipping invalid line #%i in %s", line_no, analogies) continue + quadruplets_no += 1 if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: oov += 1 if dummy4unknown: @@ -962,7 +962,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi if section: # store the last section, too sections.append(section) - self.log_evaluate_word_analogies(section) + self._log_evaluate_word_analogies(section) total = { 'section': 'Total accuracy', @@ -970,14 +970,14 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi 'incorrect': sum((s['incorrect'] for s in sections), []), } - oov_ratio = float(oov) / line_no * 100 + oov_ratio = float(oov) / quadruplets_no * 100 logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio) if not dummy4unknown: logger.info( 'NB: analogies containing OOV words were 
skipped from evaluation! ' 'To change this behavior, use "dummy4unknown=True"' ) - analogies_score = self.log_evaluate_word_analogies(total) + analogies_score = self._log_evaluate_word_analogies(total) sections.append(total) # Return the overall score and the full lists of correct and incorrect analogies return analogies_score, sections From 0ad6a49096e64e9d4ac42209eb7d2050eaf6cbdc Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Tue, 3 Apr 2018 17:37:42 +0200 Subject: [PATCH 11/13] Docstring updated. --- gensim/models/keyedvectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 588a6525de..4ced124b0a 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -872,8 +872,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi Parameters ---------- analogies : str - filename where lines are 4-tuples of words, - split into sections by ": SECTION NAME" lines. + filename where lines are 4-tuples of words, + split into sections by ": SECTION NAME" lines. 
See questions-words.txt for an example: `from gensim.test.utils import datapath` `datapath("questions-words.txt")` @@ -910,7 +910,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) oov = 0 - logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies) + logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies) sections, section = [], None quadruplets_no = 0 for line_no, line in enumerate(utils.smart_open(analogies)): From 68c3316a000b021dee58e15c9a68bf0154b10da5 Mon Sep 17 00:00:00 2001 From: Andrey Kutuzov Date: Tue, 3 Apr 2018 17:39:25 +0200 Subject: [PATCH 12/13] Removed redundant spaces. --- gensim/models/keyedvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 4ced124b0a..485d3c0b39 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -872,8 +872,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi Parameters ---------- analogies : str - filename where lines are 4-tuples of words, - split into sections by ": SECTION NAME" lines. + filename where lines are 4-tuples of words, + split into sections by ": SECTION NAME" lines. 
 See questions-words.txt for an example: `from gensim.test.utils import datapath` `datapath("questions-words.txt")` From 6c6823d4bc0050cf0949d0cd2d8011ce29cd96e8 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 4 Apr 2018 00:31:50 +0500 Subject: [PATCH 13/13] cleanup docstrings --- gensim/models/keyedvectors.py | 74 ++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 485d3c0b39..fa1999aa8d 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -853,6 +853,20 @@ def n_similarity(self, ws1, ws2): @staticmethod def _log_evaluate_word_analogies(section): + """Calculate score by section, helper for + :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.evaluate_word_analogies`. + + Parameters + ---------- + section : dict of (str, (str, str, str, str)) + Section given from evaluation. + + Returns + ------- + float + Accuracy score. + + """ correct, incorrect = len(section['correct']), len(section['incorrect']) if correct + incorrect > 0: score = correct / (correct + incorrect) @@ -862,49 +876,37 @@ def _log_evaluate_word_analogies(section): def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): """Compute performance of the model on an analogy test set. - The accuracy is reported (=printed to log and returned as a score) - for each section separately, plus there's one aggregate summary - at the end. - This method corresponds to the `compute-accuracy` script of the - original C word2vec. - See also `Analogy (State of the art) `. + This is a modern variant of :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.accuracy`, see + `discussion on GitHub #1935 <https://github.com/RaRe-Technologies/gensim/pull/1935>`_. + + The accuracy is reported (printed to log and returned as a score) for each section separately, + plus there's one aggregate summary at the end. 
+ + This method corresponds to the `compute-accuracy` script of the original C word2vec. + See also `Analogy (State of the art) <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>`_. Parameters ---------- analogies : str - filename where lines are 4-tuples of words, - split into sections by ": SECTION NAME" lines. - See questions-words.txt for an example: - `from gensim.test.utils import datapath` - `datapath("questions-words.txt")` - - restrict_vocab : int - ignore all 4-tuples containing a word not in the first - `restrict_vocab` words (default 300,000). This may be - meaningful if you've sorted the model vocabulary by descending - frequency (which is standard in modern word embedding models). - - case_insensitive : bool - convert all words in 4-tuples and vocabulary to their - uppercase form before evaluating the performance - (default True). Useful to handle case-mismatch - between training tokens and words in the test set. - In case of multiple case variants of a single word, the vector - for the first occurrence (also the most frequent if vocabulary - is sorted) is taken. If `case_insensitive` is True, the first - `restrict_vocab` words are taken first, and then case normalization - is performed. - - dummy4unknown : bool - produce zero accuracies for 4-tuples with out-of-vocabulary words. - Otherwise (default False), these tuples are skipped entirely - and not used in the evaluation. + Path to file, where lines are 4-tuples of words, split into sections by ": SECTION NAME" lines. + See `gensim/test/test_data/questions-words.txt` as an example. + restrict_vocab : int, optional + Ignore all 4-tuples containing a word not in the first `restrict_vocab` words. + This may be meaningful if you've sorted the model vocabulary by descending frequency (which is standard + in modern word embedding models). + case_insensitive : bool, optional + If True - convert all words to their uppercase form before evaluating the performance. + Useful to handle case-mismatch between training tokens and words in the test set. 
+ In case of multiple case variants of a single word, the vector for the first occurrence + (also the most frequent if vocabulary is sorted) is taken. + dummy4unknown : bool, optional + If True - produce zero accuracies for 4-tuples with out-of-vocabulary words. + Otherwise, these tuples are skipped entirely and not used in the evaluation. Returns ------- - (float, list) - Overall evaluation score and full lists of correct and - incorrect predictions + (float, list of dict of (str, (str, str, str, str))) + Overall evaluation score and full lists of correct and incorrect predictions divided by sections. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]