
Add evaluate_word_analogies (will replace accuracy) method for gensim.models.KeyedVectors #1935

Merged: 13 commits, Apr 3, 2018
143 changes: 137 additions & 6 deletions gensim/models/keyedvectors.py
@@ -50,7 +50,8 @@

And on analogies::

>>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
>>> word_vectors.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0]
0.58

and so on.

@@ -850,6 +851,137 @@ def n_similarity(self, ws1, ws2):
        v2 = [self[word] for word in ws2]
        return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))

@staticmethod
def log_evaluate_word_analogies(section):
Contributor: Maybe better to hide this method (with a leading _)?

Contributor Author: What exactly do you mean? Or maybe you can point to an example of such hiding in the existing Gensim code?

Contributor: I mean, why not _log_evaluate_word_analogies? I'm asking because this method looks like a helper for evaluate_word_analogies, nothing more.

Contributor Author: Done.

    correct, incorrect = len(section['correct']), len(section['incorrect'])
    if correct + incorrect > 0:
        score = correct / (correct + incorrect)
        logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
        return score
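
For context, a minimal sketch (not part of the diff) of what this helper computes, using a hypothetical section dict shaped like the ones the main method builds below; the class name KeyedVectors is taken from the PR title:

    # Hypothetical input, mirroring the {'section', 'correct', 'incorrect'}
    # dicts constructed by evaluate_word_analogies below.
    section = {
        'section': 'capital-common-countries',
        'correct': [('ATHENS', 'GREECE', 'OSLO', 'NORWAY')] * 3,
        'incorrect': [('ATHENS', 'GREECE', 'BANGKOK', 'THAILAND')],
    }
    # Logs "capital-common-countries: 75.0% (3/4)" and returns 0.75
    # (assuming true division, i.e. Python 3 or `from __future__ import division`);
    # for an empty section it logs nothing and implicitly returns None.
    score = KeyedVectors.log_evaluate_word_analogies(section)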

def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
"""Compute performance of the model on an analogy test set.

The accuracy is reported (=printed to log and returned as a score)
for each section separately, plus there's one aggregate summary
at the end.
This method corresponds to the `compute-accuracy` script of the
original C word2vec.

Parameters
----------
`analogies` is a filename where lines are 4-tuples of words,
Contributor: Should be

    parameter_1 : type_1
        Description_1.
    parameter_2 : type_2
        Description_2.
    ...

For example: https://github.com/RaRe-Technologies/gensim/blob/1611f3a477686bf9f462d0ed09b1daff2a58f09e/gensim/scripts/glove2word2vec.py#L89-L103

Contributor Author: Done.

    split into sections by ": SECTION NAME" lines.
    See questions-words.txt for an example:
    `from gensim.test.utils import datapath`
    `datapath("questions-words.txt")`

    Use `restrict_vocab` to ignore all 4-tuples containing a word not in the
    first `restrict_vocab` words (default 300,000). This may be meaningful if
    you've sorted the model vocabulary by descending frequency (which is
    standard in modern word embedding models).

    Use `case_insensitive` to convert all words in the 4-tuples and in the
    vocabulary to uppercase before evaluating performance (default True).
    Useful to handle case mismatch between training tokens and words in the
    test set. If there are multiple case variants of a single word, the vector
    for the first occurrence (also the most frequent, if the vocabulary is
    sorted) is taken. If `case_insensitive` is True, the first `restrict_vocab`
    words are selected first, and case normalization is performed afterwards.

    Use `dummy4unknown=True` to produce zero accuracies for 4-tuples with
    out-of-vocabulary words. Otherwise (default False), these tuples are
    skipped entirely and not used in the evaluation.

    References
menshikh-iv (Contributor, Apr 3, 2018): Please don't use a References section (this will cause problems in the future, "thanks" to the autosummary Sphinx plugin); add it simply as a link with a description (as I mentioned before in #1935 (comment)).

Contributor Author: Done.

    ----------
    See <https://aclweb.org/aclwiki/Analogy_(State_of_the_art)>.

    Returns
    -------
    float
        Overall evaluation score.
    list
        Full lists of correct and incorrect predictions, divided into sections.

    """
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
    oov = 0
    logger.info("Evaluating word analogies for top %i most frequent words on %s", restrict_vocab, analogies)
    sections, section = [], None
    line_no = 1
    for line_no, line in enumerate(utils.smart_open(analogies)):
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                self.log_evaluate_word_analogies(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split()]
                else:
                    a, b, c, expected = [word for word in line.split()]
            except ValueError:
                logger.info("Skipping invalid line #%i in %s", line_no, analogies)
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                oov += 1
                if dummy4unknown:
                    logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip())
                    section['incorrect'].append((a, b, c, expected))
                else:
                    logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                continue
            original_vocab = self.vocab
            self.vocab = ok_vocab
            ignore = {a, b, c}  # input words to be ignored
            predicted = None
            # find the most likely prediction using 3CosAdd (vector offset) method
            # TODO: implement 3CosMul and set-based methods for solving analogies
            sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
            self.vocab = original_vocab
            for element in sims:
                predicted = element[0].upper() if case_insensitive else element[0]
                if predicted in ok_vocab and predicted not in ignore:
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
        self.log_evaluate_word_analogies(section)

    total = {
        'section': 'Total accuracy',
        'correct': sum((s['correct'] for s in sections), []),
        'incorrect': sum((s['incorrect'] for s in sections), []),
    }

    oov_ratio = float(oov) / line_no * 100
    logger.info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
    if not dummy4unknown:
        logger.info(
            'NB: analogies containing OOV words were skipped from evaluation! '
            'To change this behavior, use "dummy4unknown=True"'
        )
    analogies_score = self.log_evaluate_word_analogies(total)
    sections.append(total)
    # Return the overall score and the full lists of correct and incorrect analogies
    return analogies_score, sections
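
For reviewers, a hedged usage sketch of the new API (the vectors path and printed numbers are illustrative, not from this PR); it exercises the parameters described in the docstring above:

    from gensim.models import KeyedVectors
    from gensim.test.utils import datapath

    # Load any word2vec-format vectors; 'vectors.bin' is a hypothetical path.
    wv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)

    score, sections = wv.evaluate_word_analogies(
        datapath('questions-words.txt'),  # bundled analogy test set
        restrict_vocab=300000,    # only the 300k most frequent words
        case_insensitive=True,    # uppercase vocab and questions before matching
        dummy4unknown=False,      # skip OOV quadruplets rather than score them 0
    )
    print('overall accuracy: %.4f' % score)
    for s in sections:
        # every section dict carries 'section', 'correct' and 'incorrect' keys
        print(s['section'], len(s['correct']), len(s['incorrect']))

Internally, the prediction step is 3CosAdd (vector offset): most_similar(positive=[b, c], negative=[a]) ranks candidates by closeness to b - a + c, and the top candidate outside the input words is taken as the prediction.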

@staticmethod
def log_accuracy(section):
    correct, incorrect = len(section['correct']), len(section['incorrect'])
@@ -859,6 +991,7 @@ def log_accuracy(section):
            section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect
        )

@deprecated("Method will be removed in 4.0.0, use self.evaluate_word_analogies() instead")
Contributor: This is the correct way, all fine 👍

def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
"""
Compute accuracy of the model. `questions` is a filename where lines are
Expand All @@ -881,7 +1014,6 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
occurrence (also the most frequent if vocabulary is sorted) is taken.

This method corresponds to the `compute-accuracy` script of the original C word2vec.

"""
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab)
@@ -898,19 +1030,18 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                raise ValueError("Missing section header before line #%i in %s" % (line_no, questions))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split()]
                else:
                    a, b, c, expected = [word for word in line.split()]
            except ValueError:
                logger.info("skipping invalid line #%i in %s", line_no, questions)
                logger.info("Skipping invalid line #%i in %s", line_no, questions)
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip())
                continue

            original_vocab = self.vocab
            self.vocab = ok_vocab
            ignore = {a, b, c}  # input words to be ignored
5 changes: 3 additions & 2 deletions gensim/models/word2vec.py
@@ -74,7 +74,8 @@

And on analogies::

>>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt'))
>>> model.wv.evaluate_word_analogies(os.path.join(module_path, 'test_data', 'questions-words.txt'))[0]
0.58

and so on.

@@ -896,7 +897,7 @@ def reset_from(self, other_model):
def log_accuracy(section):
    return Word2VecKeyedVectors.log_accuracy(section)

@deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
@deprecated("Method will be removed in 4.0.0, use self.wv.evaluate_word_analogies() instead")
def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
    most_similar = most_similar or Word2VecKeyedVectors.most_similar
    return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)
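
For downstream users, the migration these deprecation messages point to would look roughly like this (sketch; `model` stands for any trained Word2Vec instance and `questions_path` is a placeholder):

    # before (deprecated, to be removed in 4.0.0) -- returned only the per-section dicts:
    #     sections = model.accuracy(questions_path)
    # after -- returns (overall_score, per_section_dicts):
    score, sections = model.wv.evaluate_word_analogies(questions_path)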