streamlining most_similar_cosmul and evaluate_word_analogies (#2656)
* streamlining most_similar_cosmul

* Fix PR requested changes and add unit test

* fix merge artifacts

Co-authored-by: n3hrox <n3hrox@gmail.com>
Co-authored-by: Michael Penkov <m@penkov.dev>
3 people authored Mar 22, 2022
1 parent 3ce81a4 commit ac3bbcd
Showing 3 changed files with 34 additions and 3 deletions.
28 changes: 25 additions & 3 deletions gensim/models/keyedvectors.py
@@ -946,7 +946,9 @@ def nbow(document):
# Compute WMD.
return emd(d1, d2, distance_matrix)

def most_similar_cosmul(self, positive=None, negative=None, topn=10):
def most_similar_cosmul(
self, positive=None, negative=None, topn=10, restrict_vocab=None
):
"""Find the top-N most similar words, using the multiplicative combination objective,
proposed by `Omer Levy and Yoav Goldberg "Linguistic Regularities in Sparse and Explicit Word Representations"
<http://www.aclweb.org/anthology/W14-1618>`_. Positive words still contribute positively towards the similarity,
@@ -959,6 +961,9 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
With a single positive example, rankings will be the same as in the default
:meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`.
Allows calls like most_similar_cosmul('dog', 'cat'), as a shorthand for
most_similar_cosmul(['dog'], ['cat']) where 'dog' is positive and 'cat' negative
Parameters
----------
positive : list of str, optional
@@ -968,6 +973,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int or None, optional
Optional integer which limits the range of vectors which are searched for most-similar values.
For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order.
This may be meaningful if vocabulary is sorted by descending frequency.
Returns
-------
@@ -985,7 +995,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
positive = _ensure_list(positive)
negative = _ensure_list(negative)

self.fill_norms()
self.init_sims()

if isinstance(positive, str):
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
positive = [positive]

if isinstance(negative, str):
negative = [negative]

all_words = {
self.get_index(word) for word in positive + negative
@@ -1205,7 +1222,9 @@ def _log_evaluate_word_analogies(section):
logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
return score

def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
def evaluate_word_analogies(
self, analogies, restrict_vocab=300000, case_insensitive=True,
dummy4unknown=False, similarity_function='most_similar'):
"""Compute performance of the model on an analogy test set.
The accuracy is reported (printed to log and returned as a score) for each section separately,
@@ -1231,6 +1250,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
dummy4unknown : bool, optional
If True - produce zero accuracies for 4-tuples with out-of-vocabulary words.
Otherwise, these tuples are skipped entirely and not used in the evaluation.
similarity_function : str, optional
Function name used for similarity calculation.
Returns
-------
@@ -1286,6 +1307,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
predicted = None
# find the most likely prediction using 3CosAdd (vector offset) method
# TODO: implement 3CosMul and set-based methods for solving analogies

sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
self.key_to_index = original_key_to_index
for element in sims:
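The keyedvectors.py hunks above let most_similar_cosmul accept bare strings for positive/negative and add a restrict_vocab argument. Below is a minimal usage sketch under those assumptions, together with an illustrative version of the multiplicative (3CosMul) objective the docstring cites from Levy & Goldberg; the cosmul_score helper, the vector file name, and the epsilon value are illustrative choices, not taken from the diff.

import numpy as np
from gensim.models import KeyedVectors

wv = KeyedVectors.load("vectors.kv")  # hypothetical path to previously saved vectors

# String shorthand enabled above: equivalent to the explicit list form.
wv.most_similar_cosmul('dog', 'cat')
wv.most_similar_cosmul(positive=['dog'], negative=['cat'])

# restrict_vocab limits the candidate search to the first N stored vectors,
# which is meaningful when the vocabulary is sorted by descending frequency.
wv.most_similar_cosmul(positive=['king', 'woman'], negative=['man'], topn=10, restrict_vocab=10000)

def cosmul_score(candidate, positive, negative, eps=1e-6):
    """Illustrative 3CosMul objective: product of shifted cosines against the
    positive examples divided by the product against the negative examples."""
    def shifted_cos(u, v):
        # cosine similarity rescaled from [-1, 1] to [0, 1] so the products stay positive
        return (1.0 + np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))) / 2.0

    pos = np.prod([shifted_cos(candidate, p) for p in positive])
    neg = np.prod([shifted_cos(candidate, n) for n in negative])
    return pos / (neg + eps)
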
3 changes: 3 additions & 0 deletions gensim/test/test_fasttext.py
@@ -373,6 +373,9 @@ def test_most_similar_cosmul(self):
self.assertEqual(
self.test_model.wv.most_similar_cosmul('nights'),
self.test_model.wv.most_similar_cosmul(positive=['nights']))
self.assertEqual(
self.test_model.wv.most_similar_cosmul('the', 'and'),
self.test_model.wv.most_similar_cosmul(positive=['the'], negative=['and']))

def test_lookup(self):
# In vocab, sanity check
6 changes: 6 additions & 0 deletions gensim/test/test_word2vec.py
@@ -555,6 +555,12 @@ def test_evaluate_word_analogies(self):
"""Test that evaluating analogies on KeyedVectors give sane results"""
model = word2vec.Word2Vec(LeeCorpus())
score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
score_cosmul, sections_cosmul = model.wv.evaluate_word_analogies(
datapath('questions-words.txt'),
similarity_function='most_similar_cosmul'
)
self.assertEqual(score, score_cosmul)
self.assertEqual(sections, sections_cosmul)
self.assertGreaterEqual(score, 0.0)
self.assertLessEqual(score, 1.0)
self.assertGreater(len(sections), 0)
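The test added above passes the new similarity_function keyword to evaluate_word_analogies and asserts that the resulting score and sections match the default run; together with the TODO left in the keyedvectors.py hunk ("implement 3CosMul and set-based methods"), this reads as the parameter being accepted for forward compatibility while analogies are still solved through most_similar. A minimal sketch of the call, assuming wv is a trained gensim KeyedVectors instance:

from gensim.test.utils import datapath

# wv: a trained gensim KeyedVectors instance (e.g. Word2Vec(corpus).wv)

# Default behaviour: analogies are solved with the additive (3CosAdd) offset method.
score_add, sections_add = wv.evaluate_word_analogies(datapath('questions-words.txt'))

# Keyword added in this commit; the string names the similarity method to use.
score_mul, sections_mul = wv.evaluate_word_analogies(
    datapath('questions-words.txt'),
    similarity_function='most_similar_cosmul',
)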
