
Fix WordEmbeddingsKeyedVectors.most_similar (#2461)
* Use topn=None in WordEmbeddingsKeyedVectors.accuracy

* Implement and document topn=None

* Make topn=None disable indexer in most_similar

* Use `or` instead of `{...}` for documenting a union of types

* Reword documentation on return value of most_similar and related methods

* Use `or` instead of `{...}` for documenting a union of types

* Document the return value dimensions of most_similar and related methods

* Extend the bugfix from WordEmbeddingsKeyedVectors to Doc2VecKeyedVectors

* Test the difference between topn=0 and topn=None in most_similar.

* Test that the AnnoyIndexer is disabled in most_similar when topn is None
Witiko authored and mpenkov committed May 4, 2019
1 parent 1ceb7a4 commit ce0af20
Showing 3 changed files with 54 additions and 26 deletions.
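
The caller-visible effect of the commit, before diving into the per-file diff: `topn` now has three distinct behaviours in `most_similar` and its relatives. The sketch below is illustrative and not part of the commit; it assumes a gensim 3.x installation, and the toy corpus and variable names are made up for the example.

from gensim.models import Word2Vec

# Toy corpus purely for illustration; real use would load a trained model.
sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system"],
    ["graph", "trees", "minors", "survey"],
]
wv = Word2Vec(sentences, size=10, min_count=1).wv

top10 = wv.most_similar("computer")                 # int topn (default 10): list of (word, similarity) pairs
empty = wv.most_similar("computer", topn=0)         # int topn < 1: empty list
all_sims = wv.most_similar("computer", topn=None)   # None: 1-D numpy array, one score per vocabulary word

assert len(all_sims) == len(wv.vocab)
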
69 changes: 43 additions & 26 deletions gensim/models/keyedvectors.py
@@ -282,7 +282,7 @@ def add(self, entities, weights, replace=False):
----------
entities : list of str
Entities specified by string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or a 2D np.array of vectors.
replace: bool, optional
Flag indicating whether to replace vectors for entities which already exist in the vocabulary,
@@ -323,7 +323,7 @@ def __setitem__(self, entities, weights):
----------
entities : {str, list of str}
Entities specified by their string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or 2D np.array of vectors.
"""
@@ -502,8 +502,9 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -512,11 +513,13 @@
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
if topn is not None and topn < 1:
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
@@ -553,12 +556,12 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab]
dists = dot(limited, mean)
if topn is None:
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
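
Condensed, the dispatch order this hunk establishes for `topn` looks roughly like the sketch below. It is a paraphrase for readability, not the committed code: it drops input normalisation, vocabulary restriction and the index-to-word lookup, and the function name is made up.

import numpy as np

def rank_by_similarity(dists, topn, indexer=None, query_vector=None):
    # topn=0 (or any int < 1): nothing was asked for.
    if isinstance(topn, int) and topn < 1:
        return []
    # The approximate (Annoy) path only makes sense for an integer topn;
    # with topn=None the indexer is ignored and the exact path runs.
    if indexer is not None and isinstance(topn, int):
        return indexer.most_similar(query_vector, topn)
    # topn=None: hand back the full similarity vector unchanged.
    if not topn:
        return dists
    # Integer topn: indices of the highest-scoring entries, best first.
    best = np.argsort(dists)[::-1][:topn]
    return [(int(i), float(dists[i])) for i in best]
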
@@ -572,8 +575,8 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
----------
word : str
Word
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_word returns
topn : int or None, optional
Number of top-N similar words to return. If topn is None, similar_by_word returns
the vector of similarity scores.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
@@ -583,8 +586,10 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
@@ -596,9 +601,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
----------
vector : numpy.array
Vector from which similarities are to be computed.
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_vector returns
the vector of similarity scores.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -607,8 +612,10 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
@@ -788,15 +795,21 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
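
`most_similar_cosmul` receives the same guard and the same documented contract, so the multiplicative-combination method can also return the full score vector. A hedged usage sketch, again on a made-up toy model rather than anything from the commit:

from gensim.models import Word2Vec

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
wv = Word2Vec(sentences, size=10, min_count=1).wv    # toy model, illustration only

pairs = wv.most_similar_cosmul(positive=["user", "computer"], negative=["system"], topn=3)
scores = wv.most_similar_cosmul(positive=["user", "computer"], negative=["system"], topn=None)
# `pairs` is a list of (word, score); `scores` is a 1-D array covering the whole vocabulary.
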
@@ -1189,7 +1202,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
ignore = {a, b, c} # input words to be ignored
predicted = None
# find the most likely prediction, ignoring OOV words and input words
sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab)
self.vocab = original_vocab
for index in matutils.argsort(sims, reverse=True):
predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
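
The `accuracy` hunk is the motivating caller: it needs every similarity score so it can walk the ranking and skip the three input words (and, in the committed version, any candidate outside the case-folded, restricted vocabulary). A simplified sketch of that loop; the helper name is invented and the vocabulary filtering is omitted:

import numpy as np

def predict_analogy(wv, a, b, c, restrict_vocab=30000):
    # Full similarity vector over the (restricted) vocabulary, thanks to topn=None.
    sims = wv.most_similar(positive=[b, c], negative=[a], topn=None,
                           restrict_vocab=restrict_vocab)
    ignore = {a, b, c}
    for index in np.argsort(sims)[::-1]:       # best-scoring candidates first
        candidate = wv.index2word[index]
        if candidate not in ignore:
            return candidate                   # first acceptable word is the prediction
    return None
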
@@ -1651,8 +1664,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
List of doctags/indexes that contribute positively.
negative : list of {str, int}, optional
List of doctags/indexes that contribute negatively.
topn : int, optional
Number of top-N similar docvecs to return.
topn : int or None, optional
Number of top-N similar docvecs to return, when `topn` is int. When `topn` is None,
then similarities for all docvecs are returned.
clip_start : int
Start clipping index.
clip_end : int
@@ -1664,6 +1678,9 @@
Sequence of (doctag/index, similarity).
"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
@@ -1700,7 +1717,7 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

dists = dot(self.vectors_docs_norm[clip_start:clip_end], mean)
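
The same guards are mirrored in `Doc2VecKeyedVectors.most_similar`, so document vectors follow the identical `topn` semantics. An illustrative sketch only; the tiny corpus and parameters are invented, and any trained gensim 3.x Doc2Vec model would behave the same way:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=["human", "interface", "computer"], tags=[0]),
    TaggedDocument(words=["survey", "user", "computer", "system"], tags=[1]),
    TaggedDocument(words=["graph", "trees", "minors", "survey"], tags=[2]),
]
model = Doc2Vec(docs, vector_size=10, min_count=1, epochs=5)    # toy model, illustration only

top = model.docvecs.most_similar(positive=[0])                  # list of (doctag, similarity) pairs
all_sims = model.docvecs.most_similar(positive=[0], topn=None)  # 1-D array, one score per doctag
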
3 changes: 3 additions & 0 deletions gensim/test/test_keyedvectors.py
@@ -106,6 +106,9 @@ def test_most_similar_topn(self):
predicted = self.vectors.most_similar('war', topn=None)
self.assertEqual(len(predicted), len(self.vectors.vocab))

predicted = self.vectors.most_similar('war', topn=0)
self.assertEqual(len(predicted), 0)

def test_relative_cosine_similarity(self):
"""Test relative_cosine_similarity returns expected results with an input of a word pair and topn"""
wordnet_syn = [
8 changes: 8 additions & 0 deletions gensim/test/test_similarities.py
@@ -608,6 +608,14 @@ def assertApproxNeighborsMatchExact(self, model, wv, index):

self.assertEqual(approx_words, exact_words)

def assertAllSimilaritiesDisableIndexer(self, model, wv, index):
vector = wv.vectors_norm[0]
approx_similarities = model.wv.most_similar([vector], topn=None, indexer=index)
exact_similarities = model.wv.most_similar(positive=[vector], topn=None)

self.assertEqual(approx_similarities, exact_similarities)
self.assertEqual(len(approx_similarities), len(wv.vectors.vocab))

def assertIndexSaved(self, index):
fname = get_tmpfile('gensim_similarities.tst.pkl')
index.save(fname)
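
The new test helper pins down the indexer behaviour: when an AnnoyIndexer is passed together with topn=None, the approximate index must be bypassed and the exact similarities returned. A usage sketch under the same assumptions as the tests (gensim 3.x with the optional annoy package installed; the toy corpus is invented):

from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
model = Word2Vec(sentences, size=10, min_count=1)     # toy model, illustration only
model.wv.init_sims()
indexer = AnnoyIndexer(model, 2)                      # two trees is plenty for a toy index

vector = model.wv.vectors_norm[0]
approx = model.wv.most_similar([vector], topn=5, indexer=indexer)     # approximate, via Annoy
exact = model.wv.most_similar([vector], topn=None, indexer=indexer)   # indexer ignored, full score array
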
