
Fix WordEmbeddingsKeyedVectors.most_similar (#2461)
* Use topn=None in WordEmbeddingsKeyedVectors.accuracy

* Implement and document topn=None

* Make topn=None disable indexer in most_similar

* Use `or` instead of `{...}` for documenting a union of types

* Reword documentation on return value of most_similar and related methods

* Use `or` instead of `{...}` for documenting a union of types

* Document the return value dimensions of most_similar and related methods

* Extend the bugfix from WordEmbeddingsKeyedVectors to Doc2VecKeyedVectors

* Test the difference between topn=0 and topn=None in most_similar.

* Test that the AnnoyIndexer is disabled in most_similar when topn is None
Witiko authored and mpenkov committed May 4, 2019
1 parent 1ceb7a4 commit ce0af20
Showing 3 changed files with 54 additions and 26 deletions.
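
The caller-visible effect of the commit, before diving into the per-file diff: `topn` now has three distinct behaviours in `most_similar` and its relatives. The sketch below is illustrative and not part of the commit; it assumes a gensim 3.x installation, and the toy corpus and variable names are made up for the example.

from gensim.models import Word2Vec

# Toy corpus purely for illustration; real use would load a trained model.
sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system"],
    ["graph", "trees", "minors", "survey"],
]
wv = Word2Vec(sentences, size=10, min_count=1).wv

top10 = wv.most_similar("computer")                 # int topn (default 10): list of (word, similarity) pairs
empty = wv.most_similar("computer", topn=0)         # int topn < 1: empty list
all_sims = wv.most_similar("computer", topn=None)   # None: 1-D numpy array, one score per vocabulary word

assert len(all_sims) == len(wv.vocab)
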
69 changes: 43 additions & 26 deletions gensim/models/keyedvectors.py
@@ -282,7 +282,7 @@ def add(self, entities, weights, replace=False):
----------
entities : list of str
Entities specified by string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or a 2D np.array of vectors.
replace: bool, optional
Flag indicating whether to replace vectors for entities which already exist in the vocabulary,
@@ -323,7 +323,7 @@ def __setitem__(self, entities, weights):
----------
entities : {str, list of str}
Entities specified by their string ids.
weights: {list of numpy.ndarray, numpy.ndarray}
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or 2D np.array of vectors.
"""
@@ -502,8 +502,9 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -512,11 +513,13 @@
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
if topn is not None and topn < 1:
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
@@ -553,12 +556,12 @@ def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=Non
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

limited = self.vectors_norm if restrict_vocab is None else self.vectors_norm[:restrict_vocab]
dists = dot(limited, mean)
if topn is None:
if not topn:
return dists
best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
# ignore (don't return) words from the input
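
Condensed, the dispatch order this hunk establishes for `topn` looks roughly like the sketch below. It is a paraphrase for readability, not the committed code: it drops input normalisation, vocabulary restriction and the index-to-word lookup, and the function name is made up.

import numpy as np

def rank_by_similarity(dists, topn, indexer=None, query_vector=None):
    # topn=0 (or any int < 1): nothing was asked for.
    if isinstance(topn, int) and topn < 1:
        return []
    # The approximate (Annoy) path only makes sense for an integer topn;
    # with topn=None the indexer is ignored and the exact path runs.
    if indexer is not None and isinstance(topn, int):
        return indexer.most_similar(query_vector, topn)
    # topn=None: hand back the full similarity vector unchanged.
    if not topn:
        return dists
    # Integer topn: indices of the highest-scoring entries, best first.
    best = np.argsort(dists)[::-1][:topn]
    return [(int(i), float(dists[i])) for i in best]
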
@@ -572,8 +575,8 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
----------
word : str
Word
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_word returns
topn : int or None, optional
Number of top-N similar words to return. If topn is None, similar_by_word returns
the vector of similarity scores.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
@@ -583,8 +586,10 @@ def similar_by_word(self, word, topn=10, restrict_vocab=None):
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab)
@@ -596,9 +601,9 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
----------
vector : numpy.array
Vector from which similarities are to be computed.
topn : {int, False}, optional
Number of top-N similar words to return. If topn is False, similar_by_vector returns
the vector of similarity scores.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int, optional
Optional integer which limits the range of vectors which
are searched for most-similar values. For example, restrict_vocab=10000 would
@@ -607,8 +612,10 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)
@@ -788,15 +795,21 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
List of words that contribute positively.
negative : list of str, optional
List of words that contribute negatively.
topn : int, optional
Number of top-N similar words to return.
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
Returns
-------
list of (str, float)
Sequence of (word, similarity).
list of (str, float) or numpy.array
When `topn` is int, a sequence of (word, similarity) is returned.
When `topn` is None, then similarities for all words are returned as a
one-dimensional numpy array with the size of the vocabulary.
"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
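
`most_similar_cosmul` receives the same guard and the same documented contract, so the multiplicative-combination method can also return the full score vector. A hedged usage sketch, again on a made-up toy model rather than anything from the commit:

from gensim.models import Word2Vec

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
wv = Word2Vec(sentences, size=10, min_count=1).wv    # toy model, illustration only

pairs = wv.most_similar_cosmul(positive=["user", "computer"], negative=["system"], topn=3)
scores = wv.most_similar_cosmul(positive=["user", "computer"], negative=["system"], topn=None)
# `pairs` is a list of (word, score); `scores` is a 1-D array covering the whole vocabulary.
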
@@ -1189,7 +1202,7 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c
ignore = {a, b, c} # input words to be ignored
predicted = None
# find the most likely prediction, ignoring OOV words and input words
sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab)
self.vocab = original_vocab
for index in matutils.argsort(sims, reverse=True):
predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
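
The `accuracy` hunk is the motivating caller: it needs every similarity score so it can walk the ranking and skip the three input words (and, in the committed version, any candidate outside the case-folded, restricted vocabulary). A simplified sketch of that loop; the helper name is invented and the vocabulary filtering is omitted:

import numpy as np

def predict_analogy(wv, a, b, c, restrict_vocab=30000):
    # Full similarity vector over the (restricted) vocabulary, thanks to topn=None.
    sims = wv.most_similar(positive=[b, c], negative=[a], topn=None,
                           restrict_vocab=restrict_vocab)
    ignore = {a, b, c}
    for index in np.argsort(sims)[::-1]:       # best-scoring candidates first
        candidate = wv.index2word[index]
        if candidate not in ignore:
            return candidate                   # first acceptable word is the prediction
    return None
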
@@ -1651,8 +1664,9 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
List of doctags/indexes that contribute positively.
negative : list of {str, int}, optional
List of doctags/indexes that contribute negatively.
topn : int, optional
Number of top-N similar docvecs to return.
topn : int or None, optional
Number of top-N similar docvecs to return, when `topn` is int. When `topn` is None,
then similarities for all docvecs are returned.
clip_start : int
Start clipping index.
clip_end : int
@@ -1664,6 +1678,9 @@
Sequence of (doctag/index, similarity).
"""
if isinstance(topn, int) and topn < 1:
return []

if positive is None:
positive = []
if negative is None:
@@ -1700,7 +1717,7 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

if indexer is not None:
if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)

dists = dot(self.vectors_docs_norm[clip_start:clip_end], mean)
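
The same guards are mirrored in `Doc2VecKeyedVectors.most_similar`, so document vectors follow the identical `topn` semantics. An illustrative sketch only; the tiny corpus and parameters are invented, and any trained gensim 3.x Doc2Vec model would behave the same way:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=["human", "interface", "computer"], tags=[0]),
    TaggedDocument(words=["survey", "user", "computer", "system"], tags=[1]),
    TaggedDocument(words=["graph", "trees", "minors", "survey"], tags=[2]),
]
model = Doc2Vec(docs, vector_size=10, min_count=1, epochs=5)    # toy model, illustration only

top = model.docvecs.most_similar(positive=[0])                  # list of (doctag, similarity) pairs
all_sims = model.docvecs.most_similar(positive=[0], topn=None)  # 1-D array, one score per doctag
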
3 changes: 3 additions & 0 deletions gensim/test/test_keyedvectors.py
@@ -106,6 +106,9 @@ def test_most_similar_topn(self):
predicted = self.vectors.most_similar('war', topn=None)
self.assertEqual(len(predicted), len(self.vectors.vocab))

predicted = self.vectors.most_similar('war', topn=0)
self.assertEqual(len(predicted), 0)

def test_relative_cosine_similarity(self):
"""Test relative_cosine_similarity returns expected results with an input of a word pair and topn"""
wordnet_syn = [
8 changes: 8 additions & 0 deletions gensim/test/test_similarities.py
@@ -608,6 +608,14 @@ def assertApproxNeighborsMatchExact(self, model, wv, index):

self.assertEqual(approx_words, exact_words)

def assertAllSimilaritiesDisableIndexer(self, model, wv, index):
vector = wv.vectors_norm[0]
approx_similarities = model.wv.most_similar([vector], topn=None, indexer=index)
exact_similarities = model.wv.most_similar(positive=[vector], topn=None)

self.assertEqual(approx_similarities, exact_similarities)
self.assertEqual(len(approx_similarities), len(wv.vectors.vocab))

def assertIndexSaved(self, index):
fname = get_tmpfile('gensim_similarities.tst.pkl')
index.save(fname)
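
The new test helper pins down the indexer behaviour: when an AnnoyIndexer is passed together with topn=None, the approximate index must be bypassed and the exact similarities returned. A usage sketch under the same assumptions as the tests (gensim 3.x with the optional annoy package installed; the toy corpus is invented):

from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
model = Word2Vec(sentences, size=10, min_count=1)     # toy model, illustration only
model.wv.init_sims()
indexer = AnnoyIndexer(model, 2)                      # two trees is plenty for a toy index

vector = model.wv.vectors_norm[0]
approx = model.wv.most_similar([vector], topn=5, indexer=indexer)     # approximate, via Annoy
exact = model.wv.most_similar([vector], topn=None, indexer=indexer)   # indexer ignored, full score array
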
