streamlining most_similar_cosmul and evaluate_word_analogies (#2656)
* streamlining most_similar_cosmul

* Fix PR requested changes and add unit test

* fix merge artifacts

Co-authored-by: n3hrox <n3hrox@gmail.com>
Co-authored-by: Michael Penkov <m@penkov.dev>
3 people authored Mar 22, 2022
1 parent 3ce81a4 commit ac3bbcd
Showing 3 changed files with 34 additions and 3 deletions.
28 changes: 25 additions & 3 deletions gensim/models/keyedvectors.py
@@ -946,7 +946,9 @@ def nbow(document):
# Compute WMD.
return emd(d1, d2, distance_matrix)

def most_similar_cosmul(self, positive=None, negative=None, topn=10):
def most_similar_cosmul(
self, positive=None, negative=None, topn=10, restrict_vocab=None
):
"""Find the top-N most similar words, using the multiplicative combination objective,
proposed by `Omer Levy and Yoav Goldberg "Linguistic Regularities in Sparse and Explicit Word Representations"
<http://www.aclweb.org/anthology/W14-1618>`_. Positive words still contribute positively towards the similarity,
@@ -959,6 +961,9 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
With a single positive example, rankings will be the same as in the default
:meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`.
Allows calls like most_similar_cosmul('dog', 'cat'), as a shorthand for
most_similar_cosmul(['dog'], ['cat']) where 'dog' is positive and 'cat' negative
Parameters
----------
positive : list of str, optional
@@ -968,6 +973,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
topn : int or None, optional
Number of top-N similar words to return, when `topn` is int. When `topn` is None,
then similarities for all words are returned.
restrict_vocab : int or None, optional
Optional integer which limits the range of vectors which are searched for most-similar values.
For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order.
This may be meaningful if vocabulary is sorted by descending frequency.
Returns
-------
@@ -985,7 +995,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10):
positive = _ensure_list(positive)
negative = _ensure_list(negative)

self.fill_norms()
self.init_sims()

if isinstance(positive, str):
# allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
positive = [positive]

if isinstance(negative, str):
negative = [negative]

all_words = {
self.get_index(word) for word in positive + negative
@@ -1205,7 +1222,9 @@ def _log_evaluate_word_analogies(section):
logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect)
return score

def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False):
def evaluate_word_analogies(
self, analogies, restrict_vocab=300000, case_insensitive=True,
dummy4unknown=False, similarity_function='most_similar'):
"""Compute performance of the model on an analogy test set.
The accuracy is reported (printed to log and returned as a score) for each section separately,
@@ -1231,6 +1250,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
dummy4unknown : bool, optional
If True - produce zero accuracies for 4-tuples with out-of-vocabulary words.
Otherwise, these tuples are skipped entirely and not used in the evaluation.
similarity_function : str, optional
Function name used for similarity calculation.
Returns
-------
@@ -1286,6 +1307,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi
predicted = None
# find the most likely prediction using 3CosAdd (vector offset) method
# TODO: implement 3CosMul and set-based methods for solving analogies

sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
self.key_to_index = original_key_to_index
for element in sims:
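The keyedvectors.py hunks above let most_similar_cosmul accept bare strings for positive/negative and add a restrict_vocab argument. Below is a minimal usage sketch under those assumptions, together with an illustrative version of the multiplicative (3CosMul) objective the docstring cites from Levy & Goldberg; the cosmul_score helper, the vector file name, and the epsilon value are illustrative choices, not taken from the diff.

import numpy as np
from gensim.models import KeyedVectors

wv = KeyedVectors.load("vectors.kv")  # hypothetical path to previously saved vectors

# String shorthand enabled above: equivalent to the explicit list form.
wv.most_similar_cosmul('dog', 'cat')
wv.most_similar_cosmul(positive=['dog'], negative=['cat'])

# restrict_vocab limits the candidate search to the first N stored vectors,
# which is meaningful when the vocabulary is sorted by descending frequency.
wv.most_similar_cosmul(positive=['king', 'woman'], negative=['man'], topn=10, restrict_vocab=10000)

def cosmul_score(candidate, positive, negative, eps=1e-6):
    """Illustrative 3CosMul objective: product of shifted cosines against the
    positive examples divided by the product against the negative examples."""
    def shifted_cos(u, v):
        # cosine similarity rescaled from [-1, 1] to [0, 1] so the products stay positive
        return (1.0 + np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))) / 2.0

    pos = np.prod([shifted_cos(candidate, p) for p in positive])
    neg = np.prod([shifted_cos(candidate, n) for n in negative])
    return pos / (neg + eps)
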
3 changes: 3 additions & 0 deletions gensim/test/test_fasttext.py
@@ -373,6 +373,9 @@ def test_most_similar_cosmul(self):
self.assertEqual(
self.test_model.wv.most_similar_cosmul('nights'),
self.test_model.wv.most_similar_cosmul(positive=['nights']))
self.assertEqual(
self.test_model.wv.most_similar_cosmul('the', 'and'),
self.test_model.wv.most_similar_cosmul(positive=['the'], negative=['and']))

def test_lookup(self):
# In vocab, sanity check
6 changes: 6 additions & 0 deletions gensim/test/test_word2vec.py
@@ -555,6 +555,12 @@ def test_evaluate_word_analogies(self):
"""Test that evaluating analogies on KeyedVectors give sane results"""
model = word2vec.Word2Vec(LeeCorpus())
score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
score_cosmul, sections_cosmul = model.wv.evaluate_word_analogies(
datapath('questions-words.txt'),
similarity_function='most_similar_cosmul'
)
self.assertEqual(score, score_cosmul)
self.assertEqual(sections, sections_cosmul)
self.assertGreaterEqual(score, 0.0)
self.assertLessEqual(score, 1.0)
self.assertGreater(len(sections), 0)
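The test added above passes the new similarity_function keyword to evaluate_word_analogies and asserts that the resulting score and sections match the default run; together with the TODO left in the keyedvectors.py hunk ("implement 3CosMul and set-based methods"), this reads as the parameter being accepted for forward compatibility while analogies are still solved through most_similar. A minimal sketch of the call, assuming wv is a trained gensim KeyedVectors instance:

from gensim.test.utils import datapath

# wv: a trained gensim KeyedVectors instance (e.g. Word2Vec(corpus).wv)

# Default behaviour: analogies are solved with the additive (3CosAdd) offset method.
score_add, sections_add = wv.evaluate_word_analogies(datapath('questions-words.txt'))

# Keyword added in this commit; the string names the similarity method to use.
score_mul, sections_mul = wv.evaluate_word_analogies(
    datapath('questions-words.txt'),
    similarity_function='most_similar_cosmul',
)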
