most_similar_cosmul bug fix #1177

Merged 3 commits on Mar 3, 2017
15 changes: 11 additions & 4 deletions gensim/models/keyedvectors.py
@@ -455,14 +455,21 @@ def most_similar_cosmul(self, positive=[], negative=[], topn=10):
             # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
             positive = [positive]

+        all_words = set([self.vocab[word].index for word in positive+negative
+            if not isinstance(word, ndarray) and word in self.vocab])
+
+        positive = [
+            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
+            for word in positive
+        ]
+        negative = [
+            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
+            for word in negative
+        ]

-        positive = [self.word_vec(word, use_norm=True) for word in positive]
-        negative = [self.word_vec(word, use_norm=True) for word in negative]
         if not positive:
             raise ValueError("cannot compute similarity with no input")

-        all_words = set([self.vocab[word].index for word in positive+negative if word in self.vocab])

         # equation (4) of Levy & Goldberg "Linguistic Regularities...",
         # with distances shifted to [0,1] per footnote (7)
         pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
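For readers who want the scoring in one place: the method implements the 3CosMul objective of Levy & Goldberg, where each candidate word is scored by the product of its shifted cosine similarities to the positive inputs divided by the product of its shifted similarities to the negative inputs, plus a small constant to avoid division by zero. A self-contained numpy sketch of that scoring, mirroring the pos_dists line above (the function name and the epsilon value here are illustrative, not part of gensim's API):

import numpy as np

def cosmul_scores(syn0norm, positive, negative, eps=1e-6):
    """Score every row of syn0norm (unit-length word vectors) against
    unit-length query vectors, per Levy & Goldberg's 3CosMul (eq. 4),
    with cosines shifted from [-1, 1] to [0, 1]."""
    pos_dists = [(1.0 + syn0norm.dot(v)) / 2.0 for v in positive]
    neg_dists = [(1.0 + syn0norm.dot(v)) / 2.0 for v in negative]
    return np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + eps)

Computing all_words before the string-to-vector conversion, with the isinstance(word, ndarray) guard, is the substance of the fix: that set is later used to drop the query words from the returned ranking, so it must be built only from string keys that can be looked up in self.vocab, never from raw ndarrays.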
10 changes: 5 additions & 5 deletions gensim/test/test_fasttext_wrapper.py
@@ -158,11 +158,11 @@ def testMostSimilar(self):
     def testMostSimilarCosmul(self):
         """Test most_similar_cosmul for in-vocab and out-of-vocab words"""
         # In vocab, sanity check
-        self.assertEqual(len(self.test_model.most_similar(positive=['the', 'and'], topn=5)), 5)
-        self.assertEqual(self.test_model.most_similar('the'), self.test_model.most_similar(positive=['the']))
+        self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5)
+        self.assertEqual(self.test_model.most_similar_cosmul('the'), self.test_model.most_similar_cosmul(positive=['the']))
         # Out of vocab check
-        self.assertEqual(len(self.test_model.most_similar(['night', 'nights'], topn=5)), 5)
-        self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights']))
+        self.assertEqual(len(self.test_model.most_similar_cosmul(['night', 'nights'], topn=5)), 5)
+        self.assertEqual(self.test_model.most_similar_cosmul('nights'), self.test_model.most_similar_cosmul(positive=['nights']))

     def testLookup(self):
         """Tests word vector lookup for in-vocab and out-of-vocab words"""
@@ -218,4 +218,4 @@ def testHash(self):

 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
-    unittest.main()
+    unittest.main()
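For context, the corrected test now exercises the same code path a user would hit with the fastText wrapper; a rough sketch assuming the wrapper API of this era (load_fasttext_format) and an illustrative model path:

from gensim.models.wrappers import fasttext

# 'lee_fasttext' stands in for whatever pretrained fastText model you have on disk
model = fasttext.FastText.load_fasttext_format('lee_fasttext')

# in-vocab query
print(model.most_similar_cosmul(positive=['the', 'and'], topn=5))

# out-of-vocab query: the wrapper composes a vector from character n-grams,
# so most_similar_cosmul still has something to rank against
print(model.most_similar_cosmul(['night', 'nights'], topn=5))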
11 changes: 11 additions & 0 deletions gensim/test/test_word2vec.py
@@ -449,6 +449,17 @@ def test_cbow_neg(self):
                                  min_count=5, iter=10, workers=2, sample=0)
         self.model_sanity(model)

+    def test_cosmul(self):
+        model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0)
+        sims = model.most_similar_cosmul('graph', topn=10)
+        # self.assertTrue(sims[0][0] == 'trees', sims)  # most similar
+
+        # test querying for "most similar" by vector
+        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
+        sims2 = model.most_similar_cosmul(positive=[graph_vector], topn=11)
+        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
+        self.assertEqual(sims, sims2)
+
     def testTrainingCbow(self):
         """Test CBOW word2vec training."""
         # to test training, make the corpus larger by repeating its sentences over and over
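And this is roughly what the fixed behaviour looks like from user code: strings are resolved through word_vec(..., use_norm=True) while ndarrays pass straight through, so words and raw vectors can be mixed in one query. A minimal sketch using the same era's gensim API as the test above; the toy corpus and training parameters are illustrative:

from gensim.models import word2vec

sentences = [
    ['human', 'interface', 'computer'],
    ['graph', 'trees', 'minors'],
    ['graph', 'minors', 'survey'],
]
model = word2vec.Word2Vec(sentences, size=10, min_count=1, hs=1, negative=0)

# query by word, as before
print(model.most_similar_cosmul('graph', topn=3))

# query by raw vector -- the case this patch enables; the pre-patch code tried to
# treat the ndarray as a vocabulary key and failed
graph_vec = model.wv.syn0norm[model.wv.vocab['graph'].index]
print(model.most_similar_cosmul(positive=[graph_vec], negative=['trees'], topn=3))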