Skip to content

Commit

Permalink
Fix SoftCosineSimilarity.get_similarities on corpora. Fix #1955 (#1972)
Browse files Browse the repository at this point in the history

* Fix misinformation in SoftCosineSimilarity and WmdSimilarity docstrings

* Properly handle corpora in SoftCosineSimilarity.get_similarities

* Remove unnecessary member variables in SoftCosineSimilarity tests

* Add corpus and TransformedCorpus query tests for SoftCosineSimilarity

* Fix a test of non-diagonal results from SoftCosineSimilarity

* Allow precision error in soft cosine between two identical documents

* Make corpus query tests for SoftCosineSimilarity query the correct index

* Remove extra blank line in SoftCosineSimilarity.get_similarities
  • Loading branch information
Witiko authored and menshikh-iv committed Mar 12, 2018
1 parent 49df463 commit 97c280f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 28 deletions.
30 changes: 15 additions & 15 deletions gensim/similarities/docsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,7 +922,7 @@ def get_similarities(self, query):
Parameters
----------
query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`
query : {list of (int, number), iterable of list of (int, number)
Document or collection of documents.
Return
Expand All @@ -931,29 +931,29 @@ def get_similarities(self, query):
Similarity matrix.
"""
if isinstance(query, numpy.ndarray):
# Convert document indexes to actual documents.
query = [self.corpus[i] for i in query]

if not query or not isinstance(query[0], list):
query = [query]
is_corpus, query = utils.is_corpus(query)
if not is_corpus:
if isinstance(query, numpy.ndarray):
# Convert document indexes to actual documents.
query = [self.corpus[i] for i in query]
else:
query = [query]

n_queries = len(query)
result = []
for qidx in range(n_queries):
for query_document in query:
# Compute similarity for each query.
qresult = [matutils.softcossim(document, query[qidx], self.similarity_matrix)
for document in self.corpus]
qresult = [matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
for corpus_document in self.corpus]
qresult = numpy.array(qresult)

# Append single query result to list of all results.
result.append(qresult)

if len(result) == 1:
# Only one query.
result = result[0]
else:
if is_corpus:
result = numpy.array(result)
else:
result = result[0]

return result

Expand Down Expand Up @@ -1038,7 +1038,7 @@ def get_similarities(self, query):
Parameters
----------
query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`
query : {list of (int, number), iterable of list of (int, number)
Document or collection of documents.
Return
Expand Down
45 changes: 32 additions & 13 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
import scipy

from smart_open import smart_open
from gensim.corpora import Dictionary
from gensim.models import word2vec
from gensim.models import doc2vec
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
from gensim import matutils, similarities
from gensim.models import Word2Vec, FastText
from gensim.test.utils import (datapath, get_tmpfile,
Expand Down Expand Up @@ -373,42 +373,61 @@ def testIter(self):
class TestSoftCosineSimilarity(unittest.TestCase, _TestSimilarityABC):
def setUp(self):
self.cls = similarities.SoftCosineSimilarity
self.dictionary = Dictionary(texts)
self.corpus = [dictionary.doc2bow(document) for document in texts]
self.tfidf = TfidfModel(dictionary=dictionary)
similarity_matrix = scipy.sparse.identity(12, format="lil")
similarity_matrix[dictionary.token2id["user"], dictionary.token2id["human"]] = 0.5
similarity_matrix[dictionary.token2id["human"], dictionary.token2id["user"]] = 0.5
self.similarity_matrix = similarity_matrix.tocsc()

def factoryMethod(self):
# Override factoryMethod.
return self.cls(self.corpus, self.similarity_matrix)
return self.cls(corpus, self.similarity_matrix)

def testFull(self, num_best=None):
# Override testFull.

index = self.cls(self.corpus, self.similarity_matrix, num_best=num_best)
query = self.dictionary.doc2bow(texts[0])
# Single query
index = self.cls(corpus, self.similarity_matrix, num_best=num_best)
query = dictionary.doc2bow(texts[0])
sims = index[query]

if num_best is not None:
# Sparse array.
for i, sim in sims:
self.assertTrue(numpy.alltrue(sim <= 1.0))
self.assertTrue(numpy.alltrue(sim >= 0.0))
else:
self.assertTrue(sims[0] == 1.0) # Similarity of a document with itself is 1.0.
self.assertAlmostEqual(1.0, sims[0]) # Similarity of a document with itself is 1.0.
self.assertTrue(numpy.alltrue(sims[1:] >= 0.0))
self.assertTrue(numpy.alltrue(sims[1:] < 1.0))
expected = 2.1889350195476758
self.assertAlmostEqual(expected, numpy.sum(sims))

# Corpora
for query in (
corpus, # Basic text corpus.
self.tfidf[corpus]): # Transformed corpus without slicing support.
index = self.cls(query, self.similarity_matrix, num_best=num_best)
sims = index[query]
if num_best is not None:
# Sparse array.
for result in sims:
for i, sim in result:
self.assertTrue(numpy.alltrue(sim <= 1.0))
self.assertTrue(numpy.alltrue(sim >= 0.0))
else:
for i, result in enumerate(sims):
self.assertAlmostEqual(1.0, result[i]) # Similarity of a document with itself is 1.0.
self.assertTrue(numpy.alltrue(result[:i] >= 0.0))
self.assertTrue(numpy.alltrue(result[:i] < 1.0))
self.assertTrue(numpy.alltrue(result[i + 1:] >= 0.0))
self.assertTrue(numpy.alltrue(result[i + 1:] < 1.0))

def testNonIncreasing(self):
""" Check that similarities are non-increasing when `num_best` is not `None`."""
# NOTE: this could be implemented for other similarities as well (i.e. in _TestSimilarityABC).

index = self.cls(self.corpus, self.similarity_matrix, num_best=5)
query = self.dictionary.doc2bow(texts[0])
index = self.cls(corpus, self.similarity_matrix, num_best=5)
query = dictionary.doc2bow(texts[0])
sims = index[query]
sims2 = numpy.asarray(sims)[:, 1] # Just the similarities themselves.

Expand All @@ -419,8 +438,8 @@ def testNonIncreasing(self):
def testChunking(self):
# Override testChunking.

index = self.cls(self.corpus, self.similarity_matrix)
query = [self.dictionary.doc2bow(document) for document in texts[:3]]
index = self.cls(corpus, self.similarity_matrix)
query = [dictionary.doc2bow(document) for document in texts[:3]]
sims = index[query]

for i in range(3):
Expand All @@ -438,7 +457,7 @@ def testChunking(self):
def testIter(self):
# Override testIter.

index = self.cls(self.corpus, self.similarity_matrix)
index = self.cls(corpus, self.similarity_matrix)
for sims in index:
self.assertTrue(numpy.alltrue(sims >= 0.0))
self.assertTrue(numpy.alltrue(sims <= 1.0))
Expand Down

0 comments on commit 97c280f

Please sign in to comment.