From 609fce3f17690052509e29f87a1f9c0b19740528 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= <witiko@mail.muni.cz>
Date: Sat, 17 Mar 2018 19:43:02 +0100
Subject: [PATCH] Test that the tfidf parameter has desired effect on
 similarity_matrix

---
 gensim/test/test_keyedvectors.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
index 83a3030f05..c6c420900a 100644
--- a/gensim/test/test_keyedvectors.py
+++ b/gensim/test/test_keyedvectors.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 from gensim.corpora import Dictionary
-from gensim.models import KeyedVectors as EuclideanKeyedVectors
+from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
 from gensim.test.utils import datapath
 
 
@@ -56,7 +56,33 @@ def test_similarity_matrix(self):
         similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
         self.assertEquals(20, np.sum(similarity_matrix == 0))
 
-        # TODO: Add unit test to check that supplied tfidf has desired effect
+        # Check that processing rows in the order given by IDF has the desired effect.
+
+        # Build a corpus of two documents that share only the term "understanding":
+        documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
+        dictionary = Dictionary(documents)
+        tfidf = TfidfModel(dictionary=dictionary)
+
+        # All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1.0.
+        # The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0.
+        #
+        # If we do not pass the tfidf parameter to the similarity_matrix
+        # method, then we process rows from 1 to 4. If we do pass the tfidf
+        # parameter to the similarity_matrix method, then we first process the
+        # rows 1, 3, 4 that correspond to terms with IDF of 1.0 and then row 2
+        # that corresponds to the term "understanding" with IDF of 0. Since the
+        # method is greedy, we end up with two different similarity matrices.
+
+        self.assertTrue(
+            np.isclose(
+                self.vectors.similarity_matrix(dictionary, nonzero_limit=2).todense(),
+                np.array([[1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))
+        self.assertTrue(
+            np.isclose(
+                self.vectors.similarity_matrix(dictionary, tfidf, nonzero_limit=2).todense(),
+                np.array([
+                    [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0],
+                    [0, 0.90007025, 1, 0], [0.9112908, 0, 0, 1]])))
 
     def test_most_similar(self):
         """Test most_similar returns expected results."""