From 609fce3f17690052509e29f87a1f9c0b19740528 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Sat, 17 Mar 2018 19:43:02 +0100
Subject: [PATCH] Test that the tfidf parameter has desired effect on similarity_matrix

---
 gensim/test/test_keyedvectors.py | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
index 83a3030f05..c6c420900a 100644
--- a/gensim/test/test_keyedvectors.py
+++ b/gensim/test/test_keyedvectors.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 from gensim.corpora import Dictionary
-from gensim.models import KeyedVectors as EuclideanKeyedVectors
+from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
 from gensim.test.utils import datapath
 
 
@@ -56,7 +56,33 @@ def test_similarity_matrix(self):
         similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
         self.assertEquals(20, np.sum(similarity_matrix == 0))
 
-        # TODO: Add unit test to check that supplied tfidf has desired effect
+        # check that processing rows in the order given by IDF has desired effect
+
+        # Two toy documents that share only the term "understanding" (document frequency 2 of 2).
+        documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
+        dictionary = Dictionary(documents)
+        tfidf = TfidfModel(dictionary=dictionary)
+
+        # All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1.0.
+        # The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0.
+        #
+        # If we do not pass the tfidf parameter to the similarity_matrix
+        # method, then we process rows from 1 to 4. If we do pass the tfidf
+        # parameter to the similarity_matrix method, then we first process the
+        # rows 1, 3, 4 that correspond to terms with IDF of 1.0 and then row 2
+        # that corresponds to the term "understanding" with IDF of 0. Since the
+        # method is greedy, we end up with two different similarity matrices.
+
+        self.assertTrue(
+            np.allclose(
+                self.vectors.similarity_matrix(dictionary, nonzero_limit=2).todense(),
+                np.array([[1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))
+        self.assertTrue(
+            np.allclose(
+                self.vectors.similarity_matrix(dictionary, tfidf, nonzero_limit=2).todense(),
+                np.array([
+                    [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0],
+                    [0, 0.90007025, 1, 0], [0.9112908, 0, 0, 1]])))
 
     def test_most_similar(self):
         """Test most_similar returns expected results."""