Use VP-Tree for fast kNN over Levenshtein distance

piskvorky · May 15, 2021 · d680af4 · d680af4
1 parent 5a116db
commit d680af4
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 60 deletions.
diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py
@@ -90,10 +90,7 @@ def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
     assert alpha >= 0
     assert beta >= 0
 
-    max_lengths = max(len(t1), len(t2))
-    if max_lengths == 0:
-        return 1.0
-
+    max_lengths = max(len(t1), len(t2)) or 1
     min_similarity = float(max(min(min_similarity, 1.0), 0.0))
     max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
     distance = levdist(t1, t2, max_distance)
@@ -102,52 +99,63 @@ def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
 
 
 class LevenshteinSimilarityIndex(TermSimilarityIndex):
-    """
+    r"""
     Computes Levenshtein similarities between terms and retrieves most similar
     terms for a given term.
 
     Notes
     -----
-    This is a naive implementation that iteratively computes pointwise Levenshtein similarities
-    between individual terms. Using this implementation to compute the similarity of all terms in
-    real-world dictionaries such as the English Wikipedia will take years.
+    This implementation uses a VP-Tree for metric indexing.
 
     Parameters
     ----------
     dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
         A dictionary that specifies the considered terms.
     alpha : float, optional
-        The multiplicative factor alpha defined by Charlet and Damnati (2017).
+        The multiplicative factor alpha defined by [charletetal17]_.
     beta : float, optional
-        The exponential factor beta defined by Charlet and Damnati (2017).
-    threshold : float, optional
-        Only terms more similar than `threshold` are considered when retrieving
-        the most similar terms for a given term.
+        The exponential factor beta defined by [charletetal17]_.
+
+    Attributes
+    ----------
+    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
+        A dictionary that specifies the considered terms.
+    alpha : float, optional
+        The multiplicative factor alpha defined by [charletetal17]_.
+    beta : float, optional
+        The exponential factor beta defined by [charletetal17]_.
+    index : :class:`vptree.VPTree`
+        The VP-Tree metric index.
 
     See Also
     --------
-    :func:`gensim.similarities.levenshtein.levsim`
-        The Levenshtein similarity.
+    :class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
+        Retrieve most similar terms for a given term using the cosine similarity over word
+        embeddings.
     :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
         Build a term similarity matrix and compute the Soft Cosine Measure.
 
+    References
+    ----------
+    The Levenshtein similarity in the context of term similarity was defined
+    by [charletetal17]_.
+
+    .. [charletetal17] Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
+       Soft-Cosine Semantic Similarity between Questions for Community Question Answering", 2017,
+       https://www.aclweb.org/anthology/S17-2051/.
+
     """
-    def __init__(self, dictionary, alpha=1.8, beta=5.0, threshold=0.0):
+    def __init__(self, dictionary, alpha=1.8, beta=5.0):
+        from vptree import VPTree
+
         self.dictionary = dictionary
         self.alpha = alpha
         self.beta = beta
-        self.threshold = threshold
+        terms = list(self.dictionary.values())
+        self.index = VPTree(terms, levdist)
         super(LevenshteinSimilarityIndex, self).__init__()
 
     def most_similar(self, t1, topn=10):
-        similarities = (
-            (levsim(t1, t2, self.alpha, self.beta, self.threshold), t2)
-            for t2 in self.dictionary.values()
-            if t1 != t2
-        )
-        most_similar = (
-            (t2, similarity)
-            for (similarity, t2) in sorted(similarities, reverse=True)
-            if similarity > 0
-        )
-        return itertools.islice(most_similar, int(topn))
+        terms = (term for _, term in self.index.get_n_nearest_neighbors(t1, int(topn) + 1))
+        most_similar = ((t2, levsim(t1, t2, self.alpha, self.beta)) for t2 in terms if t1 != t2)
+        return itertools.islice(most_similar, int(topn))
diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
@@ -99,10 +99,8 @@ def most_similar(self, t1, topn=10):
 
 class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
     """
-    Use objects of this class to:
-
-    1) Compute cosine similarities between word embeddings.
-    2) Retrieve the closest word embeddings (by cosine similarity) to a given word embedding.
+    Computes cosine similarities between word embeddings and retrieves most
+    similar terms for a given term.
 
     Parameters
     ----------
@@ -114,13 +112,15 @@ class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
     exponent : float, optional
         Take the word embedding similarities larger than `threshold` to the power of `exponent`.
     kwargs : dict or None
-        A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method
+        A dict with keyword arguments that will be passed to the `KeyedVectors.most_similar` method
         when retrieving the word embeddings closest to a given word embedding.
 
     See Also
     --------
+    :class:`~gensim.similarities.levenshtein.LevenshteinSimilarityIndex`
+        Retrieve most similar terms for a given term using the Levenshtein distance.
     :class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
-        A sparse term similarity matrix built using a term similarity index.
+        Build a term similarity matrix and compute the Soft Cosine Measure.
 
     """
     def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None):

diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
@@ -1610,57 +1610,46 @@ class TestLevenshteinSimilarityIndex(unittest.TestCase):
     def setUp(self):
         self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
         self.dictionary = Dictionary(self.documents)
+        self.index = LevenshteinSimilarityIndex(self.dictionary)
 
     @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
-    def test_most_similar(self):
+    def test_most_similar_topn(self):
         """Test most_similar returns expected results."""
-        index = LevenshteinSimilarityIndex(self.dictionary)
-        results = list(index.most_similar(u"holiday", topn=1))
+        results = list(self.index.most_similar(u"holiday", topn=1))
         self.assertLess(0, len(results))
         self.assertGreaterEqual(1, len(results))
-        results = list(index.most_similar(u"holiday", topn=4))
+
+        results = list(self.index.most_similar(u"holiday", topn=4))
         self.assertLess(1, len(results))
         self.assertGreaterEqual(4, len(results))
 
-        # check the order of the results
-        results = index.most_similar(u"holiday", topn=4)
+    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
+    def test_most_similar_result_order(self):
+        results = self.index.most_similar(u"holiday", topn=4)
         terms, _ = tuple(zip(*results))
         self.assertEqual((u"hollingworth", u"slowing", u"denied", u"government"), terms)
 
-        # check that the term itself is not returned
-        index = LevenshteinSimilarityIndex(self.dictionary)
-        terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]
+    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
+    def test_most_similar_skips_self(self):
+        terms = [term for term, similarity in self.index.most_similar(u"holiday", topn=len(self.dictionary))]
         self.assertFalse(u"holiday" in terms)
 
-        # check that the threshold works as expected
-        index = LevenshteinSimilarityIndex(self.dictionary, threshold=0.0)
-        results = list(index.most_similar(u"holiday", topn=10))
-        self.assertLess(0, len(results))
-        self.assertGreaterEqual(10, len(results))
-
-        index = LevenshteinSimilarityIndex(self.dictionary, threshold=1.0)
-        results = list(index.most_similar(u"holiday", topn=10))
-        self.assertEqual(0, len(results))
-
-        # check that the alpha works as expected
+    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
+    def test_most_similar_alpha(self):
         index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0)
         first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
         index = LevenshteinSimilarityIndex(self.dictionary, alpha=2.0)
         second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
         self.assertTrue(numpy.allclose(2.0 * first_similarities, second_similarities))
 
-        # check that the beta works as expected
+    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
+    def test_most_similar_beta(self):
         index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
         first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
         index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=2.0)
         second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
         self.assertTrue(numpy.allclose(first_similarities ** 2.0, second_similarities))
 
-        # check proper integration with SparseTermSimilarityMatrix
-        index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
-        similarity_matrix = SparseTermSimilarityMatrix(index, DICTIONARY)
-        self.assertTrue(scipy.sparse.issparse(similarity_matrix.matrix))
-
 
 class TestWordEmbeddingSimilarityIndex(unittest.TestCase):
     def setUp(self):