Skip to content

Commit

Permalink
Use VP-Tree for fast kNN over Levenshtein distance
Browse files: browse the repository at this point in the history
  • Loading branch information
Witiko committed May 15, 2021
1 parent 5a116db commit bf904eb
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 66 deletions.
7 changes: 5 additions & 2 deletions gensim/similarities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import warnings
try:
import Levenshtein # noqa:F401
import vptree # noqa:F401
except ImportError:
msg = (
"The gensim.similarities.levenshtein submodule is disabled, because the optional "
"Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. "
"Install Levenshtein (e.g. `pip install python-Levenshtein`) to suppress this warning."
"Levenshtein <https://pypi.org/project/python-Levenshtein/> and "
"vptree <https://pypi.org/project/vptree/> packages are unavailable. "
"Install Levenshtein and vptree (e.g. `pip install python-Levenshtein vptree`) to "
"suppress this warning."
)
warnings.warn(msg)
LevenshteinSimilarityIndex = None
Expand Down
64 changes: 36 additions & 28 deletions gensim/similarities/levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,7 @@ def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
assert alpha >= 0
assert beta >= 0

max_lengths = max(len(t1), len(t2))
if max_lengths == 0:
return 1.0

max_lengths = max(len(t1), len(t2)) or 1
min_similarity = float(max(min(min_similarity, 1.0), 0.0))
max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
distance = levdist(t1, t2, max_distance)
Expand All @@ -102,52 +99,63 @@ def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):


class LevenshteinSimilarityIndex(TermSimilarityIndex):
"""
r"""
Computes Levenshtein similarities between terms and retrieves most similar
terms for a given term.
Notes
-----
This is a naive implementation that iteratively computes pointwise Levenshtein similarities
between individual terms. Using this implementation to compute the similarity of all terms in
real-world dictionaries such as the English Wikipedia will take years.
This implementation uses a VP-Tree for metric indexing.
Parameters
----------
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
A dictionary that specifies the considered terms.
alpha : float, optional
The multiplicative factor alpha defined by Charlet and Damnati (2017).
The multiplicative factor alpha defined by [charletetal17]_.
beta : float, optional
The exponential factor beta defined by Charlet and Damnati (2017).
threshold : float, optional
Only terms more similar than `threshold` are considered when retrieving
the most similar terms for a given term.
The exponential factor beta defined by [charletetal17]_.
Attributes
----------
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
A dictionary that specifies the considered terms.
alpha : float, optional
The multiplicative factor alpha defined by [charletetal17]_.
beta : float, optional
The exponential factor beta defined by [charletetal17]_.
index : :class:`vptree.VPTree`
The VP-Tree metric index.
See Also
--------
:func:`gensim.similarities.levenshtein.levsim`
The Levenshtein similarity.
:class:`~gensim.similarities.termsim.WordEmbeddingSimilarityIndex`
Retrieve most similar terms for a given term using the cosine similarity over word
embeddings.
:class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
Build a term similarity matrix and compute the Soft Cosine Measure.
References
----------
The Levenshtein similarity in the context of term similarity was defined
by [charletetal17]_.
.. [charletetal17] Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
Soft-Cosine Semantic Similarity between Questions for Community Question Answering", 2017,
https://www.aclweb.org/anthology/S17-2051/.
"""
def __init__(self, dictionary, alpha=1.8, beta=5.0, threshold=0.0):
def __init__(self, dictionary, alpha=1.8, beta=5.0):
from vptree import VPTree

self.dictionary = dictionary
self.alpha = alpha
self.beta = beta
self.threshold = threshold
terms = list(self.dictionary.values())
self.index = VPTree(terms, levdist)
super(LevenshteinSimilarityIndex, self).__init__()

def most_similar(self, t1, topn=10):
similarities = (
(levsim(t1, t2, self.alpha, self.beta, self.threshold), t2)
for t2 in self.dictionary.values()
if t1 != t2
)
most_similar = (
(t2, similarity)
for (similarity, t2) in sorted(similarities, reverse=True)
if similarity > 0
)
return itertools.islice(most_similar, int(topn))
terms = [term for _, term in self.index.get_n_nearest_neighbors(t1, int(topn + 1))]
most_similar = ((t2, levsim(t1, t2, self.alpha, self.beta)) for t2 in terms if t1 != t2)
return itertools.islice(most_similar, topn)
12 changes: 6 additions & 6 deletions gensim/similarities/termsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,8 @@ def most_similar(self, t1, topn=10):

class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
"""
Use objects of this class to:
1) Compute cosine similarities between word embeddings.
2) Retrieve the closest word embeddings (by cosine similarity) to a given word embedding.
Computes cosine similarities between word embeddings and retrieves most
similar terms for a given term.
Parameters
----------
Expand All @@ -114,13 +112,15 @@ class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
exponent : float, optional
Take the word embedding similarities larger than `threshold` to the power of `exponent`.
kwargs : dict or None
A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method
A dict with keyword arguments that will be passed to the `KeyedVectors.most_similar` method
when retrieving the word embeddings closest to a given word embedding.
See Also
--------
:class:`~gensim.similarities.levenshtein.LevenshteinSimilarityIndex`
Retrieve most similar terms for a given term using the Levenshtein distance.
:class:`~gensim.similarities.termsim.SparseTermSimilarityMatrix`
A sparse term similarity matrix built using a term similarity index.
Build a term similarity matrix and compute the Soft Cosine Measure.
"""
def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None):
Expand Down
52 changes: 22 additions & 30 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -1562,8 +1562,9 @@ class TestLevenshteinSimilarity(unittest.TestCase):
def test_empty_strings(self):
t1 = ""
t2 = ""
alpha = 1.8

self.assertEqual(1.0, levsim(t1, t2))
self.assertEqual(alpha, levsim(t1, t2))

@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_negative_hyperparameters(self):
Expand Down Expand Up @@ -1610,57 +1611,48 @@ class TestLevenshteinSimilarityIndex(unittest.TestCase):
def setUp(self):
self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
self.dictionary = Dictionary(self.documents)
self.index = LevenshteinSimilarityIndex(self.dictionary)

@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_most_similar(self):
def test_most_similar_topn(self):
"""Test most_similar returns expected results."""
index = LevenshteinSimilarityIndex(self.dictionary)
results = list(index.most_similar(u"holiday", topn=1))
results = list(self.index.most_similar(u"holiday", topn=1))
self.assertLess(0, len(results))
self.assertGreaterEqual(1, len(results))
results = list(index.most_similar(u"holiday", topn=4))
self.assertLess(1, len(results))

results = list(self.index.most_similar(u"holiday", topn=4))
self.assertLess(0, len(results))
self.assertGreaterEqual(4, len(results))

# check the order of the results
results = index.most_similar(u"holiday", topn=4)
terms, _ = tuple(zip(*results))
self.assertEqual((u"hollingworth", u"slowing", u"denied", u"government"), terms)
@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_most_similar_result_order(self):
results = self.index.most_similar(u"holiday", topn=4)
terms, _ = list(zip(*results))
expected_terms = [u"hollingworth", u"slowing", u"denied", u"government"]
expected_terms = [term for term in expected_terms if term in terms]
self.assertEqual(expected_terms, terms)

# check that the term itself is not returned
index = LevenshteinSimilarityIndex(self.dictionary)
terms = [term for term, similarity in index.most_similar(u"holiday", topn=len(self.dictionary))]
@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_most_similar_skips_self(self):
terms = [term for term, similarity in self.index.most_similar(u"holiday", topn=len(self.dictionary))]
self.assertFalse(u"holiday" in terms)

# check that the threshold works as expected
index = LevenshteinSimilarityIndex(self.dictionary, threshold=0.0)
results = list(index.most_similar(u"holiday", topn=10))
self.assertLess(0, len(results))
self.assertGreaterEqual(10, len(results))

index = LevenshteinSimilarityIndex(self.dictionary, threshold=1.0)
results = list(index.most_similar(u"holiday", topn=10))
self.assertEqual(0, len(results))

# check that the alpha works as expected
@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_most_similar_alpha(self):
index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0)
first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
index = LevenshteinSimilarityIndex(self.dictionary, alpha=2.0)
second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
self.assertTrue(numpy.allclose(2.0 * first_similarities, second_similarities))

# check that the beta works as expected
@unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
def test_most_similar_beta(self):
index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
first_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=2.0)
second_similarities = numpy.array([similarity for term, similarity in index.most_similar(u"holiday", topn=10)])
self.assertTrue(numpy.allclose(first_similarities ** 2.0, second_similarities))

# check proper integration with SparseTermSimilarityMatrix
index = LevenshteinSimilarityIndex(self.dictionary, alpha=1.0, beta=1.0)
similarity_matrix = SparseTermSimilarityMatrix(index, DICTIONARY)
self.assertTrue(scipy.sparse.issparse(similarity_matrix.matrix))


class TestWordEmbeddingSimilarityIndex(unittest.TestCase):
def setUp(self):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ def run(self):
'pyemd',
'nmslib',
'python-Levenshtein >= 0.10.2',
'vptree >= 1.2',
])

# Add additional requirements for testing on Linux that are skipped on Windows.
Expand Down

0 comments on commit bf904eb

Please sign in to comment.