Skip to content

Commit

Permalink
Use DAWG for fast approximate kNN over Levenshtein distance
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed May 15, 2021
1 parent bf904eb commit df261b3
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 162 deletions.
6 changes: 3 additions & 3 deletions gensim/similarities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
import warnings
try:
import Levenshtein # noqa:F401
import vptree # noqa:F401
import lexpy # noqa:F401
except ImportError:
msg = (
"The gensim.similarities.levenshtein submodule is disabled, because the optional "
"Levenshtein <https://pypi.org/project/python-Levenshtein/> and "
"vptree <https://pypi.org/project/vptree/> packages are unavailable. "
"Install Levenhstein and vptree (e.g. `pip install python-Levenshtein vptree`) to "
"lexpy <https://pypi.org/project/lexpy/> packages are unavailable. "
"Install Levenhstein and lexpy (e.g. `pip install python-Levenshtein lexpy`) to "
"suppress this warning."
)
warnings.warn(msg)
Expand Down
127 changes: 33 additions & 94 deletions gensim/similarities/levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,102 +10,21 @@

import itertools
import logging
from math import floor

from gensim.similarities.termsim import TermSimilarityIndex

logger = logging.getLogger(__name__)


def levdist(t1, t2, max_distance=float("inf")):
    """Get the Levenshtein distance between two terms.

    Return the Levenshtein distance between two terms. The distance is a
    number between <0, inf>, higher is less similar.

    Parameters
    ----------
    t1 : {bytes, str, unicode}
        The first compared term.
    t2 : {bytes, str, unicode}
        The second compared term.
    max_distance : {int, float}, optional
        If you don't care about distances larger than a known threshold, a more
        efficient code path can be taken. For terms that are clearly "too far
        apart", we will not compute the distance exactly, but we will return
        `max(len(t1), len(t2))` more quickly, meaning "more than
        `max_distance`".
        Default: always compute distance exactly, no threshold clipping.

    Returns
    -------
    int
        The Levenshtein distance between `t1` and `t2`, or
        `max(len(t1), len(t2))` when that distance exceeds `max_distance`.

    """
    longest = max(len(t1), len(t2))

    # A single edit changes the length of a term by at most one, so the
    # difference in lengths is a lower bound on the distance. When even that
    # bound exceeds the threshold, the exact computation can be skipped.
    if abs(len(t1) - len(t2)) > max_distance:
        return longest

    # Optional dependency, imported only when an exact distance is needed.
    import Levenshtein

    distance = Levenshtein.distance(t1, t2)
    if distance > max_distance:
        return longest
    return distance


def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0):
    """Get the Levenshtein similarity between two terms.

    Return the Levenshtein similarity between two terms. The similarity is a
    number between <0.0, alpha>, higher is more similar.

    Parameters
    ----------
    t1 : {bytes, str, unicode}
        The first compared term.
    t2 : {bytes, str, unicode}
        The second compared term.
    alpha : float, optional
        The multiplicative factor alpha defined by Charlet and Damnati (2017).
        Must be non-negative.
    beta : float, optional
        The exponential factor beta defined by Charlet and Damnati (2017).
        Must be non-negative.
    min_similarity : {int, float}, optional
        If you don't care about similarities smaller than a known threshold, a
        more efficient code path can be taken. For terms that are clearly "too
        far apart", we will not compute the distance exactly, but we will
        return zero more quickly, meaning "less than `min_similarity`".
        The value is clamped to <0.0, 1.0>.
        Default: always compute similarity exactly, no threshold clipping.

    Returns
    -------
    float
        The Levenshtein similarity between `t1` and `t2`.

    Raises
    ------
    AssertionError
        If `alpha` or `beta` is negative.

    Notes
    -----
    This notion of Levenshtein similarity was first defined in section 2.2 of
    `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3:
    Soft-Cosine Semantic Similarity between Questions for Community Question
    Answering", 2017 <http://www.aclweb.org/anthology/S/S17/S17-2051.pdf>`_.

    """
    # Kept as assertions: existing callers rely on AssertionError here.
    assert alpha >= 0
    assert beta >= 0

    # Degenerate hyperparameters make the result independent of the terms, and
    # the threshold formula below would divide by zero for them:
    # alpha == 0 scales every similarity to zero, beta == 0 raises the
    # normalized closeness to the zeroth power (always 1), leaving alpha.
    if alpha == 0:
        return 0.0
    if beta == 0:
        return float(alpha)

    max_lengths = max(len(t1), len(t2)) or 1  # avoid 0/0 for two empty terms
    min_similarity = float(max(min(min_similarity, 1.0), 0.0))
    # Invert the similarity formula to get the largest distance that can still
    # reach `min_similarity`; anything farther is clipped by `levdist`.
    max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta))))
    distance = levdist(t1, t2, max_distance)
    similarity = alpha * (1 - distance * 1.0 / max_lengths)**beta
    return similarity


class LevenshteinSimilarityIndex(TermSimilarityIndex):
r"""
Computes Levenshtein similarities between terms and retrieves most similar
terms for a given term.
Notes
-----
This implementation uses a VP-Tree for metric indexing.
This implementation uses a Directed Acyclic Word Graph (DAWG)
for fast nearest-neighbor retrieval of the most similar terms.
Parameters
----------
Expand All @@ -115,17 +34,25 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex):
The multiplicative factor alpha defined by [charletetal17]_.
beta : float, optional
The exponential factor beta defined by [charletetal17]_.
max_distance : int, optional
The maximum Levenshtein distance of the most similar terms.
Keeping this value below 3 has a significant impact on the
retrieval performance. Default is 1.
Attributes
----------
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
A dictionary that specifies the considered terms.
alpha : float, optional
alpha : float
The multiplicative factor alpha defined by [charletetal17]_.
beta : float, optional
beta : float
The exponential factor beta defined by [charletetal17]_.
index : :class:`vptree.VPTree`
The VP-Tree metric index.
index : :class:`lexpy.dawg.DAWG`
The DAWG nearest-neighbor search index.
max_distance : int
The maximum Levenshtein distance of the most similar terms.
Keeping this value below 3 has a significant impact on the
retrieval performance.
See Also
--------
Expand All @@ -145,17 +72,29 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex):
https://www.aclweb.org/anthology/S17-2051/.
"""
def __init__(self, dictionary, alpha=1.8, beta=5.0):
from vptree import VPTree

def __init__(self, dictionary, alpha=1.8, beta=5.0, max_distance=1):
self.dictionary = dictionary
self.alpha = alpha
self.beta = beta
terms = list(self.dictionary.values())
self.index = VPTree(terms, levdist)
self.max_distance = max_distance

from lexpy.dawg import DAWG

self.index = DAWG()
terms = sorted(self.dictionary.values())
self.index.add_all(terms)
self.index.reduce()

super(LevenshteinSimilarityIndex, self).__init__()

def _levsim(self, t1, t2):
    """Return the Levenshtein similarity of `t1` and `t2`.

    The edit distance is normalized by the length of the longer term,
    turned into a closeness score, raised to the power `self.beta` and
    scaled by `self.alpha`.
    """
    from Levenshtein import distance

    # Two empty terms would otherwise cause a division by zero.
    longest = max(len(t1), len(t2)) or 1
    closeness = 1.0 - distance(t1, t2) * 1.0 / longest
    return self.alpha * closeness**self.beta

def most_similar(self, t1, topn=10):
terms = [term for _, term in self.index.get_n_nearest_neighbors(t1, int(topn + 1))]
most_similar = ((t2, levsim(t1, t2, self.alpha, self.beta)) for t2 in terms if t1 != t2)
terms = self.index.search_within_distance(t1, self.max_distance)
most_similar = ((t2, self._levsim(t1, t2)) for t2 in terms if t1 != t2)
return itertools.islice(most_similar, topn)
64 changes: 0 additions & 64 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import LevenshteinSimilarityIndex
from gensim.similarities.docsim import _nlargest
from gensim.similarities.levenshtein import levdist, levsim

try:
from pyemd import emd # noqa:F401
Expand Down Expand Up @@ -1544,69 +1543,6 @@ def test_inner_product_corpus_corpus_true_true(self):
self.assertTrue(numpy.allclose(expected_result, result.todense()))


class TestLevenshteinDistance(unittest.TestCase):
    """Checks how `levdist` clips distances that exceed `max_distance`."""

    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
    def test_max_distance(self):
        longer, shorter = "holiday", "day"
        clipped = max(len(longer), len(shorter))

        # The exact distance is 4, so no threshold or a threshold of 4 keeps it.
        for threshold_args in ((), (4,)):
            self.assertEqual(4, levdist(longer, shorter, *threshold_args))

        # Any smaller threshold yields the length of the longer term instead.
        for threshold in (2, -2):
            self.assertEqual(clipped, levdist(longer, shorter, threshold))


class TestLevenshteinSimilarity(unittest.TestCase):
    """Checks `levsim` on empty terms, invalid hyperparameters and thresholds."""

    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
    def test_empty_strings(self):
        # Two empty terms are identical, so the similarity is the default alpha.
        default_alpha = 1.8
        self.assertEqual(default_alpha, levsim("", ""))

    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
    def test_negative_hyperparameters(self):
        first, second = "holiday", "day"
        alpha = beta = 2.0

        # A negative alpha, a negative beta, or both must be rejected.
        for bad_alpha, bad_beta in ((-alpha, beta), (alpha, -beta), (-alpha, -beta)):
            with self.assertRaises(AssertionError):
                levsim(first, second, bad_alpha, bad_beta)

    @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled")
    def test_min_similarity(self):
        first, second = "holiday", "day"
        alpha = beta = 2.0
        expected = alpha * (1 - 4.0 / 7)**beta
        assert 0.1 < expected < 0.5

        self.assertAlmostEqual(expected, levsim(first, second, alpha, beta))

        # Thresholds at or below zero never clip the similarity.
        for threshold in (-2, -2.0, 0, 0.0):
            self.assertAlmostEqual(expected, levsim(first, second, alpha, beta, threshold))

        # A threshold below the actual similarity leaves it untouched.
        self.assertEqual(expected, levsim(first, second, alpha, beta, 0.1))

        # Thresholds above the actual similarity clip the result to zero.
        for threshold in (0.5, 1.0, 2, 2.0):
            self.assertEqual(0.0, levsim(first, second, alpha, beta, threshold))


class TestLevenshteinSimilarityIndex(unittest.TestCase):
def setUp(self):
self.documents = [[u"government", u"denied", u"holiday"], [u"holiday", u"slowing", u"hollingworth"]]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def run(self):
'pyemd',
'nmslib',
'python-Levenshtein >= 0.10.2',
'vptree >= 1.2',
'lexpy >= 0.9.8',
])

# Add additional requirements for testing on Linux that are skipped on Windows.
Expand Down

0 comments on commit df261b3

Please sign in to comment.