Skip to content

Commit

Permalink
Merge pull request #5 from x-tabdeveloping/levenshtein
Browse files Browse the repository at this point in the history
Refine results with Levenshtein distance
  • Loading branch information
x-tabdeveloping authored Sep 6, 2024
2 parents 20dcb51 + c2874a6 commit 2f3e7ab
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 6 deletions.
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,20 @@ Blazing fast, lightweight and customizable fuzzy and semantic text search in Pyt
Neofuzz is a fuzzy search library based on vectorization and approximate nearest neighbour
search techniques.

### New in version 0.3.0
Now you can reorder your search results using Levenshtein distance!
Sometimes n-gram processes or vectorized processes don't quite order the results correctly.
In these cases you can retrieve a higher number of examples from the indexed corpus, then refine those results with Levenshtein distance.

```python
from neofuzz import char_ngram_process

process = char_ngram_process()
process.index(corpus)

process.extract("your query", limit=30, refine_levenshtein=True)
```

### Why is Neofuzz fast?
Most fuzzy search libraries rely on optimizing the hell out of the same couple of fuzzy search algorithms (Hamming distance, Levenshtein distance). Unfortunately, due to the complexity of these algorithms, sometimes no amount of optimization will get you the speed that you want.

Expand Down Expand Up @@ -93,7 +107,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
process = Process(vectorizer, metric="cosine")
```

### Dimentionality Reduction
### Dimensionality Reduction

You might find that the speed of your fuzzy search process is not sufficient. In this case it might be desirable to reduce the dimensionality of the produced vectors with some matrix decomposition method or topic model.

Expand All @@ -107,7 +121,7 @@ from sklearn.pipeline import make_pipeline

# Vectorization with tokens again
vectorizer = TfidfVectorizer()
# Dimentionality reduction method to 20 dimentions
# Dimensionality reduction method to 20 dimensions
nmf = NMF(n_components=20)
# Create a pipeline of the two
pipeline = make_pipeline(vectorizer, nmf)
Expand Down
49 changes: 46 additions & 3 deletions neofuzz/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pynndescent
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import pairwise_distances
from thefuzz import process as thefuzz_process


class Process:
Expand All @@ -20,6 +21,8 @@ class Process:
Some kind of vectorizer model that can vectorize strings.
You could use tf-idf, bow or even a Pipeline that
has multiple steps.
refine_levenshtein: bool, default False
Indicates whether final results should be refined with the Levenshtein algorithm
metric: string or callable, default 'cosine'
The metric to use for computing nearest neighbors. If a callable is
used it must be a numba njit compiled function. Supported metrics
Expand Down Expand Up @@ -143,6 +146,7 @@ class Process:
def __init__(
self,
vectorizer,
refine_levenshtein=False,
metric="cosine",
metric_kwds=None,
n_neighbors=30,
Expand All @@ -165,6 +169,7 @@ def __init__(
verbose=False,
):
self.vectorizer = vectorizer
self.refine_levenshtein = refine_levenshtein
self.nearest_neighbours_kwargs = {
"metric": metric,
"metric_kwds": metric_kwds,
Expand Down Expand Up @@ -213,7 +218,10 @@ def index(self, options: Iterable[str]):
self.nearest_neighbours.prepare()

def query(
self, search_terms: Iterable[str], limit: int = 10
self,
search_terms: Iterable[str],
limit: int = 10,
refine_levenshtein: Optional[bool] = None,
) -> Tuple[np.ndarray, np.ndarray]:
"""Searches for the given terms in the options.
Expand All @@ -223,6 +231,11 @@ def query(
Terms to search for.
limit: int, default 10
Amount of closest matches to return.
refine_levenshtein: bool, default None
Indicates whether results should be refined with Levenshtein distance
using TheFuzz.
This can increase the accuracy of your results.
If not specified, the process's attribute is used.
Returns
-------
Expand All @@ -237,13 +250,36 @@ def query(
" please index before querying."
)
search_matrix = self.vectorizer.transform(search_terms)
return self.nearest_neighbours.query(search_matrix, k=limit)
indices, distances = self.nearest_neighbours.query(
search_matrix, k=limit
)
if refine_levenshtein is None:
refine_levenshtein = self.refine_levenshtein
if refine_levenshtein:
refined_indices = []
refined_distances = []
for term, idx in zip(search_terms, indices):
options = list(self.options[idx])
res = thefuzz_process.extract(
term, options, limit=len(options)
)
res_indices = []
res_dist = []
for result_term, result_sim in res:
res_indices.append(idx[options.index(result_term)])
res_dist.append(1 - (result_sim / 100))
refined_indices.append(res_indices)
refined_distances.append(res_dist)
indices = np.stack(refined_indices)
distances = np.stack(refined_distances)
return indices, distances

def extract(
self,
query: str,
choices: Optional[Iterable[str]] = None,
limit: int = 10,
refine_levenshtein: Optional[bool] = None,
) -> List[Tuple[str, int]]:
"""TheFuzz compatible querying.
Expand All @@ -257,6 +293,11 @@ def extract(
it will be used for indexing.
limit: int, default 10
Number of results to return
refine_levenshtein: bool, default None
Indicates whether results should be refined with Levenshtein distance
using TheFuzz.
This can increase the accuracy of your results.
If not specified, the process's attribute is used.
Returns
-------
Expand All @@ -271,7 +312,9 @@ def extract(
"and no choices were provided."
)
self.index(options=choices)
indices, distances = self.query([query], limit=limit)
indices, distances = self.query(
[query], limit=limit, refine_levenshtein=refine_levenshtein
)
indices = np.ravel(indices)
distances = np.ravel(distances)
scores = (1 - distances) * 100
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
line-length=79
[tool.poetry]
name = "neofuzz"
version = "0.2.0"
version = "0.3.0"
description = "Blazing fast fuzzy text search for Python."
authors = ["Márton Kardos <power.up1163@gmail.com>"]
license = "MIT"
Expand All @@ -16,6 +16,7 @@ pynndescent = ">=0.5.0, <0.6.0"
numpy = ">=0.22.0, <2.0.0"
tokenizers = ">=0.19.0, <0.20.0"
joblib = ">=1.4.0, <1.5.0"
thefuzz = ">=0.22.0, <0.23.0"


[build-system]
Expand Down

0 comments on commit 2f3e7ab

Please sign in to comment.