diff --git a/README.md b/README.md index e25cc49..94791c6 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,20 @@ Blazing fast, lightweight and customizable fuzzy and semantic text search in Pyt Neofuzz is a fuzzy search library based on vectorization and approximate nearest neighbour search techniques. +### New in version 0.3.0 +Now you can reorder your search results using Levenshtein distance! +Sometimes n-gram processes or vectorized processes don't quite order the results correctly. +In these cases you can retrieve a higher number of examples from the indexed corpus, then refine those results with Levenshtein distance. + +```python +from neofuzz import char_ngram_process + +process = char_ngram_process() +process.index(corpus) + +process.extract("your query", limit=30, refine_levenshtein=True) +``` + ### Why is Neofuzz fast? Most fuzzy search libraries rely on optimizing the hell out of the same couple of fuzzy search algorithms (Hamming distance, Levenshtein distance). Sometimes unfortunately due to the complexity of these algorithms, no amount of optimization will get you the speed, that you want. @@ -93,7 +107,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer process = Process(vectorizer, metric="cosine") ``` -### Dimentionality Reduction +### Dimensionality Reduction You might find that the speed of your fuzzy search process is not sufficient. In this case it might be desirable to reduce the dimentionality of the produced vectors with some matrix decomposition method or topic model. @@ -107,7 +121,7 @@ from sklearn.pipeline import make_pipeline # Vectorization with tokens again vectorizer = TfidfVectorizer() -# Dimentionality reduction method to 20 dimentions +# Dimensionality reduction method to 20 dimensions nmf = NMF(n_components=20) # Create a pipeline of the two pipeline = make_pipeline(vectorizer, nmf) diff --git a/neofuzz/process.py b/neofuzz/process.py index 77bec6b..c0cf8e4 100644 --- a/neofuzz/process.py +++ b/neofuzz/process.py @@ -7,6 +7,7 @@ import pynndescent from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics import pairwise_distances +from thefuzz import process as thefuzz_process class Process: @@ -20,6 +21,8 @@ class Process: Some kind of vectorizer model that can vectorize strings. You could use tf-idf, bow or even a Pipeline that has multiple steps. + refine_levenshtein: bool, default False + Indicates whether final results should be refined with the Levenshtein algorithm metric: string or callable, default 'cosine' The metric to use for computing nearest neighbors. If a callable is used it must be a numba njit compiled function. Supported metrics @@ -143,6 +146,7 @@ class Process: def __init__( self, vectorizer, + refine_levenshtein=False, metric="cosine", metric_kwds=None, n_neighbors=30, @@ -165,6 +169,7 @@ def __init__( verbose=False, ): self.vectorizer = vectorizer + self.refine_levenshtein = refine_levenshtein self.nearest_neighbours_kwargs = { "metric": metric, "metric_kwds": metric_kwds, @@ -213,7 +218,10 @@ def index(self, options: Iterable[str]): self.nearest_neighbours.prepare() def query( - self, search_terms: Iterable[str], limit: int = 10 + self, + search_terms: Iterable[str], + limit: int = 10, + refine_levenshtein: Optional[bool] = None, ) -> Tuple[np.ndarray, np.ndarray]: """Searches for the given terms in the options. @@ -223,6 +231,11 @@ def query( Terms to search for. limit: int, default 10 Amount of closest matches to return. + refine_levenshtein: bool, default None + Indicates whether results should be refined with Levenshtein distance + using TheFuzz. + This can increase the accuracy of your results. + If not specified, the process's attribute is used. Parameters ---------- @@ -237,13 +250,36 @@ def query( " please index before querying." ) search_matrix = self.vectorizer.transform(search_terms) - return self.nearest_neighbours.query(search_matrix, k=limit) + indices, distances = self.nearest_neighbours.query( + search_matrix, k=limit + ) + if refine_levenshtein is None: + refine_levenshtein = self.refine_levenshtein + if refine_levenshtein: + refined_indices = [] + refined_distances = [] + for term, idx in zip(search_terms, indices): + options = list(self.options[idx]) + res = thefuzz_process.extract( + term, options, limit=len(options) + ) + res_indices = [] + res_dist = [] + for result_term, result_sim in res: + res_indices.append(idx[options.index(result_term)]) + res_dist.append(1 - (result_sim / 100)) + refined_indices.append(res_indices) + refined_distances.append(res_dist) + indices = np.stack(refined_indices) + distances = np.stack(refined_distances) + return indices, distances def extract( self, query: str, choices: Optional[Iterable[str]] = None, limit: int = 10, + refine_levenshtein: Optional[bool] = None, ) -> List[Tuple[str, int]]: """TheFuzz compatible querying. @@ -257,6 +293,11 @@ def extract( it will be used for indexing. limit: int, default 10 Number of results to return + refine_levenshtein: bool, default None + Indicates whether results should be refined with Levenshtein distance + using TheFuzz. + This can increase the accuracy of your results. + If not specified, the process's attribute is used. Returns ------- @@ -271,7 +312,9 @@ def extract( "and no choices were provided." ) self.index(options=choices) - indices, distances = self.query([query], limit=limit) + indices, distances = self.query( + [query], limit=limit, refine_levenshtein=refine_levenshtein + ) indices = np.ravel(indices) distances = np.ravel(distances) scores = (1 - distances) * 100 diff --git a/pyproject.toml b/pyproject.toml index 642a1fb..9c60e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ line-length=79 [tool.poetry] name = "neofuzz" -version = "0.2.0" +version = "0.3.0" description = "Blazing fast fuzzy text search for Python." authors = ["Márton Kardos "] license = "MIT" @@ -16,6 +16,7 @@ pynndescent = ">=0.5.0, <0.6.0" numpy = ">=0.22.0, <2.0.0" tokenizers = ">=0.19.0, <0.20.0" joblib = ">=1.4.0, <1.5.0" +thefuzz = ">=0.22.0, <0.23.0" [build-system]