Skip to content

Commit

Permalink
TFIDF: Add option to omit removing n-grams with space (#76)
Browse files Browse the repository at this point in the history
  • Loading branch information
Laubeee authored Mar 3, 2024
1 parent e754003 commit 5d0734b
Showing 1 changed file with 9 additions and 3 deletions.
12 changes: 9 additions & 3 deletions polyfuzz/models/_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,13 @@ class TFIDF(BaseMatcher):
knn uses 1-nearest neighbor to extract the most similar strings
it is significantly slower than both methods but requires little memory
model_id: The name of the particular instance, used when comparing models
remove_space_ngrams: Whether to discard n-grams that contain a space (default: True); set to False to keep them
Usage:
```python
from polyfuzz.models import TFIDF
model = TFIDF(n_gram_range=(3, 3), clean_string=True, use_knn=False)
model = TFIDF(n_gram_range=(3, 3), clean_string=True)
```
"""
def __init__(self,
Expand All @@ -51,7 +52,8 @@ def __init__(self,
min_similarity: float = 0.75,
top_n: int = 1,
cosine_method: str = "sparse",
model_id: str = None):
model_id: str = None,
remove_space_ngrams = True):
super().__init__(model_id)
self.type = "TF-IDF"
self.n_gram_range = n_gram_range
Expand All @@ -61,6 +63,7 @@ def __init__(self,
self.top_n = top_n
self.vectorizer = None
self.tf_idf_to = None
self.remove_space_ngrams = remove_space_ngrams

def match(self,
from_list: List[str],
Expand Down Expand Up @@ -127,7 +130,10 @@ def _create_ngrams(self, string: str) -> List[str]:
result = []
for n in range(self.n_gram_range[0], self.n_gram_range[1]+1):
ngrams = zip(*[string[i:] for i in range(n)])
ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
if self.remove_space_ngrams:
ngrams = [''.join(ngram) for ngram in ngrams if ' ' not in ngram]
else:
ngrams = [''.join(ngram) for ngram in ngrams]
result.extend(ngrams)

return result
Expand Down

0 comments on commit 5d0734b

Please sign in to comment.