Skip to content

Commit

Permalink
Fix broken build and remove warning messages (#168)
Browse files Browse the repository at this point in the history
  • Loading branch information
twalen authored Apr 7, 2022
1 parent 5b3230f commit 4da0650
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 12 deletions.
8 changes: 4 additions & 4 deletions recordlinkage/algorithms/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ def jaro_similarity(s1, s2):

conc = pandas.Series(list(zip(s1, s2)))

from jellyfish import jaro_distance
from jellyfish import jaro_similarity

def jaro_apply(x):

try:
return jaro_distance(x[0], x[1])
return jaro_similarity(x[0], x[1])
except Exception as err:
if pandas.isnull(x[0]) or pandas.isnull(x[1]):
return np.nan
Expand All @@ -35,12 +35,12 @@ def jarowinkler_similarity(s1, s2):

conc = pandas.Series(list(zip(s1, s2)))

from jellyfish import jaro_winkler
from jellyfish import jaro_winkler_similarity

def jaro_winkler_apply(x):

try:
return jaro_winkler(x[0], x[1])
return jaro_winkler_similarity(x[0], x[1])
except Exception as err:
if pandas.isnull(x[0]) or pandas.isnull(x[1]):
return np.nan
Expand Down
3 changes: 3 additions & 0 deletions recordlinkage/classifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,9 @@ def match_cluster_center(self, value):
if value is None:
return

# this attribute is normally set by KMeans.fit and is required by predict
self.kernel._n_threads = 1

if not hasattr(self.kernel, 'cluster_centers_'):
self.kernel.cluster_centers_ = numpy.empty((2, len(value)))
self.kernel.cluster_centers_[:] = numpy.nan
Expand Down
10 changes: 5 additions & 5 deletions recordlinkage/preprocessing/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,17 @@ def strip_accents_fn_wrapper(x):

# Remove all content between brackets
if remove_brackets is True:
s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '', regex=True)

# Remove the special characters
if replace_by_none:
s = s.str.replace(replace_by_none, '')
s = s.str.replace(replace_by_none, '', regex=True)

if replace_by_whitespace:
s = s.str.replace(replace_by_whitespace, ' ')
s = s.str.replace(replace_by_whitespace, ' ', regex=True)

# Remove multiple whitespaces
s = s.str.replace(r'\s\s+', ' ')
s = s.str.replace(r'\s\s+', ' ', regex=True)

# Strip s
s = s.str.lstrip().str.rstrip()
Expand All @@ -145,7 +145,7 @@ def phonenumbers(s):
"""

# Remove all special tokens
s = s.astype(object).str.replace('[^0-9+]+', '')
s = s.astype(object).str.replace('[^0-9+]+', '', regex=True)

return s

Expand Down
2 changes: 1 addition & 1 deletion recordlinkage/preprocessing/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def phonetic(s, method, concat=True, encoding='utf-8', decode_error='strict'):
if type(x) == bytes else x)

if concat:
s = s.str.replace(r"[\-\_\s]", "")
s = s.str.replace(r"[\-\_\s]", "", regex=True)

for alg in _phonetic_algorithms:
if method in alg['argument_names']:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def read(fname):
],
python_requires=">=3.5",
install_requires=[
"jellyfish>=0.5.4",
"jellyfish>=0.8.0",
"numpy>=1.13.0",
"pandas>=1,<2",
"scipy>=1",
Expand Down
3 changes: 2 additions & 1 deletion tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,9 @@ def test_encode_match_rating(self):
np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan', u'Gretha',
u'Micheal', u'Sjors'
])
# the results of jellyfish.match_rating_codex changed in jellyfish version 0.8.0
expected = pd.Series([
np.nan, u'JHN', u'MRYNN', u'BLLY', u'JNTHN', u'GRTH', u'MCHL',
np.nan, u'JHN', u'MRYN', u'BLY', u'JNTHN', u'GRTH', u'MCHL',
u'SJRS'
])

Expand Down

0 comments on commit 4da0650

Please sign in to comment.