Fix broken build and remove warning messages (#168)

J535D165 · Apr 7, 2022 · 4da0650 · 4da0650
1 parent 5b3230f
commit 4da0650
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 12 deletions.
diff --git a/recordlinkage/algorithms/string.py b/recordlinkage/algorithms/string.py
@@ -16,12 +16,12 @@ def jaro_similarity(s1, s2):
 
     conc = pandas.Series(list(zip(s1, s2)))
 
-    from jellyfish import jaro_distance
+    from jellyfish import jaro_similarity
 
     def jaro_apply(x):
 
         try:
-            return jaro_distance(x[0], x[1])
+            return jaro_similarity(x[0], x[1])
         except Exception as err:
             if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                 return np.nan
@@ -35,12 +35,12 @@ def jarowinkler_similarity(s1, s2):
 
     conc = pandas.Series(list(zip(s1, s2)))
 
-    from jellyfish import jaro_winkler
+    from jellyfish import jaro_winkler_similarity
 
     def jaro_winkler_apply(x):
 
         try:
-            return jaro_winkler(x[0], x[1])
+            return jaro_winkler_similarity(x[0], x[1])
         except Exception as err:
             if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                 return np.nan

diff --git a/recordlinkage/classifiers.py b/recordlinkage/classifiers.py
@@ -297,6 +297,9 @@ def match_cluster_center(self, value):
         if value is None:
             return
 
+        # this attribute is filled in KMeans.fit and is required for predict
+        self.kernel._n_threads = 1
+
         if not hasattr(self.kernel, 'cluster_centers_'):
             self.kernel.cluster_centers_ = numpy.empty((2, len(value)))
             self.kernel.cluster_centers_[:] = numpy.nan

diff --git a/recordlinkage/preprocessing/cleaning.py b/recordlinkage/preprocessing/cleaning.py
@@ -111,17 +111,17 @@ def strip_accents_fn_wrapper(x):
 
     # Remove all content between brackets
     if remove_brackets is True:
-        s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '')
+        s = s.str.replace(r'(\[.*?\]|\(.*?\)|\{.*?\})', '', regex=True)
 
     # Remove the special characters
     if replace_by_none:
-        s = s.str.replace(replace_by_none, '')
+        s = s.str.replace(replace_by_none, '', regex=True)
 
     if replace_by_whitespace:
-        s = s.str.replace(replace_by_whitespace, ' ')
+        s = s.str.replace(replace_by_whitespace, ' ', regex=True)
 
     # Remove multiple whitespaces
-    s = s.str.replace(r'\s\s+', ' ')
+    s = s.str.replace(r'\s\s+', ' ', regex=True)
 
     # Strip s
     s = s.str.lstrip().str.rstrip()
@@ -145,7 +145,7 @@ def phonenumbers(s):
     """
 
     # Remove all special tokens
-    s = s.astype(object).str.replace('[^0-9+]+', '')
+    s = s.astype(object).str.replace('[^0-9+]+', '', regex=True)
 
     return s
 

diff --git a/recordlinkage/preprocessing/encoding.py b/recordlinkage/preprocessing/encoding.py
@@ -77,7 +77,7 @@ def phonetic(s, method, concat=True, encoding='utf-8', decode_error='strict'):
             if type(x) == bytes else x)
 
     if concat:
-        s = s.str.replace(r"[\-\_\s]", "")
+        s = s.str.replace(r"[\-\_\s]", "", regex=True)
 
     for alg in _phonetic_algorithms:
         if method in alg['argument_names']:

diff --git a/setup.py b/setup.py
@@ -47,7 +47,7 @@ def read(fname):
     ],
     python_requires=">=3.5",
     install_requires=[
-        "jellyfish>=0.5.4",
+        "jellyfish>=0.8.0",
         "numpy>=1.13.0",
         "pandas>=1,<2",
         "scipy>=1",

diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -210,8 +210,9 @@ def test_encode_match_rating(self):
             np.nan, u'John', u'Mary Ann', u'billy', u'Jonathan', u'Gretha',
             u'Micheal', u'Sjors'
         ])
+        # jellyfish 0.8.0 changed the results of match_rating_codex
         expected = pd.Series([
-            np.nan, u'JHN', u'MRYNN', u'BLLY', u'JNTHN', u'GRTH', u'MCHL',
+            np.nan, u'JHN', u'MRYN', u'BLY', u'JNTHN', u'GRTH', u'MCHL',
             u'SJRS'
         ])