From c86deb429b0fb71227c2afe66032077a3854216e Mon Sep 17 00:00:00 2001 From: Vit Novotny Date: Mon, 21 Jun 2021 20:22:14 +0200 Subject: [PATCH 1/4] Do not expect the same Unicode type in editdist --- gensim/similarities/fastss.pyx | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index 63469243c2..07203073d7 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -29,8 +29,8 @@ cdef extern from *: WIDTH * CYTHON_RESTRICT pos_new; WIDTH * CYTHON_RESTRICT pos_old; int row_flip = 1; /* Does pos_new represent row1 or row2? */ - int kind = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */ - if (kind != PyUnicode_KIND(s2)) return -1; + int kind1 = PyUnicode_KIND(s1); /* How many bytes per unicode codepoint? */ + int kind2 = PyUnicode_KIND(s2); WIDTH len_s1 = (WIDTH)PyUnicode_GET_LENGTH(s1); WIDTH len_s2 = (WIDTH)PyUnicode_GET_LENGTH(s2); @@ -39,7 +39,7 @@ cdef extern from *: const WIDTH tmpi = len_s1; len_s1 = len_s2; len_s2 = tmpi; } if (len_s2 - len_s1 > maximum) return maximum + 1; - if (len_s2 > MAX_WORD_LENGTH) return -2; + if (len_s2 > MAX_WORD_LENGTH) return -1; void * s1_data = PyUnicode_DATA(s1); void * s2_data = PyUnicode_DATA(s2); @@ -47,7 +47,7 @@ cdef extern from *: for (WIDTH i2 = 0; i2 < len_s2; i2++) { int all_bad = i2 >= maximum; - const Py_UCS4 ch = PyUnicode_READ(kind, s2_data, i2); + const Py_UCS4 ch = PyUnicode_READ(kind2, s2_data, i2); row_flip = 1 - row_flip; if (row_flip) { pos_new = row2; pos_old = row1; @@ -58,7 +58,7 @@ cdef extern from *: for (WIDTH i1 = 0; i1 < len_s1; i1++) { WIDTH val = *(pos_old++); - if (ch != PyUnicode_READ(kind, s1_data, i1)) { + if (ch != PyUnicode_READ(kind1, s1_data, i1)) { const WIDTH _val1 = *pos_old; const WIDTH _val2 = *pos_new; if (_val1 < val) val = _val1; @@ -96,8 +96,6 @@ def editdist(s1: str, s2: str, max_dist=None): if result >= 0: return result elif result == -1: - raise ValueError("incompatible types of unicode strings") - elif result == -2: raise ValueError(f"editdist doesn't support strings longer than {MAX_WORD_LENGTH} characters") else: raise ValueError(f"editdist returned an error: {result}") From 6c9dceab471efe6b658d444afd99ab7e3c7bd375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 22 Jun 2021 22:16:31 +0200 Subject: [PATCH 2/4] Unit-test editdist --- gensim/test/test_similarities.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 2dda8d8a30..0a2dee5ccd 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -33,6 +33,7 @@ from gensim.similarities import SparseTermSimilarityMatrix from gensim.similarities import LevenshteinSimilarityIndex from gensim.similarities.docsim import _nlargest +from gensim.similarities.fastss import editdist try: from pyemd import emd # noqa:F401 @@ -1631,6 +1632,32 @@ def test_most_similar(self): self.assertTrue(numpy.allclose(first_similarities**2.0, second_similarities)) +class TestFastSS(unittest.TestCase): + def test_editdist_same_unicode_kind_latin1(self): + """Test editdist returns the expected result with two Latin-1 strings.""" + expected = 2 + actual = editdist('Zizka', 'siska') + self.assertEqual(expected, actual) + + def test_editdist_same_unicode_kind_ucs2(self): + """Test editdist returns the expected result with two UCS-2 strings.""" + expected = 2 + actual = editdist('Žižka', 'šiška') + self.assertEqual(expected, actual) + + def test_editdist_same_unicode_kind_ucs4(self): + """Test editdist returns the expected result with two UCS-4 strings.""" + expected = 2 + actual = editdist('Žižka 😀', 'šiška 😀') + self.assertEqual(expected, actual) + + def test_editdist_different_unicode_kinds(self): + """Test editdist returns the expected result with strings of different Unicode kinds.""" + expected = 2 + actual = editdist('Žižka', 'siska') + self.assertEqual(expected, actual) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From b9350179bbab7c1e9c4ac2f727dda6439c440c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 22 Jun 2021 23:36:57 +0200 Subject: [PATCH 3/4] Use pytest assertion syntax in unit tests --- gensim/test/test_similarities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 0a2dee5ccd..35ddd03397 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -1637,25 +1637,25 @@ def test_editdist_same_unicode_kind_latin1(self): """Test editdist returns the expected result with two Latin-1 strings.""" expected = 2 actual = editdist('Zizka', 'siska') - self.assertEqual(expected, actual) + assert expected == actual def test_editdist_same_unicode_kind_ucs2(self): """Test editdist returns the expected result with two UCS-2 strings.""" expected = 2 actual = editdist('Žižka', 'šiška') - self.assertEqual(expected, actual) + assert expected == actual def test_editdist_same_unicode_kind_ucs4(self): """Test editdist returns the expected result with two UCS-4 strings.""" expected = 2 actual = editdist('Žižka 😀', 'šiška 😀') - self.assertEqual(expected, actual) + assert expected == actual def test_editdist_different_unicode_kinds(self): """Test editdist returns the expected result with strings of different Unicode kinds.""" expected = 2 actual = editdist('Žižka', 'siska') - self.assertEqual(expected, actual) + assert expected == actual if __name__ == '__main__': From eeee1738d199788f633f1de0f06a70062590e5c3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jun 2021 14:22:38 +0900 Subject: [PATCH 4/4] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 423ce2b389..6385b2caf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Changes * [#3116](https://github.com/RaRe-Technologies/gensim/pull/3116): Fix bug where saved Phrases model did not load its connector_words, by [@aloknayak29](https://github.com/aloknayak29) * [#3136](https://github.com/RaRe-Technologies/gensim/pull/3136): fix indexing error in word2vec_inner.pyx, by [@bluekura](https://github.com/bluekura) * [#3174](https://github.com/RaRe-Technologies/gensim/pull/3174): Fix a bug when upgrading phraser from gensim 3.x to 4.0, by [@emgucv](https://github.com/emgucv) +* [#3178](https://github.com/RaRe-Technologies/gensim/pull/3178): Fix Unicode string incompatibility in gensim.similarities.fastss.editdist, by [@Witiko](https://github.com/Witiko) ### :+1: Improvements