From 3a75ddb0dda38e59bd1e034390933ec39a1ab0ff Mon Sep 17 00:00:00 2001 From: Seth M Morton Date: Fri, 18 Aug 2017 21:08:01 -0700 Subject: [PATCH 1/3] Add unicode normalization to all input. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All unicode input now gets 'NFD' normalization, which ensures that all characters that look the same are represented by the same code points. 'NFD' was chosen because it is the expanded form, which will cause (for example) 'é' to be placed immediately after 'e' rather than after 'z'. Users can choose 'NFKD' with ns.COMPATIBILITYNORMALIZE (or ns.CN) which will change certain characters to their compatible (and often ASCII) representation. This may be useful to force numbers in odd representations to be transformed to ASCII which will potentially give better sorting orders. This will close issue #44. --- natsort/ns_enum.py | 38 +++++++++++-------- natsort/utils.py | 15 +++++++- .../test_input_string_transform_factory.py | 34 +++++++++++------ test_natsort/test_natsorted.py | 6 ++- test_natsort/test_utils.py | 1 + 5 files changed, 64 insertions(+), 30 deletions(-) diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index e5ffbf56..37a00deb 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -39,7 +39,7 @@ class ns(object): This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful when attempting to sort real numbers. NOEXP, N - Tell `natsort` to not search for exponents as part of the number. + Tell `natsort` to not search for exponents as part of a float number. For example, with `NOEXP` the number "5.6E5" would be interpreted as `5.6`, `"E"`, and `5` instead of `560000`. PATH, P @@ -51,6 +51,13 @@ class ns(object): sorted properly; 'Folder/' will be placed at the end, not at the front. It is the same as setting the old `as_path` option to `True`. + COMPATIBILITYNORMALIZE, CN + Use the "NFKD" unicode normalization form on input rather than the + default "NFD". 
This will transform characters such as '⑦' into + '7'. Please see https://stackoverflow.com/a/7934397/1399279, + https://stackoverflow.com/a/7931547/1399279, + and http://unicode.org/reports/tr15/ for full details on unicode + normalization. LOCALE, L Tell `natsort` to be locale-aware when sorting. This includes both proper sorting of alphabetical characters as well as proper @@ -129,20 +136,21 @@ class ns(object): # The below are options. The values are stored as powers of two # so bitmasks can be used to extract the user's requested options. - FLOAT = F = 1 << 0 - SIGNED = S = 1 << 1 - REAL = R = FLOAT | SIGNED - NOEXP = N = 1 << 2 - PATH = P = 1 << 3 - LOCALEALPHA = LA = 1 << 4 - LOCALENUM = LN = 1 << 5 - LOCALE = L = LOCALEALPHA | LOCALENUM - IGNORECASE = IC = 1 << 6 - LOWERCASEFIRST = LF = 1 << 7 - GROUPLETTERS = G = 1 << 8 - UNGROUPLETTERS = UG = 1 << 9 - CAPITALFIRST = C = UNGROUPLETTERS - NANLAST = NL = 1 << 10 + FLOAT = F = 1 << 0 + SIGNED = S = 1 << 1 + REAL = R = FLOAT | SIGNED + NOEXP = N = 1 << 2 + PATH = P = 1 << 3 + LOCALEALPHA = LA = 1 << 4 + LOCALENUM = LN = 1 << 5 + LOCALE = L = LOCALEALPHA | LOCALENUM + IGNORECASE = IC = 1 << 6 + LOWERCASEFIRST = LF = 1 << 7 + GROUPLETTERS = G = 1 << 8 + UNGROUPLETTERS = UG = 1 << 9 + CAPITALFIRST = C = UNGROUPLETTERS + NANLAST = NL = 1 << 10 + COMPATIBILITYNORMALIZE = CN = 1 << 11 # The below are private options for internal use only. _NUMERIC_ONLY = REAL | NOEXP diff --git a/natsort/utils.py b/natsort/utils.py index c21d3b40..28f1487d 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -54,6 +54,7 @@ from collections import deque from functools import partial, reduce from operator import methodcaller +from unicodedata import normalize # Local imports. from natsort.ns_enum import ns @@ -267,11 +268,23 @@ def _input_string_transform_factory(alg): # Shortcuts. 
lowfirst = alg & ns.LOWERCASEFIRST dumb = alg & ns._DUMB + normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' + + if NEWPY: + careful_normalize = partial(normalize, normalization_form) + else: + def careful_normalize(x): + """Normalize unicode input.""" + if isinstance(x, py23_str): # unicode + return normalize(normalization_form, x) + else: + return x # Build the chain of functions to execute in order. - function_chain = [] + function_chain = [careful_normalize] if (dumb and not lowfirst) or (lowfirst and not dumb): function_chain.append(methodcaller('swapcase')) + if alg & ns.IGNORECASE: if NEWPY: function_chain.append(methodcaller('casefold')) diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py index 3dbd8433..97acf216 100644 --- a/test_natsort/test_input_string_transform_factory.py +++ b/test_natsort/test_input_string_transform_factory.py @@ -5,6 +5,7 @@ import pytest import locale from operator import methodcaller +from unicodedata import normalize from natsort.ns_enum import ns from natsort.utils import _input_string_transform_factory from natsort.compat.py23 import NEWPY @@ -28,12 +29,22 @@ def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples(): x = 'feijGGAd' - assert _input_string_transform_factory(0)(x) is x + assert _input_string_transform_factory(0)(x) == x @given(text()) -def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): - assert _input_string_transform_factory(0)(x) is x +def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x): + assert _input_string_transform_factory(0)(x) == normalize('NFD', x) + + +def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples(): + x = '⑦' + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7' + + +@given(text()) +def 
test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x): + assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples(): @@ -47,9 +58,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl @given(text()) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower() def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @@ -59,7 +70,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @given(text()) def test_input_string_transform_factory_performs_swapcase_with_DUMB(x): - assert _input_string_transform_factory(ns._DUMB)(x) == x.swapcase() + assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example(): @@ -69,18 +80,17 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex @given(text()) def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x): - x = 'feijGGAd' - assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase() + assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase() def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example(): x = 'feijGGAd' - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x 
@given(text()) def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x): - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example(): @@ -94,9 +104,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE @given(text()) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower() def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example(): diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 146997a7..388e209f 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_ def test_natsorted_with_mixed_input_returns_sorted_results_without_error(): + a = ['0', 'Á', '2', 'Z'] + assert natsorted(a) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a) == [1.5, '2', 3, 'ä', 'b'] def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST(): @@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu def 
test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py index 934757a9..f1cffa20 100644 --- a/test_natsort/test_utils.py +++ b/test_natsort/test_utils.py @@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected(): assert ns.CAPITALFIRST == ns.C assert ns.UNGROUPLETTERS == ns.CAPITALFIRST assert ns.NANLAST == ns.NL + assert ns.COMPATIBILITYNORMALIZE == ns.CN # Convenience assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM From 06a67bf5d4a3ba7de1e3104f87df911b86d1511b Mon Sep 17 00:00:00 2001 From: Seth M Morton Date: Fri, 18 Aug 2017 23:22:39 -0700 Subject: [PATCH 2/3] Update documentation to discuss Unicode normalization. --- README.rst | 1 + docs/source/howitworks.rst | 89 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 3a283f23..61ddb963 100644 --- a/README.rst +++ b/README.rst @@ -234,6 +234,7 @@ Other Useful Things +++++++++++++++++++ - recursively descend into lists of lists + - automatic unicode normalization of input data - `controlling the case-sensitivity `_ - `sorting file paths correctly `_ - `allow custom sorting keys `_ diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index f15db8fe..22de2a52 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -655,12 +655,14 @@ StdLib there can't be too many dragons, right? 
- https://github.com/SethMMorton/natsort/issues/22 - https://github.com/SethMMorton/natsort/issues/23 - https://github.com/SethMMorton/natsort/issues/36 + - https://github.com/SethMMorton/natsort/issues/44 - https://bugs.python.org/issue2481 - https://bugs.python.org/issue23195 - - http://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help - - http://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation - - http://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm - - http://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators + - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help + - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation + - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm + - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators + - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python These can be summed up as follows: @@ -787,6 +789,84 @@ the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`. Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just a matter of turning on both functions. +Basic Unicode Support ++++++++++++++++++++++ + +Unicode is hard and complicated. Here's an example. + +.. code-block:: python + + >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a'] + >>> a = [x.decode('utf8') for x in b] + >>> a # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(a) # doctest: +SKIP + ['a', 'e', 'é', 'f', 'z', 'é'] + + +There are more than one way to represent the character 'é' in Unicode. +In fact, many characters have multiple representations. 
This is a challenge +because comparing the two representations would return ``False`` even though +they *look* the same. + +.. code-block:: python + + >>> a[2] == a[3] + False + +Alas, since characters are compared based on the numerical value of their +representation, sorting Unicode often gives unexpected results (like seeing +'é' come both *before* and *after* 'z'). + +The original approach that :mod:`natsort` took with respect to non-ASCII +Unicode characters was to say "just use +the :mod:`locale` or :mod:`PyICU` library" and then cross its fingers +and hope those libraries take care of it. As you will find in the following +sections, that comes with its own baggage, and turned out to not always work anyway +(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to +handle the Unicode out-of-the-box without invoking a heavy-handed library +like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*. + +To fully understand Unicode normalization, `check out some official Unicode documentation`_. +Just kidding... that's too much text. The following StackOverflow answers do +a good job of explaining Unicode normalization in simple terms: +https://stackoverflow.com/a/7934397/1399279 and +https://stackoverflow.com/a/7931547/1399279. Put simply, normalization +ensures that Unicode characters with multiple representations are in +some canonical and consistent representation so that (for example) comparisons +of the characters can be performed in a sane way. The following discussion +assumes you have at least read the StackOverflow answers. + +Looking back at our 'é' example, we can see that the two versions were +constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``. +The former representation is actually +`LATIN SMALL LETTER E WITH ACUTE `_ +and is a single character in the Unicode standard. This is known as the +*compressed form* (officially, the *composed* form) and corresponds to the 'NFC' normalization scheme. 
+The latter representation is actually the letter 'e' followed by +`COMBINING ACUTE ACCENT `_ +and so is two characters in the Unicode standard. This is known as the +*decompressed form* and corresponds to the 'NFD' normalization scheme. +Since the first character in the decompressed form is actually the letter 'e', +when compared to other ASCII characters it fits where you might expect. +Unfortunately, all Unicode compressed form characters come after the +ASCII characters and so they always will be placed after 'z' when sorting. + +It seems that most Unicode data is stored and shared in the compressed form +which makes it challenging to sort. This can be solved by normalizing all +incoming Unicode data to the decompressed form ('NFD') and *then* sorting. + +.. code-block:: python + + >>> import unicodedata + >>> c = [unicodedata.normalize('NFD', x) for x in a] + >>> c # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(c) # doctest: +SKIP + ['a', 'e', 'é', 'é', 'f', 'z'] + +Huzzah! Sane sorting without having to resort to :mod:`locale`! + Using Locale to Compare Strings +++++++++++++++++++++++++++++++ @@ -1052,3 +1132,4 @@ what the rest of the world assumes. .. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36 .. _really good: https://hypothesis.readthedocs.io/en/latest/ .. _testing strategy: http://doc.pytest.org/en/latest/ +.. _check out some official Unicode documentation: http://unicode.org/reports/tr15/ From 059de483de650d91853de8103e31fae849e40493 Mon Sep 17 00:00:00 2001 From: Seth M Morton Date: Sat, 19 Aug 2017 00:22:53 -0700 Subject: [PATCH 3/3] Move input normalization to an earlier stage. The input normalization has been moved out of the "input_transform" function (which was called in the "parse_string" function) and now is the first step of the "parse_string" function. This is because the data needs to be normalized even if the "input_transform" function is skipped. 
Tests have been reworked to understand this change. --- natsort/utils.py | 33 +++++++++++------- test_natsort/slow_splitters.py | 2 ++ .../test_input_string_transform_factory.py | 34 +++++++------------ test_natsort/test_natsorted.py | 8 ++--- 4 files changed, 38 insertions(+), 39 deletions(-) diff --git a/natsort/utils.py b/natsort/utils.py index 28f1487d..c33de1dc 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -119,6 +119,22 @@ def _no_op(x): return x +def _normalize_input_factory(alg): + """Create a function that will normalize unicode input data.""" + normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' + + if NEWPY: + return partial(normalize, normalization_form) + else: + def func(x): + """Normalize unicode input.""" + if isinstance(x, py23_str): # unicode + return normalize(normalization_form, x) + else: + return x + return func + + def _natsort_key(val, key, string_func, bytes_func, num_func): """\ Key to sort strings and numbers naturally. @@ -209,11 +225,13 @@ def _parse_string_factory(alg, sep, splitter, # sometimes after. orig_after_xfrm = not (alg & ns._DUMB and alg & ns.LOCALEALPHA) original_func = input_transform if orig_after_xfrm else _no_op + normalize_input = _normalize_input_factory(alg) - def func(x, original_func=original_func): + def func(x): # Apply string input transformation function and return to x. # Original function is usually a no-op, but some algorithms require it # to also be the transformation function. + x = normalize_input(x) x, original = input_transform(x), original_func(x) x = splitter(x) # Split string into components. x = py23_filter(None, x) # Remove empty strings. @@ -268,20 +286,9 @@ def _input_string_transform_factory(alg): # Shortcuts. 
lowfirst = alg & ns.LOWERCASEFIRST dumb = alg & ns._DUMB - normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' - - if NEWPY: - careful_normalize = partial(normalize, normalization_form) - else: - def careful_normalize(x): - """Normalize unicode input.""" - if isinstance(x, py23_str): # unicode - return normalize(normalization_form, x) - else: - return x # Build the chain of functions to execute in order. - function_chain = [careful_normalize] + function_chain = [] if (dumb and not lowfirst) or (lowfirst and not dumb): function_chain.append(methodcaller('swapcase')) diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py index 8a98948c..aef329ed 100644 --- a/test_natsort/slow_splitters.py +++ b/test_natsort/slow_splitters.py @@ -19,6 +19,7 @@ def int_splitter(iterable, signed, sep): """Alternate (slow) method to split a string into numbers.""" + iterable = unicodedata.normalize('NFD', iterable) split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit()) split_by_digits = refine_split_grouping(split_by_digits) split = int_splitter_iter(split_by_digits, signed) @@ -32,6 +33,7 @@ def float_splitter(iterable, signed, exp, sep): def number_tester(x): return x.isdigit() or unicodedata.numeric(x, None) is not None + iterable = unicodedata.normalize('NFD', iterable) split_by_digits = itertools.groupby(iterable, number_tester) split_by_digits = peekable(refine_split_grouping(split_by_digits)) split = float_splitter_iter(split_by_digits, signed, exp) diff --git a/test_natsort/test_input_string_transform_factory.py b/test_natsort/test_input_string_transform_factory.py index 97acf216..3dbd8433 100644 --- a/test_natsort/test_input_string_transform_factory.py +++ b/test_natsort/test_input_string_transform_factory.py @@ -5,7 +5,6 @@ import pytest import locale from operator import methodcaller -from unicodedata import normalize from natsort.ns_enum import ns from natsort.utils import _input_string_transform_factory from 
natsort.compat.py23 import NEWPY @@ -29,22 +28,12 @@ def test_input_string_transform_factory_is_no_op_for_no_alg_options_examples(): x = 'feijGGAd' - assert _input_string_transform_factory(0)(x) == x + assert _input_string_transform_factory(0)(x) is x @given(text()) -def test_input_string_transform_factory_is_no_op_for_no_alg_options_except_normalization(x): - assert _input_string_transform_factory(0)(x) == normalize('NFD', x) - - -def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE_examples(): - x = '⑦' - assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == '7' - - -@given(text()) -def test_input_string_transform_factory_performs_compatibility_normalization_with_COMPATIBILITYNORMALIZE(x): - assert _input_string_transform_factory(ns.COMPATIBILITYNORMALIZE)(x) == normalize('NFKD', x) +def test_input_string_transform_factory_is_no_op_for_no_alg_options(x): + assert _input_string_transform_factory(0)(x) is x def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_examples(): @@ -58,9 +47,9 @@ def test_input_string_transform_factory_performs_casefold_with_IGNORECASE_exampl @given(text()) def test_input_string_transform_factory_performs_casefold_with_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).casefold() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE)(x) == normalize('NFD', x).lower() + assert _input_string_transform_factory(ns.IGNORECASE)(x) == x.lower() def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @@ -70,7 +59,7 @@ def test_input_string_transform_factory_performs_swapcase_with_DUMB_examples(): @given(text()) def test_input_string_transform_factory_performs_swapcase_with_DUMB(x): - assert _input_string_transform_factory(ns._DUMB)(x) == normalize('NFD', x).swapcase() + assert 
_input_string_transform_factory(ns._DUMB)(x) == x.swapcase() def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_example(): @@ -80,17 +69,18 @@ def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST_ex @given(text()) def test_input_string_transform_factory_performs_swapcase_with_LOWERCASEFIRST(x): - assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase() + x = 'feijGGAd' + assert _input_string_transform_factory(ns.LOWERCASEFIRST)(x) == x.swapcase() def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB_example(): x = 'feijGGAd' - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == x + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x @given(text()) def test_input_string_transform_factory_is_no_op_with_both_LOWERCASEFIRST_AND_DUMB(x): - assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) == normalize('NFD', x) + assert _input_string_transform_factory(ns._DUMB | ns.LOWERCASEFIRST)(x) is x def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE_example(): @@ -104,9 +94,9 @@ def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWE @given(text()) def test_input_string_transform_factory_performs_swapcase_and_casefold_both_LOWERCASEFIRST_AND_IGNORECASE(x): if NEWPY: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().casefold() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().casefold() else: - assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == normalize('NFD', x).swapcase().lower() + assert _input_string_transform_factory(ns.IGNORECASE | ns.LOWERCASEFIRST)(x) == x.swapcase().lower() def test_input_string_transform_factory_removes_thousands_separator_with_LOCALE_example(): diff --git 
a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 388e209f..fcbf75b6 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -251,16 +251,16 @@ def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_er def test_natsorted_with_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) def test_natsorted_with_PATH_and_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str(''))