diff --git a/README.rst b/README.rst index 3a283f23..61ddb963 100644 --- a/README.rst +++ b/README.rst @@ -234,6 +234,7 @@ Other Useful Things +++++++++++++++++++ - recursively descend into lists of lists + - automatic unicode normalization of input data - `controlling the case-sensitivity `_ - `sorting file paths correctly `_ - `allow custom sorting keys `_ diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst index f15db8fe..22de2a52 100644 --- a/docs/source/howitworks.rst +++ b/docs/source/howitworks.rst @@ -655,12 +655,14 @@ StdLib there can't be too many dragons, right? - https://github.com/SethMMorton/natsort/issues/22 - https://github.com/SethMMorton/natsort/issues/23 - https://github.com/SethMMorton/natsort/issues/36 + - https://github.com/SethMMorton/natsort/issues/44 - https://bugs.python.org/issue2481 - https://bugs.python.org/issue23195 - - http://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help - - http://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation - - http://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm - - http://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators + - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help + - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation + - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm + - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators + - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python These can be summed up as follows: @@ -787,6 +789,84 @@ the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`. Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just a matter of turning on both functions. +Basic Unicode Support ++++++++++++++++++++++ + +Unicode is hard and complicated. Here's an example. + +.. code-block:: python + + >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a'] + >>> a = [x.decode('utf8') for x in b] + >>> a # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(a) # doctest: +SKIP + ['a', 'e', 'é', 'f', 'z', 'é'] + + +There are more than one way to represent the character 'é' in Unicode. +In fact, many characters have multiple representations. This is a challenge +because comparing the two representations would return ``False`` even though +they *look* the same. + +.. code-block:: python + + >>> a[2] == a[3] + False + +Alas, since characters are compared based on the numerical value of their +representation, sorting Unicode often gives unexpected results (like seeing +'é' come both *before* and *after* 'z'). + +The original approach that :mod:`natsort` took with respect to non-ASCII +Unicode characters was to say "just use +the :mod:`locale` or :mod:`PyICU` library" and then cross it's fingers +and hope those libraries take care of it. As you will find in the following +sections, that comes with its own baggage, and turned out to not always work anyway +(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to +handle the Unicode out-of-the-box without invoking a heavy-handed library +like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*. + +To fully understand Unicode normalization, `check out some official Unicode documentation`_. +Just kidding... that's too much text. The following StackOverflow answers do +a good job at explaining Unicode normalization in simple terms: +https://stackoverflow.com/a/7934397/1399279 and +https://stackoverflow.com/a/7931547/1399279. Put simply, normalization +ensures that Unicode characters with multiple representations are in +some canonical and consistent representation so that (for example) comparisons +of the characters can be performed in a sane way. The following discussion +assumes you at least read the StackOverflow answers. + +Looking back at our 'é' example, we can see that the two versions were +constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``. +The former representation is actually +`LATIN SMALL LETTER E WITH ACUTE `_ +and is a single character in the Unicode standard. This is known as the +*compressed form* and corresponds to the 'NFC' normalization scheme. +The latter representation is actually the letter 'e' followed by +`COMBINING ACUTE ACCENT `_ +and so is two characters in the Unicode standard. This is known as the +*decompressed form* and corresponds to the 'NFD' normalization scheme. +Since the first character in the decompressed form is actually the letter 'e', +when compared to other ASCII characters it fits where you might expect. +Unfortunately, all Unicode compressed form characters come after the +ASCII characters and so they always will be placed after 'z' when sorting. + +It seems that most Unicode data is stored and shared in the compressed form +which makes it challenging to sort. This can be solved by normalizing all +incoming Unicode data to the decompressed form ('NFD') and *then* sorting. + +.. code-block:: python + + >>> import unicodedata + >>> c = [unicodedata.normalize('NFD', x) for x in a] + >>> c # doctest: +SKIP + ['f', 'e', 'é', 'é', 'a', 'z'] + >>> sorted(c) # doctest: +SKIP + ['a', 'e', 'é', 'é', 'f', 'z'] + +Huzzah! Sane sorting without having to resort to :mod:`locale`! + Using Locale to Compare Strings +++++++++++++++++++++++++++++++ @@ -1052,3 +1132,4 @@ what the rest of the world assumes. .. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36 .. _really good: https://hypothesis.readthedocs.io/en/latest/ .. _testing strategy: http://doc.pytest.org/en/latest/ +.. _check out some official Unicode documentation: http://unicode.org/reports/tr15/ diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py index e5ffbf56..37a00deb 100644 --- a/natsort/ns_enum.py +++ b/natsort/ns_enum.py @@ -39,7 +39,7 @@ class ns(object): This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful when attempting to sort real numbers. NOEXP, N - Tell `natsort` to not search for exponents as part of the number. + Tell `natsort` to not search for exponents as part of a float number. For example, with `NOEXP` the number "5.6E5" would be interpreted as `5.6`, `"E"`, and `5` instead of `560000`. PATH, P @@ -51,6 +51,13 @@ class ns(object): sorted properly; 'Folder/' will be placed at the end, not at the front. It is the same as setting the old `as_path` option to `True`. + COMPATIBILITYNORMALIZE, CN + Use the "NFKD" unicode normalization form on input rather than the + default "NFD". This will transform characters such as '⑦' into + '7'. Please see https://stackoverflow.com/a/7934397/1399279, + https://stackoverflow.com/a/7931547/1399279, + and http://unicode.org/reports/tr15/ full details into unicode + normalization. LOCALE, L Tell `natsort` to be locale-aware when sorting. This includes both proper sorting of alphabetical characters as well as proper @@ -129,20 +136,21 @@ class ns(object): # The below are options. The values are stored as powers of two # so bitmasks can be used to extract the user's requested options. - FLOAT = F = 1 << 0 - SIGNED = S = 1 << 1 - REAL = R = FLOAT | SIGNED - NOEXP = N = 1 << 2 - PATH = P = 1 << 3 - LOCALEALPHA = LA = 1 << 4 - LOCALENUM = LN = 1 << 5 - LOCALE = L = LOCALEALPHA | LOCALENUM - IGNORECASE = IC = 1 << 6 - LOWERCASEFIRST = LF = 1 << 7 - GROUPLETTERS = G = 1 << 8 - UNGROUPLETTERS = UG = 1 << 9 - CAPITALFIRST = C = UNGROUPLETTERS - NANLAST = NL = 1 << 10 + FLOAT = F = 1 << 0 + SIGNED = S = 1 << 1 + REAL = R = FLOAT | SIGNED + NOEXP = N = 1 << 2 + PATH = P = 1 << 3 + LOCALEALPHA = LA = 1 << 4 + LOCALENUM = LN = 1 << 5 + LOCALE = L = LOCALEALPHA | LOCALENUM + IGNORECASE = IC = 1 << 6 + LOWERCASEFIRST = LF = 1 << 7 + GROUPLETTERS = G = 1 << 8 + UNGROUPLETTERS = UG = 1 << 9 + CAPITALFIRST = C = UNGROUPLETTERS + NANLAST = NL = 1 << 10 + COMPATIBILITYNORMALIZE = CN = 1 << 11 # The below are private options for internal use only. _NUMERIC_ONLY = REAL | NOEXP diff --git a/natsort/utils.py b/natsort/utils.py index c21d3b40..c33de1dc 100644 --- a/natsort/utils.py +++ b/natsort/utils.py @@ -54,6 +54,7 @@ from collections import deque from functools import partial, reduce from operator import methodcaller +from unicodedata import normalize # Local imports. from natsort.ns_enum import ns @@ -118,6 +119,22 @@ def _no_op(x): return x +def _normalize_input_factory(alg): + """Create a function that will normalize unicode input data.""" + normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD' + + if NEWPY: + return partial(normalize, normalization_form) + else: + def func(x): + """Normalize unicode input.""" + if isinstance(x, py23_str): # unicode + return normalize(normalization_form, x) + else: + return x + return func + + def _natsort_key(val, key, string_func, bytes_func, num_func): """\ Key to sort strings and numbers naturally. @@ -208,11 +225,13 @@ def _parse_string_factory(alg, sep, splitter, # sometimes after. orig_after_xfrm = not (alg & ns._DUMB and alg & ns.LOCALEALPHA) original_func = input_transform if orig_after_xfrm else _no_op + normalize_input = _normalize_input_factory(alg) - def func(x, original_func=original_func): + def func(x): # Apply string input transformation function and return to x. # Original function is usually a no-op, but some algorithms require it # to also be the transformation function. + x = normalize_input(x) x, original = input_transform(x), original_func(x) x = splitter(x) # Split string into components. x = py23_filter(None, x) # Remove empty strings. @@ -272,6 +291,7 @@ def _input_string_transform_factory(alg): function_chain = [] if (dumb and not lowfirst) or (lowfirst and not dumb): function_chain.append(methodcaller('swapcase')) + if alg & ns.IGNORECASE: if NEWPY: function_chain.append(methodcaller('casefold')) diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py index 8a98948c..aef329ed 100644 --- a/test_natsort/slow_splitters.py +++ b/test_natsort/slow_splitters.py @@ -19,6 +19,7 @@ def int_splitter(iterable, signed, sep): """Alternate (slow) method to split a string into numbers.""" + iterable = unicodedata.normalize('NFD', iterable) split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit()) split_by_digits = refine_split_grouping(split_by_digits) split = int_splitter_iter(split_by_digits, signed) @@ -32,6 +33,7 @@ def float_splitter(iterable, signed, exp, sep): def number_tester(x): return x.isdigit() or unicodedata.numeric(x, None) is not None + iterable = unicodedata.normalize('NFD', iterable) split_by_digits = itertools.groupby(iterable, number_tester) split_by_digits = peekable(refine_split_grouping(split_by_digits)) split = float_splitter_iter(split_by_digits, signed, exp) diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py index 146997a7..fcbf75b6 100644 --- a/test_natsort/test_natsorted.py +++ b/test_natsort/test_natsorted.py @@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_ def test_natsorted_with_mixed_input_returns_sorted_results_without_error(): + a = ['0', 'Á', '2', 'Z'] + assert natsorted(a) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a) == [1.5, '2', 3, 'ä', 'b'] def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST(): @@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) @@ -249,16 +251,16 @@ def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_er def test_natsorted_with_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) def test_natsorted_with_PATH_and_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error(): load_locale('en_US') a = ['0', 'Á', '2', 'Z'] - assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á'] + assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z'] a = ['2', 'ä', 'b', 1.5, 3] - assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä'] + assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b'] locale.setlocale(locale.LC_ALL, str('')) diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py index 934757a9..f1cffa20 100644 --- a/test_natsort/test_utils.py +++ b/test_natsort/test_utils.py @@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected(): assert ns.CAPITALFIRST == ns.C assert ns.UNGROUPLETTERS == ns.CAPITALFIRST assert ns.NANLAST == ns.NL + assert ns.COMPATIBILITYNORMALIZE == ns.CN # Convenience assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM