Merge pull request #45 from SethMMorton/unicode-normalization

Add normalization to incoming Unicode data
SethMMorton · Aug 19, 2017 · 11b7f8d · 11b7f8d
2 parents c2f4b5d + 059de48
commit 11b7f8d
Show file tree

Hide file tree

Showing 7 changed files with 141 additions and 26 deletions.
diff --git a/README.rst b/README.rst
@@ -234,6 +234,7 @@ Other Useful Things
 +++++++++++++++++++
 
  - recursively descend into lists of lists
+ - automatic unicode normalization of input data
  - `controlling the case-sensitivity <http://natsort.readthedocs.io/en/stable/examples.html#case-sort>`_
  - `sorting file paths correctly <http://natsort.readthedocs.io/en/stable/examples.html#path-sort>`_
  - `allow custom sorting keys <http://natsort.readthedocs.io/en/stable/examples.html#custom-sort>`_

diff --git a/docs/source/howitworks.rst b/docs/source/howitworks.rst
@@ -655,12 +655,14 @@ StdLib there can't be too many dragons, right?
     - https://github.com/SethMMorton/natsort/issues/22
     - https://github.com/SethMMorton/natsort/issues/23
     - https://github.com/SethMMorton/natsort/issues/36
+    - https://github.com/SethMMorton/natsort/issues/44
     - https://bugs.python.org/issue2481
     - https://bugs.python.org/issue23195
-    - http://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help
-    - http://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation
-    - http://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm
-    - http://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators
+    - https://stackoverflow.com/questions/3412933/python-not-sorting-unicode-properly-strcoll-doesnt-help
+    - https://stackoverflow.com/questions/22203550/sort-dictionary-by-key-using-locale-collation
+    - https://stackoverflow.com/questions/33459384/unicode-character-not-in-range-when-calling-locale-strxfrm
+    - https://stackoverflow.com/questions/36431810/sort-numeric-lines-with-thousand-separators
+    - https://stackoverflow.com/questions/45734562/how-can-i-get-a-reasonable-string-sorting-with-python
 
 These can be summed up as follows:
 
@@ -787,6 +789,84 @@ the ``else:`` block of :func:`coerce_to_int`/:func:`coerce_to_float`.
 Of course, applying both *LOWERCASEFIRST* and *GROUPLETTERS* is just
 a matter of turning on both functions.
 
+Basic Unicode Support
++++++++++++++++++++++
+
+Unicode is hard and complicated. Here's an example.
+
+.. code-block:: python
+
+    >>> b = [b'\x66', b'\x65', b'\xc3\xa9', b'\x65\xcc\x81', b'\x61', b'\x7a']
+    >>> a = [x.decode('utf8') for x in b]
+    >>> a  # doctest: +SKIP
+    ['f', 'e', 'é', 'é', 'a', 'z']
+    >>> sorted(a)  # doctest: +SKIP
+    ['a', 'e', 'é', 'f', 'z', 'é']
+
+
+There are more than one way to represent the character 'é' in Unicode.
+In fact, many characters have multiple representations. This is a challenge
+because comparing the two representations would return ``False`` even though
+they *look* the same.
+
+.. code-block:: python
+
+    >>> a[2] == a[3]
+    False
+
+Alas, since characters are compared based on the numerical value of their
+representation, sorting Unicode often gives unexpected results (like seeing
+'é' come both *before* and *after* 'z').
+
+The original approach that :mod:`natsort` took with respect to non-ASCII
+Unicode characters was to say "just use
+the :mod:`locale` or :mod:`PyICU` library" and then cross it's fingers
+and hope those libraries take care of it. As you will find in the following
+sections, that comes with its own baggage, and turned out to not always work anyway
+(see https://stackoverflow.com/q/45734562/1399279). A more robust approach is to
+handle the Unicode out-of-the-box without invoking a heavy-handed library
+like :mod:`locale` or :mod:`PyICU`. To do this, we must use *normalization*.
+
+To fully understand Unicode normalization, `check out some official Unicode documentation`_.
+Just kidding... that's too much text. The following StackOverflow answers do
+a good job at explaining Unicode normalization in simple terms:
+https://stackoverflow.com/a/7934397/1399279 and
+https://stackoverflow.com/a/7931547/1399279. Put simply, normalization
+ensures that Unicode characters with multiple representations are in
+some canonical and consistent representation so that (for example) comparisons
+of the characters can be performed in a sane way. The following discussion
+assumes you at least read the StackOverflow answers.
+
+Looking back at our 'é' example, we can see that the two versions were
+constructed with the byte strings ``b'\xc3\xa9'`` and ``b'\x65\xcc\x81'``.
+The former representation is actually
+`LATIN SMALL LETTER E WITH ACUTE <http://www.fileformat.info/info/unicode/char/e9/index.htm>`_
+and is a single character in the Unicode standard. This is known as the
+*compressed form* and corresponds to the 'NFC' normalization scheme.
+The latter representation is actually the letter 'e' followed by
+`COMBINING ACUTE ACCENT <http://www.fileformat.info/info/unicode/char/0301/index.htm>`_
+and so is two characters in the Unicode standard. This is known as the
+*decompressed form* and corresponds to the 'NFD' normalization scheme.
+Since the first character in the decompressed form is actually the letter 'e',
+when compared to other ASCII characters it fits where you might expect.
+Unfortunately, all Unicode compressed form characters come after the
+ASCII characters and so they always will be placed after 'z' when sorting.
+
+It seems that most Unicode data is stored and shared in the compressed form
+which makes it challenging to sort. This can be solved by normalizing all
+incoming Unicode data to the decompressed form ('NFD') and *then* sorting.
+
+.. code-block:: python
+
+    >>> import unicodedata
+    >>> c = [unicodedata.normalize('NFD', x) for x in a]
+    >>> c  # doctest: +SKIP
+    ['f', 'e', 'é', 'é', 'a', 'z']
+    >>> sorted(c)  # doctest: +SKIP
+    ['a', 'e', 'é', 'é', 'f', 'z']
+
+Huzzah! Sane sorting without having to resort to :mod:`locale`!
+
 Using Locale to Compare Strings
 +++++++++++++++++++++++++++++++
 
@@ -1052,3 +1132,4 @@ what the rest of the world assumes.
 .. _Thousands separator support: https://github.com/SethMMorton/natsort/issues/36
 .. _really good: https://hypothesis.readthedocs.io/en/latest/
 .. _testing strategy: http://doc.pytest.org/en/latest/
+.. _check out some official Unicode documentation: http://unicode.org/reports/tr15/
diff --git a/natsort/ns_enum.py b/natsort/ns_enum.py
@@ -39,7 +39,7 @@ class ns(object):
         This is a shortcut for ``ns.FLOAT | ns.SIGNED``, which is useful
         when attempting to sort real numbers.
     NOEXP, N
-        Tell `natsort` to not search for exponents as part of the number.
+        Tell `natsort` to not search for exponents as part of a float number.
         For example, with `NOEXP` the number "5.6E5" would be interpreted
         as `5.6`, `"E"`, and `5` instead of `560000`.
     PATH, P
@@ -51,6 +51,13 @@ class ns(object):
         sorted properly; 'Folder/' will be placed at the end, not at the
         front. It is the same as setting the old `as_path` option to
         `True`.
+    COMPATIBILITYNORMALIZE, CN
+        Use the "NFKD" unicode normalization form on input rather than the
+        default "NFD". This will transform characters such as '⑦' into
+        '7'. Please see https://stackoverflow.com/a/7934397/1399279,
+        https://stackoverflow.com/a/7931547/1399279,
+        and http://unicode.org/reports/tr15/ full details into unicode
+        normalization.
     LOCALE, L
         Tell `natsort` to be locale-aware when sorting. This includes both
         proper sorting of alphabetical characters as well as proper
@@ -129,20 +136,21 @@ class ns(object):
 
     # The below are options. The values are stored as powers of two
     # so bitmasks can be used to extract the user's requested options.
-    FLOAT            = F  = 1 << 0
-    SIGNED           = S  = 1 << 1
-    REAL             = R  = FLOAT | SIGNED
-    NOEXP            = N  = 1 << 2
-    PATH             = P  = 1 << 3
-    LOCALEALPHA      = LA = 1 << 4
-    LOCALENUM        = LN = 1 << 5
-    LOCALE           = L  = LOCALEALPHA | LOCALENUM
-    IGNORECASE       = IC = 1 << 6
-    LOWERCASEFIRST   = LF = 1 << 7
-    GROUPLETTERS     = G  = 1 << 8
-    UNGROUPLETTERS   = UG = 1 << 9
-    CAPITALFIRST     = C  = UNGROUPLETTERS
-    NANLAST          = NL = 1 << 10
+    FLOAT                  = F  = 1 << 0
+    SIGNED                 = S  = 1 << 1
+    REAL                   = R  = FLOAT | SIGNED
+    NOEXP                  = N  = 1 << 2
+    PATH                   = P  = 1 << 3
+    LOCALEALPHA            = LA = 1 << 4
+    LOCALENUM              = LN = 1 << 5
+    LOCALE                 = L  = LOCALEALPHA | LOCALENUM
+    IGNORECASE             = IC = 1 << 6
+    LOWERCASEFIRST         = LF = 1 << 7
+    GROUPLETTERS           = G  = 1 << 8
+    UNGROUPLETTERS         = UG = 1 << 9
+    CAPITALFIRST           = C  = UNGROUPLETTERS
+    NANLAST                = NL = 1 << 10
+    COMPATIBILITYNORMALIZE = CN = 1 << 11
 
     # The below are private options for internal use only.
     _NUMERIC_ONLY    = REAL | NOEXP

diff --git a/natsort/utils.py b/natsort/utils.py
@@ -54,6 +54,7 @@
 from collections import deque
 from functools import partial, reduce
 from operator import methodcaller
+from unicodedata import normalize
 
 # Local imports.
 from natsort.ns_enum import ns
@@ -118,6 +119,22 @@ def _no_op(x):
     return x
 
 
+def _normalize_input_factory(alg):
+    """Create a function that will normalize unicode input data."""
+    normalization_form = 'NFKD' if alg & ns.COMPATIBILITYNORMALIZE else 'NFD'
+
+    if NEWPY:
+        return partial(normalize, normalization_form)
+    else:
+        def func(x):
+            """Normalize unicode input."""
+            if isinstance(x, py23_str):  # unicode
+                return normalize(normalization_form, x)
+            else:
+                return x
+        return func
+
+
 def _natsort_key(val, key, string_func, bytes_func, num_func):
     """\
     Key to sort strings and numbers naturally.
@@ -208,11 +225,13 @@ def _parse_string_factory(alg, sep, splitter,
     # sometimes after.
     orig_after_xfrm = not (alg & ns._DUMB and alg & ns.LOCALEALPHA)
     original_func = input_transform if orig_after_xfrm else _no_op
+    normalize_input = _normalize_input_factory(alg)
 
-    def func(x, original_func=original_func):
+    def func(x):
         # Apply string input transformation function and return to x.
         # Original function is usually a no-op, but some algorithms require it
         # to also be the transformation function.
+        x = normalize_input(x)
         x, original = input_transform(x), original_func(x)
         x = splitter(x)                       # Split string into components.
         x = py23_filter(None, x)              # Remove empty strings.
@@ -272,6 +291,7 @@ def _input_string_transform_factory(alg):
     function_chain = []
     if (dumb and not lowfirst) or (lowfirst and not dumb):
         function_chain.append(methodcaller('swapcase'))
+
     if alg & ns.IGNORECASE:
         if NEWPY:
             function_chain.append(methodcaller('casefold'))

diff --git a/test_natsort/slow_splitters.py b/test_natsort/slow_splitters.py
@@ -19,6 +19,7 @@
 
 def int_splitter(iterable, signed, sep):
     """Alternate (slow) method to split a string into numbers."""
+    iterable = unicodedata.normalize('NFD', iterable)
     split_by_digits = itertools.groupby(iterable, lambda a: a.isdigit())
     split_by_digits = refine_split_grouping(split_by_digits)
     split = int_splitter_iter(split_by_digits, signed)
@@ -32,6 +33,7 @@ def float_splitter(iterable, signed, exp, sep):
     def number_tester(x):
         return x.isdigit() or unicodedata.numeric(x, None) is not None
 
+    iterable = unicodedata.normalize('NFD', iterable)
     split_by_digits = itertools.groupby(iterable, number_tester)
     split_by_digits = peekable(refine_split_grouping(split_by_digits))
     split = float_splitter_iter(split_by_digits, signed, exp)

diff --git a/test_natsort/test_natsorted.py b/test_natsort/test_natsorted.py
@@ -80,8 +80,10 @@ def test_natsorted_returns_sorted_list_with_mixed_type_input_and_does_not_raise_
 
 
 def test_natsorted_with_mixed_input_returns_sorted_results_without_error():
+    a = ['0', 'Á', '2', 'Z']
+    assert natsorted(a) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
-    assert natsorted(a) == [1.5, '2', 3, 'b', 'ä']
+    assert natsorted(a) == [1.5, '2', 3, 'ä', 'b']
 
 
 def test_natsorted_with_nan_input_returns_sorted_results_with_nan_last_with_NANLAST():
@@ -240,7 +242,7 @@ def test_natsorted_with_LOCALE_and_de_setting_returns_results_sorted_by_de_langu
 def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_error():
     load_locale('en_US')
     a = ['0', 'Á', '2', 'Z']
-    assert natsorted(a) == ['0', '2', 'Z', 'Á']
+    assert natsorted(a, alg=ns.LOCALE) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
     assert natsorted(a, alg=ns.LOCALE) == [1.5, '2', 3, 'ä', 'b']
     locale.setlocale(locale.LC_ALL, str(''))
@@ -249,16 +251,16 @@ def test_natsorted_with_LOCALE_and_mixed_input_returns_sorted_results_without_er
 def test_natsorted_with_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error():
     load_locale('en_US')
     a = ['0', 'Á', '2', 'Z']
-    assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á']
+    assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
-    assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä']
+    assert natsorted(a, alg=ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b']
     locale.setlocale(locale.LC_ALL, str(''))
 
 
 def test_natsorted_with_PATH_and_LOCALE_and_UNGROUPLETTERS_and_mixed_input_returns_sorted_results_without_error():
     load_locale('en_US')
     a = ['0', 'Á', '2', 'Z']
-    assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Z', 'Á']
+    assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == ['0', '2', 'Á', 'Z']
     a = ['2', 'ä', 'b', 1.5, 3]
-    assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'b', 'ä']
+    assert natsorted(a, alg=ns.PATH | ns.LOCALE | ns.UNGROUPLETTERS) == [1.5, '2', 3, 'ä', 'b']
     locale.setlocale(locale.LC_ALL, str(''))
diff --git a/test_natsort/test_utils.py b/test_natsort/test_utils.py
@@ -149,6 +149,7 @@ def test_ns_enum_values_have_are_as_expected():
     assert ns.CAPITALFIRST == ns.C
     assert ns.UNGROUPLETTERS == ns.CAPITALFIRST
     assert ns.NANLAST == ns.NL
+    assert ns.COMPATIBILITYNORMALIZE == ns.CN
 
     # Convenience
     assert ns.LOCALE == ns.LOCALEALPHA | ns.LOCALENUM