wip Fully implement the UAX #15 quick-check algorithm.

TODO:
- news etc.?
- test somehow? at least make sure semantic tests are adequate
- that "older version" path... shouldn't it be MAYBE?
- mention explicitly in commit message that *this* is the actual algorithm from UAX #15
- think if there are counter-cases where this is slower. If caller treats MAYBE same as NO... e.g. if caller actually just wants to normalize? May need to parametrize and offer both behaviors.

This lets us return a NO answer instead of MAYBE when that's what a Quick_Check property tells us; or also when that's what the canonical combining classes tell us, after a Quick_Check property has said "maybe".

At a quick test on my laptop, the existing code takes about 6.7 ms/MB (so 6.7 ns per byte) when the quick check returns MAYBE and it has to do the slow comparison:

$ ./python -m timeit -s 'import unicodedata; s = "\uf900"*500000' -- \
    'unicodedata.is_normalized("NFD", s)'
50 loops, best of 5: 6.67 msec per loop

With this patch, it gets the answer instantly (78 ns) on the same 1 MB string:

$ ./python -m timeit -s 'import unicodedata; s = "\uf900"*500000' -- \
    'unicodedata.is_normalized("NFD", s)'
5000000 loops, best of 5: 78 nsec per loop
gnprice · Aug 28, 2019 · 322e702 · 322e702
1 parent bb1dea2
commit 322e702
Showing 1 changed file with 22 additions and 15 deletions.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -775,29 +775,31 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
     return result;
 }
 
-typedef enum {YES, NO, MAYBE} NormalMode;
+typedef enum {YES, MAYBE, NO} NormalMode;
 
 /* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
 static NormalMode
 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 {
-    Py_ssize_t i, len;
-    int kind;
-    void *data;
-    unsigned char prev_combining = 0, quickcheck_mask;
+    /* This is an implementation of the following algorithm:
+       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
+       See there for background.
+    */
 
     /* An older version of the database is requested, quickchecks must be
        disabled. */
     if (self && UCD_Check(self))
         return NO;
 
-    /* This is an implementation of the following algorithm:
-       https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
-       See there for background.
-    */
+    Py_ssize_t i, len;
+    int kind;
+    void *data;
+    unsigned char prev_combining = 0;
 
     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No. */
-    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+    int quickcheck_mask_shift = ((nfc ? 4 : 0) + (k ? 2 : 0));
+
+    NormalMode result = YES; /* certainly normalized, unless we find something */
 
     i = 0;
     kind = PyUnicode_KIND(input);
@@ -806,16 +808,21 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
     while (i < len) {
         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
-        unsigned char combining = record->combining;
-        unsigned char quickcheck = record->normalization_quick_check;
 
-        if (quickcheck & quickcheck_mask)
-            return MAYBE; /* this string might need normalization */
+        unsigned char combining = record->combining;
         if (combining && prev_combining > combining)
             return NO; /* non-canonical sort order, not normalized */
         prev_combining = combining;
+
+        unsigned char quickcheck = record->normalization_quick_check;
+        switch ((quickcheck >> quickcheck_mask_shift) & 3) {
+        case NO:
+          return NO;
+        case MAYBE:
+          result = MAYBE; /* this string might need normalization */
+        }
     }
-    return YES; /* certainly normalized */
+    return result;
 }
 
 /*[clinic input]