A failing test and a bit more clarity.

eggrobin · Feb 6, 2024 · 6b23406 · 6b23406
1 parent 04fb3b8
commit 6b23406
Show file tree

Hide file tree

Showing 3 changed files with 180 additions and 34 deletions.
diff --git a/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java b/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java
@@ -67,6 +67,8 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
                                             oldValue == null ? UTF16.valueOf(cp) : oldValue);
                     break;
                 case "Bidi_Paired_Bracket":
+                    // The default is <none> in PropertyValueAliases.txt, but TUP incorrectly
+                    // has it as U+0000.
                     prop = replaceValues(prop, oldValue -> oldValue == null ? "\u0000" : oldValue);
                     break;
                 case "FC_NFKC_Closure":
@@ -76,9 +78,6 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
                             replaceCpValues(
                                     prop, (cp, oldValue) -> fixFC_NFKC_Closure(cp, oldValue));
 
-                    break;
-                case "Jamo_Short_Name":
-                    prop = modifyJamo_Short_Name(prop);
                     break;
                 case "Name":
                     // TUP reports the special label <control-XXXX> as the value of the Name
@@ -315,11 +314,6 @@ private String fixFC_NFKC_Closure(int cp, String oldValue) {
         }
     }
 
-    // Jamo_Short_Name needs fix in IUP
-    private UnicodeProperty modifyJamo_Short_Name(UnicodeProperty prop) {
-        return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ᄋ', ""));
-    }
-
     /** Very useful. May already be in ICU, but not sure. */
     public boolean equalsString(int codepoint, String value) {
         return codepoint == value.codePointAt(0)

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -17,10 +17,13 @@
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.HashMap;
-import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 import java.util.TreeMap;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 import org.unicode.cldr.draft.FileUtilities;
 import org.unicode.cldr.util.Tabber;
 import org.unicode.cldr.util.Tabber.HTMLTabber;
@@ -549,12 +552,15 @@ static class FilterOrProp {
             enum Type {
                 filter,
                 prop,
-                stringprop
+                stringprop,
+                sequenceTransformation,
             };
 
             private Type type;
             private UnicodeProperty prop;
             private UnicodeSet filter;
+            private Function<List<String>, List<String>> sequenceTransformation;
+            private Function<List<String>, String> sequenceReduction;
         }
 
         private static final UnicodeSet PROPCHARS =
@@ -571,6 +577,87 @@ static UnicodeProperty of(
                     propOrFilter.filter = parseUnicodeSet(line, pp);
                     propOrFilter.type = FilterOrProp.Type.filter;
                     result.propOrFilters.add(propOrFilter);
+                } else if (line.charAt(pp.getIndex()) == '(') {
+                    final FilterOrProp propOrFilter = new FilterOrProp();
+                    final var matcher =
+                            Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*")
+                                    .matcher(line.substring(pp.getIndex()));
+                    if (matcher.matches()) {
+                        propOrFilter.type = FilterOrProp.Type.sequenceTransformation;
+                        final String expression = matcher.group(1);
+                        final String operation = matcher.group(2);
+                        final String args = matcher.group(3);
+                        switch (operation) {
+                            case "take":
+                                {
+                                    final int count = Integer.parseInt(args);
+                                    propOrFilter.sequenceTransformation = s -> s.subList(0, count);
+                                    break;
+                                }
+                            case "drop":
+                                {
+                                    final int count = Integer.parseInt(args);
+                                    propOrFilter.sequenceTransformation =
+                                            s -> s.subList(count, s.size());
+                                    break;
+                                }
+                            case "delete-adjacent-duplicates":
+                                {
+                                    propOrFilter.sequenceTransformation =
+                                            s -> {
+                                                if (s.isEmpty()) {
+                                                    return s;
+                                                }
+                                                int j = 0;
+                                                for (int i = 1; i < s.size(); ++i) {
+                                                    if (!Objects.equals(s.get(i), s.get(j))) {
+                                                        s.set(++j, s.get(i));
+                                                    }
+                                                }
+                                                s.subList(j + 1, s.size()).clear();
+                                                return s;
+                                            };
+                                    break;
+                                }
+                            case "prepend":
+                                {
+                                    propOrFilter.sequenceTransformation =
+                                            s -> {
+                                                s.add(0, args);
+                                                return s;
+                                            };
+                                    break;
+                                }
+                            case "append":
+                                {
+                                    propOrFilter.sequenceTransformation =
+                                            s -> {
+                                                s.add(args);
+                                                return s;
+                                            };
+                                    break;
+                                }
+                            case "string-join":
+                                {
+                                    propOrFilter.sequenceReduction = s -> String.join("", s);
+                                    break;
+                                }
+                            case "constant":
+                                {
+                                    propOrFilter.sequenceReduction = s -> args;
+                                    break;
+                                }
+                            default:
+                                throw new IllegalArgumentException(
+                                        "Unknown operation " + matcher.group(1));
+                        }
+                        result.propOrFilters.add(propOrFilter);
+                        pp.setIndex(pp.getIndex() + expression.length());
+                    } else {
+                        throw new IllegalArgumentException(
+                                "Expected (<operation> <args>), got "
+                                        + line.substring(pp.getIndex()));
+                    }
                 } else {
                     final String propName = scan(PROPCHARS, line, pp, true);
                     if (propName.length() > 0) {
@@ -583,9 +670,11 @@ static UnicodeProperty of(
                                     "Can't create property for: " + propName);
                         }
                         propOrFilter.type =
-                                propOrFilter.prop.getType() != UnicodeProperty.STRING
-                                        ? FilterOrProp.Type.prop
-                                        : FilterOrProp.Type.stringprop;
+                                propOrFilter.prop.getType() == UnicodeProperty.STRING
+                                                || propOrFilter.prop.getType()
+                                                        == UnicodeProperty.EXTENDED_STRING
+                                        ? FilterOrProp.Type.stringprop
+                                        : FilterOrProp.Type.prop;
                         result.propOrFilters.add(propOrFilter);
                     } else {
                         break;
@@ -629,13 +718,21 @@ protected List<String> _getNameAliases(List<String> result) {
         @Override
         protected String _getValue(int codepoint) {
             final StringBuffer buffer = new StringBuffer();
-            String value = UTF16.valueOf(codepoint);
+            String value = Character.toString(codepoint);
+            List<String> values = null;
             int cp;
 
             for (int i = propOrFilters.size() - 1; i >= 0; --i) {
                 final FilterOrProp propOrFilter = propOrFilters.get(i);
                 switch (propOrFilter.type) {
                     case filter:
+                        if (value == null) {
+                            throw new IllegalArgumentException(
+                                    "Cannot apply filter  "
+                                            + propOrFilter.filter.toString()
+                                            + " to sequence "
+                                            + values);
+                        }
                         buffer.setLength(0);
                         for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
                             cp = UTF16.charAt(value, j);
@@ -647,6 +744,13 @@ protected String _getValue(int codepoint) {
                         value = buffer.toString();
                         break;
                     case stringprop:
+                        if (value == null) {
+                            throw new IllegalArgumentException(
+                                    "Cannot apply string property "
+                                            + propOrFilter.prop.getName()
+                                            + " to sequence "
+                                            + values);
+                        }
                         buffer.setLength(0);
                         for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
                             cp = UTF16.charAt(value, j);
@@ -656,19 +760,53 @@ protected String _getValue(int codepoint) {
                         value = buffer.toString();
                         break;
                     case prop:
-                        final LinkedHashSet<String> values = new LinkedHashSet<String>();
+                        if (value == null) {
+                            throw new IllegalArgumentException(
+                                    "Cannot apply enumerated property "
+                                            + propOrFilter.prop.getName()
+                                            + " to sequence "
+                                            + values);
+                        }
+                        values = new ArrayList<>();
                         for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
                             cp = UTF16.charAt(value, j);
                             final String value2 = propOrFilter.prop.getValue(cp);
                             values.add(value2);
                         }
-                        if (values.size() == 0) {
-                            value = "";
-                        } else if (values.size() == 1) {
-                            value = values.iterator().next();
+                        value = null;
+                        break;
+                    case sequenceTransformation:
+                        final boolean wasString = value != null;
+                        if (wasString) {
+                            values =
+                                    value.codePoints()
+                                            .mapToObj(Character::toString)
+                                            .collect(
+                                                    Collectors.toCollection(
+                                                            () -> new ArrayList<>()));
+                            value = null;
+                        }
+                        if (propOrFilter.sequenceTransformation != null) {
+                            values = propOrFilter.sequenceTransformation.apply(values);
+                            if (wasString) {
+                                value = String.join("", values);
+                                values = null;
+                            }
                         } else {
-                            value = values.toString();
+                            value = propOrFilter.sequenceReduction.apply(values);
+                            values = null;
                         }
+                        break;
+                }
+            }
+            if (value == null) {
+                if (values.isEmpty()) {
+                    return "";
+                } else if (values.size() == 1) {
+                    return values.get(0);
+                } else {
+                    throw new IllegalArgumentException(
+                            "Compound property must return a string, not sequence " + values);
                 }
             }
             return value;

diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -53,25 +53,37 @@
 #
 # For each character in <unicodeSet>, verify that the result of applying the left <props>
 # is (=|≠) the result of applying the right <props>.
-#   <props> is of the form (<unicodeSet> | <prop>) ("*" (<unicodeSet> | <prop>))?
+#   <props> is of the form (<unicodeSet> | <prop>) ("*" (<unicodeSet> | <prop> | <sequenceTransformation>))?
 #   It is the functional composition of the properties applied to strings, whereby
 #   <unicodeSet> is used to filter the result.
 #   <prop> for a string property is applied to each character, and the result concatenated
 #       That is, cf("A1") is cf("A")+cf("1") = "a1"
-#   <prop> for an enumerated property, is applied to each character, and the result is a concatenated set.
-#       That is, gc("A1") is gc("A")+gc("1") = "Uppercase_LetterDecimal_Number"
+#   <prop> for an enumerated property is applied to each character, and the result is a sequence of
+#   strings.
+#       That is, gc("A1") is [gc("A"), gc("1")] = ["Uppercase_Letter", "Decimal_Number"]
+#   <sequenceTransformation> may be applied to a sequence of strings or to a string.  On a string it
+#   operates on the code points and returns a string.
+#       The available operations are:
+#           (append <string>)
+#           (prepend <string>)
+#           (take <n>)
+#           (drop <n>)
+#           (delete-adjacent-duplicates)
+#           (string-join)
+#           (constant <string>)
+#   The result of the <props> must be a string, a single-string sequence, or an empty sequence.
 #
 #   Example: for <props> of bc * \P{bc=NSM} * cf * dm, the result applied to Å (angstrom sign) are:
 #       bc * \P{bc=NSM} * cf * dm ("Å")
 #       bc * \P{bc=NSM} * cf ("A" + ring above)
 #       bc * \P{bc=NSM} ("a" + ring above)
 #       bc ("a")
-#       "Left"
+#       "Left_To_Right"
 #
-#   Example: In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
+#   Example: In \p{dt=canonical} (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
 #       This examines only those characters that have canonical decompositions. For each such character X
-#       it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class.
-#       It then tests that against the result of filtering out NSM characters from X, then getting the BIDI_Class.
+#       it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class,
+#       then deduplicates runs of the same Bidi_Class.
+#       It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class.
 #
 ##########################
 # EquivalencesOf <unicodeSet> <props> (⇐|⇔|⇒|⇍|⇎|⇏) <props>
@@ -241,13 +253,13 @@ Let $caseOverlap = [\u02B0-\u02B8\u02C0\u02C1\u02E0-\u02E4\u0345\u037A\u10FC\u1D
 
 # Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence.
 # This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered
-In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
+In \p{dt=canonical} (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
 
 # Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence.
 # This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered
 # There are 6 special cases, all symmetric symbols—which are not mirrored—with a solidus overlay:
 Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ]
-In [\p{dt=canonical}-$BMExclusions] Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM}
+In [\p{dt=canonical}-$BMExclusions] (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM}
 
 # Additional BIDI invariant constants
 Let $AL_blocks = [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF]
@@ -321,7 +333,7 @@ EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding
 
 # Stability: Once a character is assigned, both its Name and its Jamo_Short_Name will never change. 
 # Name is covered in Main policies
-# TODO: Short Name
+In \P{U-1:GC=Cn} Jamo_Short_Name=U-1:Jamo_Short_Name
 
 # Stability: The Noncharacter_Code_Point property is an immutable code point property, which means that its property values for all Unicode code points will never change.
 \p{NChar} = \p{U-1:NChar}
@@ -963,9 +975,7 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ
 
 \P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals
 [$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals]
-# TODO(egg): NFC_Quick_Check is a stupid way to get a Yes here; we are checking
-# that Equivalent_Unified_Ideograph values are single unified ideographs.
-In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unified_Ideograph = NFC_Quick_Check
+In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes)
 
 # InPC-InSC-gc invariants
 # See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.
@@ -977,4 +987,8 @@ In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unifie
 
 # Script Extensions (mostly testing the proper handling of multivalued properties).
 \p{sc=Deva} ⊂ \p{scx=Deva}
-[\p{scx=Deva} & \p{scx=Beng}] ⊃ []
+[\p{scx=Deva} & \p{scx=Beng}] ⊃ []
+
+# Hangul Syllable Name Generation,
+# https://www.unicode.org/versions/latest/ch03.pdf#G59675.
+In [\p{Block=Hangul Syllables} - \p{gc=Cn}] (prepend HANGUL SYLLABLE ) * (string-join) * Jamo_Short_Name * toNFD = Name