unicode-org · eggrobin · Feb 6, 2024 · Feb 2, 2024 · Feb 2, 2024 · Feb 2, 2024
diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java
@@ -10,7 +10,6 @@
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.Transform;
 import com.ibm.icu.text.Transliterator;
-import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.util.ICUException;
 import com.ibm.icu.util.VersionInfo;
@@ -535,28 +534,12 @@ private UnicodeMap<String> getCachedMap(UcdProperty prop2, String sourceFileName
 
     public static String getResolvedValue(
             IndexUnicodeProperties props, UcdProperty prop, String codepoint, String value) {
-        if (value == null && props != null) {
-            if (getResolvedDefaultValueType(prop) == DefaultValueType.CODE_POINT) {
-                return codepoint;
-            }
-        }
-        if (prop == UcdProperty.Name && value != null && value.endsWith("-#")) {
-            return value.substring(0, value.length() - 1) + Utility.hex(codepoint);
-        }
-        return value;
+        return props.getProperty(prop).getValue(codepoint.codePointAt(0));
     }
 
     public static String getResolvedValue(
             IndexUnicodeProperties props, UcdProperty prop, int codepoint, String value) {
-        if (value == null && props != null) {
-            if (getResolvedDefaultValueType(prop) == DefaultValueType.CODE_POINT) {
-                return UTF16.valueOf(codepoint);
-            }
-        }
-        if (prop == UcdProperty.Name && value != null && value.endsWith("-#")) {
-            return value.substring(0, value.length() - 1) + Utility.hex(codepoint);
-        }
-        return value;
+        return props.getProperty(prop).getValue(codepoint);
     }
 
     UnicodeMap<String> nameMap;
@@ -693,17 +676,66 @@ class IndexUnicodeProperty extends UnicodeProperty.BaseProperty {
             }
         }
 
+        @Override
+        public boolean isTrivial() {
+            return _getRawUnicodeMap().isEmpty()
+                    || _getRawUnicodeMap().keySet("").equals(UnicodeSet.ALL_CODE_POINTS);
+        }
+
+        @Override
         protected UnicodeMap<String> _getUnicodeMap() {
+            var raw = _getRawUnicodeMap();
+            if (prop == UcdProperty.Name
+                    || raw.containsValue("<code point>")
+                    || raw.containsValue("<codepoint>")) {
+                final long start = System.currentTimeMillis();
+                UnicodeMap<String> newMap = new UnicodeMap<>();
+                for (UnicodeMap.EntryRange<String> range : raw.entryRanges()) {
+                    if (range.codepoint == -1) {
+                        newMap.put(range.string, range.value);
+                    } else if (DefaultValueType.forString(range.value)
+                                    == DefaultValueType.CODE_POINT
+                            || (prop == UcdProperty.Name && range.value.endsWith("#"))) {
+                        for (int c = range.codepoint; c <= range.codepointEnd; ++c) {
+                            newMap.put(c, resolveValue(range.value, c));
+                        }
+                    } else {
+                        newMap.putAll(range.codepoint, range.codepointEnd, range.value);
+                    }
+                }
+                final long stop = System.currentTimeMillis();
+                final long Δt_in_ms = stop - start;
+                // We do not want to construct these UnicodeMaps that map most of the code space to
+                // itself, not so much because building them is costly, but because whatever we do
+                // on them is almost certainly a bad idea (for instance calling `values()` will be
+                // extremely slow).  Log a trace so we can figure out where we are using this.
+                System.out.println(
+                        "Built " + prop + " " + ucdVersion + " map in " + Δt_in_ms + " ms");
+                new Throwable().printStackTrace(System.out);
+
+                return newMap;
+            } else {
+                return raw;
+            }
+        }
+
+        protected UnicodeMap<String> _getRawUnicodeMap() {
             return load(prop);
         }
 
         @Override
         protected String _getValue(int codepoint) {
-            final String result = _getUnicodeMap().get(codepoint);
-            if (DefaultValueType.forString(result) == DefaultValueType.CODE_POINT) {
+            final String result = _getRawUnicodeMap().get(codepoint);
+            return resolveValue(result, codepoint);
+        }
+
+        private String resolveValue(String rawValue, int codepoint) {
+            if (DefaultValueType.forString(rawValue) == DefaultValueType.CODE_POINT) {
                 return Character.toString(codepoint);
+            } else if (prop == UcdProperty.Name && rawValue != null && rawValue.endsWith("#")) {
+                return rawValue.substring(0, rawValue.length() - 1) + Utility.hex(codepoint);
             } else {
-                return result;
+                return rawValue;
             }
         }
 
@@ -731,6 +763,7 @@ protected List _getValueAliases(String valueAlias, List result) {
                 }
             }
             if (!result.contains(valueAlias)) {
+                // TODO(egg): We should not be constructing this map for this.
                 if (_getUnicodeMap().containsValue(valueAlias)) {
                     result.add(valueAlias);
                 }

diff --git a/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java b/unicodetools/src/main/java/org/unicode/props/ShimUnicodePropertyFactory.java
@@ -58,6 +58,8 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
             UnicodeProperty prop = factory.getProperty(propName);
             switch (propName) {
                 case "Bidi_Mirroring_Glyph":
+                    // The default is <none> in BidiMirroring.txt, but TUP incorrectly has it as
+                    // <code point>.
                     prop =
                             replaceCpValues(
                                     prop,
@@ -68,22 +70,21 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
                     prop = replaceValues(prop, oldValue -> oldValue == null ? "\u0000" : oldValue);
                     break;
                 case "FC_NFKC_Closure":
+                    // The default is <code point> in PropertyValueAliases.txt, but TUP incorrectly
+                    // has it as <none>.
                     prop =
                             replaceCpValues(
                                     prop, (cp, oldValue) -> fixFC_NFKC_Closure(cp, oldValue));
 
-                    break;
-                case "Joining_Type":
-                    prop = replaceCpValues(prop, (cp, oldValue) -> fixJoining_Type(cp, oldValue));
-                    break;
-                case "Joining_Group":
-                    prop = modifyJoining_Group(prop);
                     break;
                 case "Jamo_Short_Name":
                     prop = modifyJamo_Short_Name(prop);
                     break;
                 case "Name":
-                    // prop = modifyName(prop);
+                    // TUP reports the special label <control-XXXX> as the value of the Name
+                    // property. This is incorrect, the actual value of the Name property is "" for
+                    // those, see https://www.unicode.org/versions/latest/ch04.pdf#G135245 and
+                    // https://www.unicode.org/reports/tr44/#Name.
                     prop = replaceCpValues(prop, (cp, x) -> fixName(cp, x));
                     break;
                 case "Numeric_Value":
@@ -301,40 +302,24 @@ private String fixNumericValue(String value) {
     private String fixName(int cp, String value) {
         if (control.contains(cp)) {
             return "<control-" + Utility.hex(cp, 4) + ">";
-        } else if (value != null && value.contains("#")) {
-            return value.replace("#", Utility.hex(cp, 4));
         } else {
             return value;
         }
     }
 
     private String fixFC_NFKC_Closure(int cp, String oldValue) {
-        if (oldValue.equals("<code point>") || equalsString(cp, oldValue)) {
+        if (equalsString(cp, oldValue)) {
             return null;
         } else {
             return oldValue;
         }
     }
 
-    // Joining_Type needs fix in IUP
-    private String fixJoining_Type(int cp, String oldValue) {
-        if (defaultTransparent.contains(cp) && "Non_Joining".equals(oldValue)) {
-            return "Transparent";
-        } else {
-            return oldValue;
-        }
-    }
-
     // Jamo_Short_Name needs fix in IUP
     private UnicodeProperty modifyJamo_Short_Name(UnicodeProperty prop) {
         return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ᄋ', ""));
     }
 
-    // Joining_Group needs fix in IUP (really, in UCD data)
-    private UnicodeProperty modifyJoining_Group(UnicodeProperty prop) {
-        return copyPropReplacingMap(prop, prop.getUnicodeMap().put('ۃ', "Teh_Marbuta_Goal"));
-    }
-
     /** Very useful. May already be in ICU, but not sure. */
     public boolean equalsString(int codepoint, String value) {
         return codepoint == value.codePointAt(0)

diff --git a/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java b/unicodetools/src/main/java/org/unicode/props/UnicodeProperty.java
@@ -550,6 +550,12 @@ public UnicodeMap<String> getUnicodeMap() {
         return getUnicodeMap(false);
     }
 
+    public boolean isTrivial() {
+        final var map = getUnicodeMap();
+        return map.isEmpty()
+                || map.keySet("").equals(UnicodeSet.ALL_CODE_POINTS) && map.stringKeys().isEmpty();
+    }
+
     /**
      * @return the unicode map
      */

diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java b/unicodetools/src/main/java/org/unicode/text/UCD/VersionedProperty.java
@@ -1,6 +1,5 @@
 package org.unicode.text.UCD;
 
-import com.ibm.icu.dev.util.UnicodeMap;
 import com.ibm.icu.text.SymbolTable;
 import com.ibm.icu.text.UnicodeSet;
 import java.text.ParsePosition;
@@ -63,13 +62,6 @@ public static VersionedProperty forJSPs() {
     private static final Set<String> TOOL_ONLY_PROPERTIES =
             Set.of("toNFC", "toNFD", "toNFKC", "toNFKD");
 
-    private static boolean isTrivial(UnicodeMap<String> map) {
-        return map.isEmpty()
-                || (map.values().size() == 1
-                        && map.getSet(map.values().iterator().next())
-                                .equals(UnicodeSet.ALL_CODE_POINTS));
-    }
-
     public UnicodeProperty getProperty() {
         return property;
     }
@@ -112,11 +104,11 @@ public VersionedProperty set(String xPropertyName) {
         propSource = getIndexedProperties(version);
         property = propSource.getProperty(xPropertyName);
         if ((property == null && TOOL_ONLY_PROPERTIES.contains(xPropertyName))
-                || (property != null && isTrivial(property.getUnicodeMap()) && allowRetroactive)) {
+                || (property != null && property.isTrivial() && allowRetroactive)) {
             propSource = ToolUnicodePropertySource.make(version);
             property = propSource.getProperty(xPropertyName);
         }
-        if (property == null || isTrivial(property.getUnicodeMap())) {
+        if (property == null || property.isTrivial()) {
             if (!throwOnUnknownProperty) {
                 return null;
             }