Skip to content

Commit

Permalink
A failing test and a bit more clarity.
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Feb 6, 2024
1 parent 04fb3b8 commit 6b23406
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
oldValue == null ? UTF16.valueOf(cp) : oldValue);
break;
case "Bidi_Paired_Bracket":
// The default is <none> in PropertyValueAliases.txt, but TUP incorrectly
// has it as U+0000.
prop = replaceValues(prop, oldValue -> oldValue == null ? "\u0000" : oldValue);
break;
case "FC_NFKC_Closure":
Expand All @@ -76,9 +78,6 @@ public ShimUnicodePropertyFactory(IndexUnicodeProperties factory) {
replaceCpValues(
prop, (cp, oldValue) -> fixFC_NFKC_Closure(cp, oldValue));

break;
case "Jamo_Short_Name":
prop = modifyJamo_Short_Name(prop);
break;
case "Name":
// TUP reports the special label <control-XXXX> as the value of the Name
Expand Down Expand Up @@ -315,11 +314,6 @@ private String fixFC_NFKC_Closure(int cp, String oldValue) {
}
}

/**
 * Jamo_Short_Name needs fix in IUP: puts the empty string at 'ᄋ' in the map obtained from
 * {@code prop}, then hands that map to {@code copyPropReplacingMap} together with {@code prop}.
 *
 * @param prop the Jamo_Short_Name property to adjust
 * @return whatever {@code copyPropReplacingMap} produces for {@code prop} and the adjusted map
 */
private UnicodeProperty modifyJamo_Short_Name(UnicodeProperty prop) {
    // put(...) is chained in the original, so its return value (not the receiver) is what is
    // passed on; keep that exact expression and only give it a name.
    final var adjustedMap = prop.getUnicodeMap().put('ᄋ', "");
    return copyPropReplacingMap(prop, adjustedMap);
}

/** Very useful. May already be in ICU, but not sure. */
public boolean equalsString(int codepoint, String value) {
return codepoint == value.codePointAt(0)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.Tabber;
import org.unicode.cldr.util.Tabber.HTMLTabber;
Expand Down Expand Up @@ -549,12 +552,15 @@ static class FilterOrProp {
enum Type {
filter,
prop,
stringprop
stringprop,
sequenceTransformation,
};

private Type type;
private UnicodeProperty prop;
private UnicodeSet filter;
private Function<List<String>, List<String>> sequenceTransformation;
private Function<List<String>, String> sequenceReduction;
}

private static final UnicodeSet PROPCHARS =
Expand All @@ -571,6 +577,87 @@ static UnicodeProperty of(
propOrFilter.filter = parseUnicodeSet(line, pp);
propOrFilter.type = FilterOrProp.Type.filter;
result.propOrFilters.add(propOrFilter);
} else if (line.charAt(pp.getIndex()) == '(') {
final FilterOrProp propOrFilter = new FilterOrProp();
final var matcher =
Pattern.compile("(\\( *([^ )]+)(?: +([^)]+))? *\\)).*")
.matcher(line.substring(pp.getIndex()));
if (matcher.matches()) {
propOrFilter.type = FilterOrProp.Type.sequenceTransformation;
final String expression = matcher.group(1);
final String operation = matcher.group(2);
final String args = matcher.group(3);
switch (operation) {
case "take":
{
final int count = Integer.parseInt(args);
propOrFilter.sequenceTransformation = s -> s.subList(0, count);
break;
}
case "drop":
{
final int count = Integer.parseInt(args);
propOrFilter.sequenceTransformation =
s -> s.subList(count, s.size());
break;
}
case "delete-adjacent-duplicates":
{
propOrFilter.sequenceTransformation =
s -> {
if (s.isEmpty()) {
return s;
}
int j = 0;
for (int i = 1; i < s.size(); ++i) {
if (!Objects.equals(s.get(i), s.get(j))) {
s.set(++j, s.get(i));
}
}
s.subList(j + 1, s.size()).clear();
return s;
};
break;
}
case "prepend":
{
propOrFilter.sequenceTransformation =
s -> {
s.add(0, args);
return s;
};
break;
}
case "append":
{
propOrFilter.sequenceTransformation =
s -> {
s.add(args);
return s;
};
break;
}
case "string-join":
{
propOrFilter.sequenceReduction = s -> String.join("", s);
break;
}
case "constant":
{
propOrFilter.sequenceReduction = s -> args;
break;
}
default:
throw new IllegalArgumentException(
"Unknown operation " + matcher.group(1));
}
result.propOrFilters.add(propOrFilter);
pp.setIndex(pp.getIndex() + expression.length());
} else {
throw new IllegalArgumentException(
"Expected (<operation> <args>), got "
+ line.substring(pp.getIndex()));
}
} else {
final String propName = scan(PROPCHARS, line, pp, true);
if (propName.length() > 0) {
Expand All @@ -583,9 +670,11 @@ static UnicodeProperty of(
"Can't create property for: " + propName);
}
propOrFilter.type =
propOrFilter.prop.getType() != UnicodeProperty.STRING
? FilterOrProp.Type.prop
: FilterOrProp.Type.stringprop;
propOrFilter.prop.getType() == UnicodeProperty.STRING
|| propOrFilter.prop.getType()
== UnicodeProperty.EXTENDED_STRING
? FilterOrProp.Type.stringprop
: FilterOrProp.Type.prop;
result.propOrFilters.add(propOrFilter);
} else {
break;
Expand Down Expand Up @@ -629,13 +718,21 @@ protected List<String> _getNameAliases(List<String> result) {
@Override
protected String _getValue(int codepoint) {
final StringBuffer buffer = new StringBuffer();
String value = UTF16.valueOf(codepoint);
String value = Character.toString(codepoint);
List<String> values = null;
int cp;

for (int i = propOrFilters.size() - 1; i >= 0; --i) {
final FilterOrProp propOrFilter = propOrFilters.get(i);
switch (propOrFilter.type) {
case filter:
if (value == null) {
throw new IllegalArgumentException(
"Cannot apply filter "
+ propOrFilter.filter.toString()
+ " to sequence "
+ values);
}
buffer.setLength(0);
for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(value, j);
Expand All @@ -647,6 +744,13 @@ protected String _getValue(int codepoint) {
value = buffer.toString();
break;
case stringprop:
if (value == null) {
throw new IllegalArgumentException(
"Cannot apply string property "
+ propOrFilter.prop.getName()
+ " to sequence "
+ values);
}
buffer.setLength(0);
for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(value, j);
Expand All @@ -656,19 +760,53 @@ protected String _getValue(int codepoint) {
value = buffer.toString();
break;
case prop:
final LinkedHashSet<String> values = new LinkedHashSet<String>();
if (value == null) {
throw new IllegalArgumentException(
"Cannot apply enumerated property "
+ propOrFilter.prop.getName()
+ " to sequence "
+ values);
}
values = new ArrayList<>();
for (int j = 0; j < value.length(); j += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(value, j);
final String value2 = propOrFilter.prop.getValue(cp);
values.add(value2);
}
if (values.size() == 0) {
value = "";
} else if (values.size() == 1) {
value = values.iterator().next();
value = null;
break;
case sequenceTransformation:
final boolean wasString = value != null;
if (wasString) {
values =
value.codePoints()
.mapToObj(Character::toString)
.collect(
Collectors.toCollection(
() -> new ArrayList<>()));
value = null;
}
if (propOrFilter.sequenceTransformation != null) {
values = propOrFilter.sequenceTransformation.apply(values);
if (wasString) {
value = String.join("", values);
values = null;
}
} else {
value = values.toString();
value = propOrFilter.sequenceReduction.apply(values);
values = null;
}
break;
}
}
if (value == null) {
if (values.isEmpty()) {
return "";
} else if (values.size() == 1) {
return values.get(0);
} else {
throw new IllegalArgumentException(
"Compound property must return a string, not sequence " + values);
}
}
return value;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,25 +53,37 @@
#
# For each character in <unicodeSet>, verify that the result of applying the left <props>
# is (=|≠) the result of applying the right <props>.
# <props> is of the form (<unicodeSet> | <prop>) ("*" (<unicodeSet> | <prop>))?
# <props> is of the form (<unicodeSet> | <prop> | <sequenceTransformation>) ("*" (<unicodeSet> | <prop> | <sequenceTransformation>))*
# It is the functional composition of the properties applied to strings, whereby
# <unicodeSet> is used to filter the result.
# <prop> for a string property is applied to each character, and the result concatenated
# That is, cf("A1") is cf("A")+cf("1") = "a1"
# <prop> for an enumerated property, is applied to each character, and the result is a concatenated set.
# That is, gc("A1") is gc("A")+gc("1") = "Uppercase_LetterDecimal_Number"
# <prop> for an enumerated property is applied to each character, and the result is a sequence of
# strings.
# That is, gc("A1") is [gc("A"), gc("1")] = ["Uppercase_Letter", "Decimal_Number"]
# <sequenceTransformation> may be applied to a sequence of strings or to a string. On a string it
# operates on the code points and returns a string.
# The available operations are:
# (append <string>)
# (prepend <string>)
# (take <n>)
# (drop <n>)
# (delete-adjacent-duplicates)
# (string-join)
# (constant <string>)
# The result of the <props> must be a string, a single-string sequence, or an empty sequence.
#
# Example: for <props> of bc * \P{bc=NSM} * cf * dm, the results of each step applied to Å (angstrom sign) are:
# bc * \P{bc=NSM} * cf * dm ("Å")
# bc * \P{bc=NSM} * cf ("A" + ring above)
# bc * \P{bc=NSM} ("a" + ring above)
# bc ("a")
# "Left"
# "Left_To_Right"
#
# Example: In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
# Example: In \p{dt=canonical} (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
# This examines only those characters that have canonical decompositions. For each such character X
# it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class.
# It then tests that against the result of filtering out NSM characters from X, then getting the BIDI_Class.
# it gets the decomposition mapping of X, then filters out all NSM characters, then gets the Bidi_Class,
# then deduplicates runs of the same Bidi_Class.
# It then compares that with the result of filtering out NSM characters from X, then getting the Bidi_Class.
#
##########################
# EquivalencesOf <unicodeSet> <props> (⇐|⇔|⇒|⇍|⇎|⇏) <props>
Expand Down Expand Up @@ -241,13 +253,13 @@ Let $caseOverlap = [\u02B0-\u02B8\u02C0\u02C1\u02E0-\u02E4\u0345\u037A\u10FC\u1D

# Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence.
# This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered
In \p{dt=canonical} bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}
In \p{dt=canonical} (delete-adjacent-duplicates) * bc * \P{bc=NSM} * dm = bc * \P{bc=NSM}

# Stability: The property values for the bidirectional properties Bidi_Class and Bidi_Mirrored preserve canonical equivalence.
# This test utilizes the fact that bc=NSM inherit behavior in the algorithm, so these are just filtered
# There are 6 special cases, all symmetric symbols—which are not mirrored—with a solidus overlay:
Let $BMExclusions = [ ≠ ∤ ∦ ≢ ≭ ⫝̸ ]
In [\p{dt=canonical}-$BMExclusions] Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM}
In [\p{dt=canonical}-$BMExclusions] (delete-adjacent-duplicates) * Bidi_M * \P{bc=NSM} * dm = Bidi_M * \P{bc=NSM}

# Additional BIDI invariant constants
Let $AL_blocks = [\u0600-\u07BF \u0860-\u08FF \uFB50-\uFDCF \uFDF0-\uFDFF \uFE70-\uFEFF \U00010D00-\U00010D3F \U00010EC0-\U00010EFF \U00010F30-\U00010F6F \U0001EC70-\U0001ECBF \U0001ED00-\U0001ED4F \U0001EE00-\U0001EEFF]
Expand Down Expand Up @@ -321,7 +333,7 @@ EquivalencesOf $codepoints Case_Folding ⇔ Simple_Case_Folding

# Stability: Once a character is assigned, both its Name and its Jamo_Short_Name will never change.
# Name is covered in Main policies
# TODO: Short Name
In \P{U-1:GC=Cn} Jamo_Short_Name=U-1:Jamo_Short_Name

# Stability: The Noncharacter_Code_Point property is an immutable code point property, which means that its property values for all Unicode code points will never change.
\p{NChar} = \p{U-1:NChar}
Expand Down Expand Up @@ -963,9 +975,7 @@ Let $nonIdeographicStrokes = \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZ

\P{Equivalent_Unified_Ideograph=@none@} ⊆ $strokesAndRadicals
[$strokesAndRadicals - \P{Equivalent_Unified_Ideograph=@none@}] = [$nonIdeographicStrokes $nonIdeographicRadicals]
# TODO(egg): NFC_Quick_Check is a stupid way to get a Yes here; we are checking
# that Equivalent_Unified_Ideograph values are single unified ideographs.
In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unified_Ideograph = NFC_Quick_Check
In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unified_Ideograph = (constant Yes)

# InPC-InSC-gc invariants
# See https://www.unicode.org/L2/L2023/23200-category-invariants.pdf.
Expand All @@ -977,4 +987,8 @@ In \P{Equivalent_Unified_Ideograph=@none@} Unified_Ideograph * Equivalent_Unifie

# Script Extensions (mostly testing the proper handling of multivalued properties).
\p{sc=Deva} ⊂ \p{scx=Deva}
[\p{scx=Deva} & \p{scx=Beng}] ⊃ []
[\p{scx=Deva} & \p{scx=Beng}] ⊃ []

# Hangul Syllable Name Generation,
# https://www.unicode.org/versions/latest/ch03.pdf#G59675.
In [\p{Block=Hangul Syllables} - \p{gc=Cn}] (prepend HANGUL SYLLABLE ) * (string-join) * Jamo_Short_Name * toNFD = Name

Check failure on line 994 in unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt

View workflow job for this annotation

GitHub Actions / Check UCD consistency, invariants, smoke-test generators

Invariant test failure

Got unexpected differences: 588 C544 ; HANGUL SYLLABLE nullA≠HANGUL SYLLABLE A # (아) HANGUL SYLLABLE A C545 ; HANGUL SYLLABLE nullAG≠HANGUL SYLLABLE AG # (악) HANGUL SYLLABLE AG C546 ; HANGUL SYLLABLE nullAGG≠HANGUL SYLLABLE AGG # (앆) HANGUL SYLLABLE AGG C547 ; HANGUL SYLLABLE nullAGS≠HANGUL SYLLABLE AGS # (앇) HANGUL SYLLABLE AGS C548 ; HANGUL SYLLABLE nullAN≠HANGUL SYLLABLE AN # (안) HANGUL SYLLABLE AN C549 ; HANGUL SYLLABLE nullANJ≠HANGUL SYLLABLE ANJ # (앉) HANGUL SYLLABLE ANJ C54A ; HANGUL SYLLABLE nullANH≠HANGUL SYLLABLE ANH # (않) HANGUL SYLLABLE ANH C54B ; HANGUL SYLLABLE nullAD≠HANGUL SYLLABLE AD # (앋) HANGUL SYLLABLE AD C54C ; HANGUL SYLLABLE nullAL≠HANGUL SYLLABLE AL # (알) HANGUL SYLLABLE AL C54D ; HANGUL SYLLABLE nullALG≠HANGUL SYLLABLE ALG # (앍) HANGUL SYLLABLE ALG C54E ; HANGUL SYLLABLE nullALM≠HANGUL SYLLABLE ALM # (앎) HANGUL SYLLABLE ALM C54F ; HANGUL SYLLABLE nullALB≠HANGUL SYLLABLE ALB # (앏) HANGUL SYLLABLE ALB C550 ; HANGUL SYLLABLE nullALS≠HANGUL SYLLABLE ALS # (앐) HANGUL SYLLABLE ALS C551 ; HANGUL SYLLABLE nullALT≠HANGUL SYLLABLE ALT # (앑) HANGUL SYLLABLE ALT C552 ; HANGUL SYLLABLE nullALP≠HANGUL SYLLABLE ALP # (앒) HANGUL SYLLABLE ALP C553 ; HANGUL SYLLABLE nullALH≠HANGUL SYLLABLE ALH # (앓) HANGUL SYLLABLE ALH C554 ; HANGUL SYLLABLE nullAM≠HANGUL SYLLABLE AM # (암) HANGUL SYLLABLE AM C555 ; HANGUL SYLLABLE nullAB≠HANGUL SYLLABLE AB # (압) HANGUL SYLLABLE AB C556 ; HANGUL SYLLABLE nullABS≠HANGUL SYLLABLE ABS # (앖) HANGUL SYLLABLE ABS C557 ; HANGUL SYLLABLE nullAS≠HANGUL SYLLABLE AS # (앗) HANGUL SYLLABLE AS C558 ; HANGUL SYLLABLE nullASS≠HANGUL SYLLABLE ASS # (았) HANGUL SYLLABLE ASS C559 ; HANGUL SYLLABLE nullANG≠HANGUL SYLLABLE ANG # (앙) HANGUL SYLLABLE ANG C55A ; HANGUL SYLLABLE nullAJ≠HANGUL SYLLABLE AJ # (앚) HANGUL SYLLABLE AJ C55B ; HANGUL SYLLABLE nullAC≠HANGUL SYLLABLE AC # (앛) HANGUL SYLLABLE AC C55C ; HANGUL SYLLABLE nullAK≠HANGUL SYLLABLE AK # (앜) HANGUL SYLLABLE AK C55D ; HANGUL SYLLABLE nullAT≠HANGUL SYLLABLE AT # (앝) HANGUL SYLLABLE AT C55E 
; HANGUL SYLLABLE nullAP≠HANGUL SYLLABLE AP # (앞) HANGUL SYLLABLE AP C55F ; HANGUL SYLLABLE nullAH≠HANGUL SYLLABLE AH # (앟) HANGUL SYLLABLE AH C560 ; HANGUL SYLLABLE nullAE≠HANGUL SYLLABLE AE # (애) HANGUL SYLLABLE AE C561 ; HANGUL SYLLABLE nullAEG≠HANGUL SYLLABLE AEG # (액) HANGUL SYLLABLE AEG C562 ; HANGUL SYLLABLE nullAEGG≠HANGUL SYLLABLE AEGG # (앢) HANGUL SYLLABLE AEGG C563 ; HANGUL SYLLABLE nullAEGS≠HANGUL SYLLABLE AEGS # (앣) HANGUL SYLLABLE AEGS C564 ; HANGUL SYLLABLE nullAEN≠HANGUL SYLLABLE AEN # (앤) HANGUL SYLLABLE AEN C565 ; HANGUL SYLLABLE nullAENJ≠HANGUL SYLLABLE AENJ # (앥) HANGUL SYLLABLE AENJ C566 ; HANGUL SYLLABLE nullAENH≠HANGUL SYLLABLE AENH # (앦) HANGUL SYLLABLE AENH C567 ; HANGUL SYLLABLE nullAED≠HANGUL SYLLABLE AED # (앧) HANGUL SYLLABLE AED C568 ; HANGUL SYLLABLE nullAEL≠HANGUL SYLLABLE AEL # (앨) HANGUL SYLLABLE AEL C569 ; HANGUL SYLLABLE nullAELG≠HANGUL SYLLABLE AELG # (앩) HANGUL SYLLABLE AELG C56A ; HANGUL SYLLABLE nullAELM≠HANGUL SYLLABLE AELM # (앪) HANGUL SYLLABLE AELM C56B ; HANGUL SYLLABLE nullAELB≠HANGUL SYLLABLE AELB # (앫) HANGUL SYLLABLE AELB C56C ; HANGUL SYLLABLE nullAELS≠HANGUL SYLLABLE AELS # (앬) HANGUL SYLLABLE AELS C56D ; HANGUL SYLLABLE nullAELT≠HANGUL SYLLABLE AELT # (앭) HANGUL SYLLABLE AELT C56E ; HANGUL SYLLABLE nullAEL

0 comments on commit 6b23406

Please sign in to comment.