ICU-21592 Update cj normal/loose linebreak per CSS

unicode-org · Feb 22, 2022 · 9203ec6 · 9203ec6
1 parent 7d825cb
commit 9203ec6
Show file tree

Hide file tree

Showing 13 changed files with 65 additions and 51 deletions.
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -17,7 +17,8 @@
 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * between ID and hyphens 2010 & 2013 (both BA)
+#         * before 301C, 30A0 (both NS)
 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
 #         * between characters of LineBreak class IN such as 2026
 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
 #        See issue ICU-20303
 
 
-$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
+$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
 $SP $IS           / [^ $CanFollowIS $NU $CM];
 $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
 
@@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 # LB 21        x   (BA | HY | NS)
 #           BB x
 #
-# DO allow breaks here before $BAX and $NSX, so don't include them
-$LB20NonBreaks $CM* ($BA | $HY | $NS);
+# DO allow breaks here before $NSX, so don't include it.
+# And DO allow breaks between ID and $BAX, so handle ID in a separate rule that does not include $BAX.
+[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
+$ID $CM* ($BA | $HY | $NS);
 
 
 ^$CM+ ($BA | $HY | $NS);

diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
@@ -15,7 +15,8 @@
 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * between ID and hyphens 2010 & 2013 (both BA)
+#         * before 301C, 30A0 (both NS)
 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
 #         * between characters of LineBreak class IN such as 2026
 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
 #        See issue ICU-20303
 
 
-$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
+$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
 $SP $IS           / [^ $CanFollowIS $NU $CM];
 $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
 
@@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 # LB 21        x   (BA | HY | NS)
 #           BB x
 #
-# DO allow breaks here before $BAX and $NSX, so don't include them
-$LB20NonBreaks $CM* ($BA | $HY | $NS);
+# DO allow breaks here before $NSX, so don't include it.
+# And DO allow breaks between ID and $BAX, so handle ID in a separate rule that does not include $BAX.
+[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
+$ID $CM* ($BA | $HY | $NS);
 
 
 ^$CM+ ($BA | $HY | $NS);

diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -17,7 +17,7 @@
 #         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before 301C, 30A0 (both NS)
 #         It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
 
 #
@@ -29,8 +29,7 @@
 
 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
-$BAX = [\u2010 \u2013];
-$BA = [[:LineBreak =  Break_After:] - $BAX];
+$BA = [:LineBreak =  Break_After:];
 $HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
 $BB = [:LineBreak =  Break_Before:];
 $BK = [:LineBreak =  Mandatory_Break:];
@@ -184,7 +183,7 @@ $GL $CM* .;
 # LB 12a  Do not break before NBSP and related characters ...
 #            [^SP BA HY] x GL
 #
-[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
 ^$CM+ $GL;
 
 
@@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 # LB 21        x   (BA | HY | NS)
 #           BB x
 #
-# DO allow breaks here before $BAX and $NSX, so don't include them
+# DO allow breaks here before $NSX, so don't include it
 $LB20NonBreaks $CM* ($BA | $HY | $NS);
 
 
@@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks;
 # LB 21a Don't break after Hebrew + Hyphen
 #   HL (HY | BA) x
 #
-$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
+$HL $CM* ($HY | $BA) $CM* [^$CB]?;
 
 # LB 21b (forward) Don't break between SY and HL
 # (break between HL and SY already disallowed by LB 13 above)

diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
@@ -15,7 +15,7 @@
 #         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before 301C, 30A0 (both NS)
 #         It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
 #
 #         The content is the same as line_normal_cj.txt except the following
@@ -31,8 +31,7 @@
 
 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
-$BAX = [\u2010 \u2013];
-$BA = [[:LineBreak =  Break_After:] - $BAX];
+$BA = [:LineBreak =  Break_After:];
 $HH = [\u2010];     # \u2010 is HYPHEN, default line break is BA.
 $BB = [:LineBreak =  Break_Before:];
 $BK = [:LineBreak =  Mandatory_Break:];
@@ -197,7 +196,7 @@ $GL $CM* .;
 # LB 12a  Do not break before NBSP and related characters ...
 #            [^SP BA HY] x GL
 #
-[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
 ^$CM+ $GL;
 
 
@@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
 # LB 21        x   (BA | HY | NS)
 #           BB x
 #
-# DO allow breaks here before $BAX and $NSX, so don't include them
+# DO allow breaks here before $NSX, so don't include it
 $LB20NonBreaks $CM* ($BA | $HY | $NS);
 
 
@@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks;
 # LB 21a Don't break after Hebrew + Hyphen
 #   HL (HY | BA) x
 #
-$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
+$HL $CM* ($HY | $BA) $CM* [^$CB]?;
 
 # LB 21b (forward) Don't break between SY and HL
 # (break between HL and SY already disallowed by LB 13 above)

diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@@ -20,7 +20,8 @@
 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * between ID and hyphens 2010 & 2013 (both BA)
+#         * before 301C, 30A0 (both NS)
 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
 #         * between characters of LineBreak class IN such as 2026
 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -200,8 +201,10 @@ LB20.09:     ^(HY | HH) CM* AL;
 
 LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
 
-LB21.1:      . CM* [BA HY NS];
-LB21.2:      BB CM* [^CM CB];
+LB21.1:      [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
+LB21.2:      ID CM* [BA HY NS];
+LB21.3:      CM+ [BA HY NS];
+LB21.4:      BB CM* [^CM CB];
 
 LB21b:       SY CM* HL;
 

diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@@ -20,16 +20,15 @@
 #         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before 301C, 30A0 (both NS)
 #         It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
 
 type = line;
 locale = ja@lb=normal;
 
 AI = [:LineBreak =  Ambiguous:];
 AL = [:LineBreak =  Alphabetic:];
-BAX = [\u2010 \u2013];
-BA = [[:LineBreak =  Break_After:] - BAX];
+BA = [:LineBreak =  Break_After:];
 HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
 BB = [:LineBreak =  Break_Before:];
 BK = [:LineBreak =  Mandatory_Break:];
@@ -154,7 +153,7 @@ LB11.3:      WJ CM* [^CM];
 
 LB12:        GL CM* [^CM];
 
-LB12a:       [^SP BA BAX HY] CM* GL;
+LB12a:       [^SP BA HY] CM* GL;
 
 # LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
 LB13.1: [^SP] CM* [CL CP EX SY];
@@ -182,12 +181,9 @@ LB20.09:     ^(HY | HH) CM* AL;
 
 # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
 #       not picking up the continuing match after the BA from 21a.
-# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
-#       should "HL BAX" not break when followed by a CB? Thats what the current
-#       rules do, which is why "[^CM CB]?" includes the ?.
-LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+LB21a:       HL CM* (HY | BA) CM* [^CM CB]?;
 
-# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+# DO allow breaks here before $NSXcm, so don't include it
 LB21.1:      . CM* [BA HY NS];
 LB21.2:      BB CM* [^CM CB];
 

diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
@@ -1646,11 +1646,17 @@ Bangkok)•</data>
 #     •brk OK before 3063               •brk OK before 301C •no brk btw 2026   •no brk before FF01•
 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
 
+#     •no brk before 2010                                 •
+<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
 <locale ja@lb=loose>
 <line>
 #     •brk OK before 3063               •brk OK before 301C •brk OK btw 2026    •brk OK before FF01•
 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
 
+#     •no brk before 2010 except ok after ID               •
+<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
 <locale en@lb=strict>
 <line>
 #     •no brk before 3063              •no brk before 301C•no brk btw 2026   •no brk before FF01•

diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -20,7 +20,8 @@
 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * between ID and hyphens 2010 & 2013 (both BA)
+#         * before 301C, 30A0 (both NS)
 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
 #         * between characters of LineBreak class IN such as 2026
 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -200,8 +201,10 @@ LB20.09:     ^(HY | HH) CM* AL;
 
 LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
 
-LB21.1:      . CM* [BA HY NS];
-LB21.2:      BB CM* [^CM CB];
+LB21.1:      [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
+LB21.2:      ID CM* [BA HY NS];
+LB21.3:      CM+ [BA HY NS];
+LB21.4:      BB CM* [^CM CB];
 
 LB21b:       SY CM* HL;
 

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -20,16 +20,15 @@
 #         line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
 #         It sets characters of class CJ to behave like ID.
 #         In addition, it allows breaks:
-#         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+#         * before 301C, 30A0 (both NS)
 #         It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
 
 type = line;
 locale = ja@lb=normal;
 
 AI = [:LineBreak =  Ambiguous:];
 AL = [:LineBreak =  Alphabetic:];
-BAX = [\u2010 \u2013];
-BA = [[:LineBreak =  Break_After:] - BAX];
+BA = [:LineBreak =  Break_After:];
 HH = [\u2010];      # \u2010 is HYPHEN, default line break is BA.
 BB = [:LineBreak =  Break_Before:];
 BK = [:LineBreak =  Mandatory_Break:];
@@ -154,7 +153,7 @@ LB11.3:      WJ CM* [^CM];
 
 LB12:        GL CM* [^CM];
 
-LB12a:       [^SP BA BAX HY] CM* GL;
+LB12a:       [^SP BA HY] CM* GL;
 
 # LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
 LB13.1: [^SP] CM* [CL CP EX SY];
@@ -182,12 +181,9 @@ LB20.09:     ^(HY | HH) CM* AL;
 
 # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
 #       not picking up the continuing match after the BA from 21a.
-# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
-#       should "HL BAX" not break when followed by a CB? Thats what the current
-#       rules do, which is why "[^CM CB]?" includes the ?.
-LB21a:       HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+LB21a:       HL CM* (HY | BA) CM* [^CM CB]?;
 
-# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+# DO allow breaks here before $NSXcm, so don't include it
 LB21.1:      . CM* [BA HY NS];
 LB21.2:      BB CM* [^CM CB];
 

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1646,11 +1646,17 @@ Bangkok)•</data>
 #     •brk OK before 3063               •brk OK before 301C •no brk btw 2026   •no brk before FF01•
 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
 
+#     •no brk before 2010                                 •
+<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
 <locale ja@lb=loose>
 <line>
 #     •brk OK before 3063               •brk OK before 301C •brk OK btw 2026    •brk OK before FF01•
 <data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>
 
+#     •no brk before 2010 except ok after ID               •
+<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
 <locale en@lb=strict>
 <line>
 #     •no brk before 3063              •no brk before 301C•no brk btw 2026   •no brk before FF01•
@@ -1888,7 +1894,7 @@ Bangkok)•</data>
 <line>
 #[京都観光］時雨殿に行った。-> [京都•観光］•時雨•殿に•行った。•
 <data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
-#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た
+#９月に東京から友達が遊びに来た -> ９月に•東京から•友達が•遊びに•来た•
 <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
 #る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
 <data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>