Skip to content

Commit

Permalink
ICU-21592 Update cj normal/loose linebreak per CSS
Browse files Browse the repository at this point in the history
  • Loading branch information
pedberg-icu committed Feb 22, 2022
1 parent 7d825cb commit 9203ec6
Show file tree
Hide file tree
Showing 13 changed files with 65 additions and 51 deletions.
11 changes: 7 additions & 4 deletions icu4c/source/data/brkitr/rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -238,7 +239,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303


$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

Expand Down Expand Up @@ -294,8 +295,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);


^$CM+ ($BA | $HY | $NS);
Expand Down
11 changes: 7 additions & 4 deletions icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -251,7 +252,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
# See issue ICU-20303


$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];

Expand Down Expand Up @@ -307,8 +308,10 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
$LB20NonBreaks $CM* ($BA | $HY | $NS);
# DO allow breaks here before $NSX, so don't include it.
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
$ID $CM* ($BA | $HY | $NS);


^$CM+ ($BA | $HY | $NS);
Expand Down
11 changes: 5 additions & 6 deletions icu4c/source/data/brkitr/rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

#
Expand All @@ -29,8 +29,7 @@

$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -184,7 +183,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;


Expand Down Expand Up @@ -282,7 +281,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);


Expand All @@ -294,7 +293,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
Expand Down
11 changes: 5 additions & 6 deletions icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
# The content is the same as line_normal_cj.txt except the following
Expand All @@ -31,8 +31,7 @@

$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BAX = [\u2010 \u2013];
$BA = [[:LineBreak = Break_After:] - $BAX];
$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -197,7 +196,7 @@ $GL $CM* .;
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;


Expand Down Expand Up @@ -295,7 +294,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# LB 21 x (BA | HY | NS)
# BB x
#
# DO allow breaks here before $BAX and $NSX, so don't include them
# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);


Expand All @@ -307,7 +306,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
$HL $CM* ($HY | $BA) $CM* [^$CB]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
Expand Down
9 changes: 6 additions & 3 deletions icu4c/source/test/testdata/break_rules/line_loose_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;

LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];

LB21b: SY CM* HL;

Expand Down
14 changes: 5 additions & 9 deletions icu4c/source/test/testdata/break_rules/line_normal_cj.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

type = line;
locale = ja@lb=normal;

AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;

# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
Expand Down Expand Up @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;

# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

Expand Down
6 changes: 6 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>

# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>

# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
Expand Down
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/icudata.jar
Git LFS file not shown
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/icutzdata.jar
Git LFS file not shown
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/testdata.jar
Git LFS file not shown
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * between ID and hyphens 2010 & 2013 (both BA)
# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
Expand Down Expand Up @@ -200,8 +201,10 @@ LB20.09: ^(HY | HH) CM* AL;

LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
LB21.2: ID CM* [BA HY NS];
LB21.3: CM+ [BA HY NS];
LB21.4: BB CM* [^CM CB];

LB21b: SY CM* HL;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.

type = line;
locale = ja@lb=normal;

AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
BAX = [\u2010 \u2013];
BA = [[:LineBreak = Break_After:] - BAX];
BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
Expand Down Expand Up @@ -154,7 +153,7 @@ LB11.3: WJ CM* [^CM];

LB12: GL CM* [^CM];

LB12a: [^SP BA BAX HY] CM* GL;
LB12a: [^SP BA HY] CM* GL;

# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
Expand Down Expand Up @@ -182,12 +181,9 @@ LB20.09: ^(HY | HH) CM* AL;

# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
# should "HL BAX" not break when followed by a CB? That's what the current
# rules do, which is why "[^CM CB]?" includes the ?.
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
LB21a: HL CM* (HY | BA) CM* [^CM CB]?;

# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
# DO allow breaks here before $NSXcm, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1646,11 +1646,17 @@ Bangkok)•</data>
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>

# •no brk before 2010 •
<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>

# •no brk before 2010 except ok after ID •
<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>

<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
Expand Down Expand Up @@ -1888,7 +1894,7 @@ Bangkok)•</data>
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>
Expand Down

0 comments on commit 9203ec6

Please sign in to comment.