Skip to content

Commit ae68efb

Browse files
rsc authored and gopherbot committed
internal/export/unicode: add CategoryAliases, Cn, and LC
CategoryAliases is for regexp to use, for things like \p{Letter} as an alias for \p{L}. Cn and LC are special-case categories that were never implemented but should have been.

For golang/go#70780.

Change-Id: I1401c1be42106a0ebecabb085c25e97485c363cf
Reviewed-on: https://go-review.googlesource.com/c/text/+/641395
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Marcel van Lohuizen <mpvl@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
1 parent 518d9c0 commit ae68efb

File tree

1 file changed

+51
-13
lines changed

1 file changed

+51
-13
lines changed

internal/export/unicode/gen.go

+51-13
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,10 @@ import (
1313
"flag"
1414
"fmt"
1515
"log"
16+
"maps"
1617
"os"
1718
"regexp"
19+
"slices"
1820
"sort"
1921
"strings"
2022
"unicode"
@@ -90,13 +92,15 @@ func println(args ...interface{}) {
9092
var category = map[string]bool{
9193
// Nd Lu etc.
9294
// We use one-character names to identify merged categories
93-
"L": true, // Lu Ll Lt Lm Lo
94-
"P": true, // Pc Pd Ps Pe Pu Pf Po
95-
"M": true, // Mn Mc Me
96-
"N": true, // Nd Nl No
97-
"S": true, // Sm Sc Sk So
98-
"Z": true, // Zs Zl Zp
99-
"C": true, // Cc Cf Cs Co Cn
95+
"L": true, // Lu Ll Lt Lm Lo
96+
"LC": true, // Lu Ll Lt
97+
"P": true, // Pc Pd Ps Pe Pu Pf Po
98+
"M": true, // Mn Mc Me
99+
"N": true, // Nd Nl No
100+
"S": true, // Sm Sc Sk So
101+
"Z": true, // Zs Zl Zp
102+
"C": true, // Cc Cf Cs Co Cn
103+
"Cn": true, // unassigned
100104
}
101105

102106
// This contains only the properties we're interested in.
@@ -149,6 +153,9 @@ func categoryOp(code rune, class uint8) bool {
149153
}
150154

151155
func loadChars() {
156+
for code := range chars {
157+
chars[code].category = "Cn" // unassigned
158+
}
152159
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
153160
c := Char{codePoint: p.Rune(0)}
154161

@@ -201,6 +208,7 @@ func loadCasefold() {
201208
}
202209

203210
var categoryMapping = map[string]string{
211+
"LC": "Letter, cased: Ll | Lt | Lu",
204212
"Lu": "Letter, uppercase",
205213
"Ll": "Letter, lowercase",
206214
"Lt": "Letter, titlecase",
@@ -257,6 +265,7 @@ func printCategories() {
257265
printf("\t%q: %s,\n", k, k)
258266
}
259267
print("}\n\n")
268+
printCategoryAliases()
260269
}
261270

262271
decl := make(sort.StringSlice, len(list))
@@ -315,14 +324,14 @@ func printCategories() {
315324
}
316325
decl[ndecl] = varDecl
317326
ndecl++
327+
match := func(cat string) bool { return cat == name }
318328
if len(name) == 1 { // unified categories
319-
dumpRange(
320-
"_"+name,
321-
func(code rune) bool { return categoryOp(code, name[0]) })
322-
continue
329+
match = func(cat string) bool { return strings.HasPrefix(cat, name) }
323330
}
324-
dumpRange("_"+name,
325-
func(code rune) bool { return chars[code].category == name })
331+
if name == "LC" { // special unified category
332+
match = func(cat string) bool { return cat == "Ll" || cat == "Lt" || cat == "Lu" }
333+
}
334+
dumpRange("_"+name, func(code rune) bool { return match(chars[code].category) })
326335
}
327336
decl.Sort()
328337
println("// These variables have type *RangeTable.")
@@ -333,6 +342,35 @@ func printCategories() {
333342
print(")\n\n")
334343
}
335344

345+
// printCategoryAliases emits the Go source for the CategoryAliases map.
// It reads the "gc" records of PropertyValueAliases.txt and maps every
// alias found there to the category name given in the record's field 1.
// The set of known names comes from allCategories(); a name missing from
// that set is logged but its aliases are still recorded.
func printCategoryAliases() {
346+
known := make(map[string]bool)
347+
for _, name := range allCategories() {
348+
known[name] = true
349+
}
350+
351+
// table maps each alias to the category name it stands for.
table := make(map[string]string)
352+
ucd.Parse(gen.OpenUCDFile("PropertyValueAliases.txt"), func(p *ucd.Parser) {
353+
// Only records whose first field is "gc" are used; everything else is skipped.
if p.String(0) != "gc" {
354+
return
355+
}
356+
name := p.String(1)
357+
if !known[name] {
358+
// Report the mismatch but keep going: the aliases below are recorded regardless.
logger.Print("unknown category: ", name)
359+
}
360+
table[p.String(2)] = name
361+
// Field 3 is treated as an optional additional alias; it is added only when non-empty.
if a := p.String(3); a != "" {
362+
table[a] = name
363+
}
364+
})
365+
366+
println("// CategoryAliases maps category aliases to standard category names.")
367+
println("var CategoryAliases = map[string]string{")
368+
// Iterate over sorted keys so the generated output is deterministic.
for _, name := range slices.Sorted(maps.Keys(table)) {
369+
printf("\t%q: %q,\n", name, table[name])
370+
}
371+
print("}\n\n")
372+
}
373+
336374
type Op func(code rune) bool
337375

338376
func dumpRange(name string, inCategory Op) {

0 commit comments

Comments
 (0)