Skip to content

Commit

Permalink
Merge pull request #277 from eschnett/eschnett/unicode16
Browse files Browse the repository at this point in the history
Redesign combining table
  • Loading branch information
eschnett authored Dec 29, 2024
2 parents 53177fb + ecd7d1b commit c8d815a
Show file tree
Hide file tree
Showing 10 changed files with 14,062 additions and 14,006 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-fuzz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ jobs:
if: failure()
with:
name: artifacts
path: ./out/artifacts
path: ./out/artifacts
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ include (utils.cmake)

disallow_intree_builds()

project (utf8proc VERSION 2.9.0 LANGUAGES C)
project (utf8proc VERSION 2.10.0 LANGUAGES C)

# This is the ABI version number, which may differ from the
# API version number (defined in utf8proc.h and above).
# Be sure to also update these in Makefile and MANIFEST!
set(SO_MAJOR 3)
set(SO_MINOR 0)
set(SO_MINOR 1)
set(SO_PATCH 0)

option(UTF8PROC_INSTALL "Enable installation of utf8proc" On)
Expand Down Expand Up @@ -65,7 +65,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 15.1.0)
set(UNICODE_VERSION 16.0.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 15.1.0.
The Unicode version supported is 16.0.0.

For Unicode normalizations, the following options are used:

Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
$(JULIA) --project=. data_generator.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.1.0
UNICODE_VERSION=16.0.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
67 changes: 7 additions & 60 deletions data/Manifest.toml
Original file line number Diff line number Diff line change
@@ -1,69 +1,16 @@
# This file is machine-generated - editing it directly is not advised

julia_version = "1.9.3"
julia_version = "1.11.2"
manifest_format = "2.0"
project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.7.2"

[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"

[deps.Adapt.weakdeps]
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.OffsetArrays]]
deps = ["Adapt"]
git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c"
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
version = "1.12.10"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"

[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
version = "1.15.0"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[deps.OffsetArrays.extensions]
OffsetArraysAdaptExt = "Adapt"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[deps.OffsetArrays.weakdeps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
141 changes: 43 additions & 98 deletions data/data_generator.jl
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
width = 1
elseif code == 0x2028 || code == 0x2029
#By definition, should have zero width (on the same line)
#0x002028 '' category: Zl name: LINE SEPARATOR/
#0x002029 '' category: Zp name: PARAGRAPH SEPARATOR/
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
width = 0
end

Expand All @@ -256,79 +256,33 @@ end
# decompressed on the C side at runtime.

# Inverse decomposition mapping tables for combining two characters into a single one.
comb1st_indices = Dict{UInt32,Int}()
comb1st_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices = Dict{UInt32,Int}()
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices_nonbasic = Set{UInt32}()
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
comb_issecond = Set{UInt32}()
for char in char_props
# What happens with decompositions that are longer than 2?
if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
char_hash[char.decomp_mapping[1]].combining_class == 0 &&
char.code ∉ exclusions
(char.code ∉ exclusions && char.code ∉ excl_version)
dm0 = char.decomp_mapping[1]
dm1 = char.decomp_mapping[2]
if !haskey(comb1st_indices, dm0)
comb1st_indices[dm0] = length(comb1st_indices)
push!(comb1st_indices_sorted_keys, dm0)
push!(comb_array, Dict{Int,UInt32}())
@assert length(comb1st_indices) == length(comb_array)
end
if !haskey(comb2nd_indices, dm1)
push!(comb2nd_indices_sorted_keys, dm1)
comb2nd_indices[dm1] = length(comb2nd_indices)
end
@assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
if char.code > 0xFFFF
push!(comb2nd_indices_nonbasic, dm1)
if !haskey(comb_mapping, dm0)
comb_mapping[dm0] = Dict{UInt32, UInt32}()
end
comb_mapping[dm0][dm1] = char.code
push!(comb_issecond, dm1)
end
end

comb_indices = Dict{UInt32,Int}()
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb_index = Dict{UInt32, UInt32}()
comb_length = Dict{UInt32, UInt32}()
let
cumoffset = 0
for dm0 in comb1st_indices_sorted_keys
index = comb1st_indices[dm0]
first = nothing
last = nothing
offset = 0
for b in eachindex(comb2nd_indices_sorted_keys)
dm1 = comb2nd_indices_sorted_keys[b]
if haskey(comb_array[index], b)
if isnothing(first)
first = offset
end
last = offset
if dm1 in comb2nd_indices_nonbasic
last += 1
end
end
offset += 1
if dm1 in comb2nd_indices_nonbasic
offset += 1
end
end
comb1st_indices_firstoffsets[index] = first
comb1st_indices_lastoffsets[index] = last
@assert !haskey(comb_indices, dm0)
comb_indices[dm0] = cumoffset
cumoffset += last - first + 1 + 2
end

offset = 0
for dm1 in comb2nd_indices_sorted_keys
@assert !haskey(comb_indices, dm1)
comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
@assert comb2nd_indices[dm1] + offset <= 0x4000
if dm1 in comb2nd_indices_nonbasic
comb_indices[dm1] |= 0x4000
offset += 1
end
ind = 0
for dm0 in sort!(collect(keys(comb_mapping)))
comb_index[dm0] = ind
len = length(comb_mapping[dm0])
comb_length[dm0] = len
ind += len
end
end

Expand Down Expand Up @@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
comb_index = get(comb_indices, code, typemax(UInt16)),
comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
comb_length = get(comb_length, code, 0),
comb_issecond = code in comb_issecond,
bidi_mirrored = char.bidi_mirrored,
comp_exclusion = code in exclusions || code in excl_version,
ignorable = code in ignorable,
Expand Down Expand Up @@ -473,8 +429,7 @@ function c_uint16(seqindex)
end

function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
comb_index, comb_length, comb_issecond)
print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
write_c_index_array(io, sequences.storage, 8)
print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
Expand All @@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup

print(io, """
static const utf8proc_property_t utf8proc_properties[] = {
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
""")
for prop in deduplicated_props
print(io, " {",
Expand All @@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
c_uint16(prop.lowercase_seqindex), ", ",
c_uint16(prop.titlecase_seqindex), ", ",
c_uint16(prop.comb_index), ", ",
c_uint16(prop.comb_length), ", ",
prop.comb_issecond, ", ",
prop.bidi_mirrored, ", ",
prop.comp_exclusion, ", ",
prop.ignorable, ", ",
Expand All @@ -512,42 +469,30 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
end
print(io, "};\n\n")

print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
i = 0
for a in eachindex(comb1st_indices_firstoffsets)
offset = 0
print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
for b in eachindex(comb2nd_indices_sorted_keys)
dm1 = comb2nd_indices_sorted_keys[b]
if offset > comb1st_indices_lastoffsets[a]
break
end
if offset >= comb1st_indices_firstoffsets[a]
i += 1
if i == 8
i = 0
print(io, "\n ")
end
v = get(comb_array[a], b, 0)
if dm1 in comb2nd_indices_nonbasic
print(io, (v & 0xFFFF0000) >> 16, ", ")
end
print(io, v & 0xFFFF, ", ")
end
offset += 1
if dm1 in comb2nd_indices_nonbasic
offset += 1
end
print(io, "static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n")
for dm0 in sort!(collect(keys(comb_mapping)))
print(io, " ");
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
print(io, " ", dm1, ",")
end
print(io, "\n");
end
print(io, "};\n\n")

print(io, "static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n")
for dm0 in sort!(collect(keys(comb_mapping)))
print(io, " ");
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
code = comb_mapping[dm0][dm1]
print(io, " ", code, ",")
end
print(io, "\n")
print(io, "\n");
end
print(io, "};\n\n")
end


if !isinteractive()
print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
comb_index, comb_length, comb_issecond)
end

4 changes: 4 additions & 0 deletions test/printproperty.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ int main(int argc, char **argv)
" titlecase_mapping = %04x (seqindex %04x)\n"
" casefold = %s\n"
" comb_index = %d\n"
" comb_length = %d\n"
" comb_issecond = %d\n"
" bidi_mirrored = %d\n"
" comp_exclusion = %d\n"
" ignorable = %d\n"
Expand All @@ -51,6 +53,8 @@ int main(int argc, char **argv)
utf8proc_totitle(c), p->titlecase_seqindex,
(char *) map,
p->comb_index,
p->comb_length,
p->comb_issecond,
p->bidi_mirrored,
p->comp_exclusion,
p->ignorable,
Expand Down
Loading

0 comments on commit c8d815a

Please sign in to comment.