Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Redesign combining table #277

Merged
merged 8 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ include (utils.cmake)

disallow_intree_builds()

project (utf8proc VERSION 2.9.0 LANGUAGES C)
project (utf8proc VERSION 2.10.0 LANGUAGES C)

# This is the ABI version number, which may differ from the
# API version number (defined in utf8proc.h and above).
eschnett marked this conversation as resolved.
Show resolved Hide resolved
Expand Down Expand Up @@ -65,7 +65,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 15.1.0)
set(UNICODE_VERSION 16.0.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 15.1.0.
The Unicode version supported is 16.0.0.

For Unicode normalizations, the following options are used:

Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
$(JULIA) --project=. data_generator.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=15.1.0
UNICODE_VERSION=16.0.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
67 changes: 7 additions & 60 deletions data/Manifest.toml
Original file line number Diff line number Diff line change
@@ -1,69 +1,16 @@
# This file is machine-generated - editing it directly is not advised

julia_version = "1.9.3"
julia_version = "1.11.2"
manifest_format = "2.0"
project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.7.2"

[deps.Adapt.extensions]
AdaptStaticArraysExt = "StaticArrays"

[deps.Adapt.weakdeps]
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"

[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.0.5+0"

[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.OffsetArrays]]
deps = ["Adapt"]
git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
git-tree-sha1 = "5e1897147d1ff8d98883cda2be2187dcf57d8f0c"
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
version = "1.12.10"

[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.21+4"

[[deps.Random]]
deps = ["SHA", "Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[[deps.Requires]]
deps = ["UUIDs"]
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
version = "1.3.0"

[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
version = "1.15.0"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[deps.OffsetArrays.extensions]
OffsetArraysAdaptExt = "Adapt"

[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.8.0+0"
[deps.OffsetArrays.weakdeps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
129 changes: 31 additions & 98 deletions data/data_generator.jl
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,8 @@ let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
width = 1
elseif code == 0x2028 || code == 0x2029
#By definition, should have zero width (on the same line)
#0x002028 '' category: Zl name: LINE SEPARATOR/
#0x002029 '' category: Zp name: PARAGRAPH SEPARATOR/
#0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
#0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
width = 0
end

Expand All @@ -256,79 +256,33 @@ end
# decompressed on the C side at runtime.

# Inverse decomposition mapping tables for combining two characters into a single one.
comb1st_indices = Dict{UInt32,Int}()
comb1st_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices = Dict{UInt32,Int}()
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices_nonbasic = Set{UInt32}()
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
comb_issecond = Set{UInt32}()
for char in char_props
# What happens with decompositions that are longer than 2?
if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
char_hash[char.decomp_mapping[1]].combining_class == 0 &&
char.code ∉ exclusions
(char.code ∉ exclusions && char.code ∉ excl_version)
dm0 = char.decomp_mapping[1]
dm1 = char.decomp_mapping[2]
if !haskey(comb1st_indices, dm0)
comb1st_indices[dm0] = length(comb1st_indices)
push!(comb1st_indices_sorted_keys, dm0)
push!(comb_array, Dict{Int,UInt32}())
@assert length(comb1st_indices) == length(comb_array)
end
if !haskey(comb2nd_indices, dm1)
push!(comb2nd_indices_sorted_keys, dm1)
comb2nd_indices[dm1] = length(comb2nd_indices)
end
@assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
if char.code > 0xFFFF
push!(comb2nd_indices_nonbasic, dm1)
if !haskey(comb_mapping, dm0)
comb_mapping[dm0] = Dict{UInt32, UInt32}()
end
comb_mapping[dm0][dm1] = char.code
push!(comb_issecond, dm1)
end
end

comb_indices = Dict{UInt32,Int}()
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb_index = Dict{UInt32, UInt32}()
comb_length = Dict{UInt32, UInt32}()
let
cumoffset = 0
for dm0 in comb1st_indices_sorted_keys
index = comb1st_indices[dm0]
first = nothing
last = nothing
offset = 0
for b in eachindex(comb2nd_indices_sorted_keys)
dm1 = comb2nd_indices_sorted_keys[b]
if haskey(comb_array[index], b)
if isnothing(first)
first = offset
end
last = offset
if dm1 in comb2nd_indices_nonbasic
last += 1
end
end
offset += 1
if dm1 in comb2nd_indices_nonbasic
offset += 1
end
end
comb1st_indices_firstoffsets[index] = first
comb1st_indices_lastoffsets[index] = last
@assert !haskey(comb_indices, dm0)
comb_indices[dm0] = cumoffset
cumoffset += last - first + 1 + 2
end

offset = 0
for dm1 in comb2nd_indices_sorted_keys
@assert !haskey(comb_indices, dm1)
comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
@assert comb2nd_indices[dm1] + offset <= 0x4000
if dm1 in comb2nd_indices_nonbasic
comb_indices[dm1] |= 0x4000
offset += 1
end
ind = 0
for dm0 in sort!(collect(keys(comb_mapping)))
comb_index[dm0] = ind
len = length(comb_mapping[dm0])
comb_length[dm0] = len
ind += len
end
end

Expand Down Expand Up @@ -391,7 +345,9 @@ function char_table_properties!(sequences, char)
uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
comb_index = get(comb_indices, code, typemax(UInt16)),
comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
comb_length = get(comb_length, code, 0),
comb_issecond = code in comb_issecond,
bidi_mirrored = char.bidi_mirrored,
comp_exclusion = code in exclusions || code in excl_version,
ignorable = code in ignorable,
Expand Down Expand Up @@ -473,8 +429,7 @@ function c_uint16(seqindex)
end

function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
comb_index, comb_length, comb_issecond)
print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
write_c_index_array(io, sequences.storage, 8)
print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
Expand All @@ -484,7 +439,7 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup

print(io, """
static const utf8proc_property_t utf8proc_properties[] = {
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
{0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
""")
for prop in deduplicated_props
print(io, " {",
Expand All @@ -498,6 +453,8 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
c_uint16(prop.lowercase_seqindex), ", ",
c_uint16(prop.titlecase_seqindex), ", ",
c_uint16(prop.comb_index), ", ",
c_uint16(prop.comb_length), ", ",
prop.comb_issecond, ", ",
prop.bidi_mirrored, ", ",
prop.comp_exclusion, ", ",
prop.ignorable, ", ",
Expand All @@ -512,42 +469,18 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
end
print(io, "};\n\n")

print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n ")
i = 0
for a in eachindex(comb1st_indices_firstoffsets)
offset = 0
print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
for b in eachindex(comb2nd_indices_sorted_keys)
dm1 = comb2nd_indices_sorted_keys[b]
if offset > comb1st_indices_lastoffsets[a]
break
end
if offset >= comb1st_indices_firstoffsets[a]
i += 1
if i == 8
i = 0
print(io, "\n ")
end
v = get(comb_array[a], b, 0)
if dm1 in comb2nd_indices_nonbasic
print(io, (v & 0xFFFF0000) >> 16, ", ")
end
print(io, v & 0xFFFF, ", ")
end
offset += 1
if dm1 in comb2nd_indices_nonbasic
offset += 1
end
print(io, "static const utf8proc_uint32_t utf8proc_combinations[][2] = {\n")
for dm0 in sort!(collect(keys(comb_mapping)))
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
code = comb_mapping[dm0][dm1]
print(io, " {", dm1, ", ", code, "},\n")
end
print(io, "\n")
end
print(io, "};\n\n")
end


if !isinteractive()
print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
comb_index, comb_length, comb_issecond)
end

Loading