Skip to content

Commit

Permalink
Optimize table layout
Browse files Browse the repository at this point in the history
  • Loading branch information
eschnett committed Dec 29, 2024
1 parent adb0f44 commit ecd7d1b
Show file tree
Hide file tree
Showing 4 changed files with 809 additions and 971 deletions.
16 changes: 14 additions & 2 deletions data/data_generator.jl
Original file line number Diff line number Diff line change
Expand Up @@ -469,12 +469,24 @@ function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, dedup
end
print(io, "};\n\n")

print(io, "static const utf8proc_uint32_t utf8proc_combinations[][2] = {\n")
print(io, "static const utf8proc_uint32_t utf8proc_combinations_second[] = {\n")
for dm0 in sort!(collect(keys(comb_mapping)))
print(io, " ");
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
print(io, " ", dm1, ",")
end
print(io, "\n");
end
print(io, "};\n\n")

print(io, "static const utf8proc_uint32_t utf8proc_combinations_combined[] = {\n")
for dm0 in sort!(collect(keys(comb_mapping)))
print(io, " ");
for dm1 in sort!(collect(keys(comb_mapping[dm0])))
code = comb_mapping[dm0][dm1]
print(io, " {", dm1, ", ", code, "},\n")
print(io, " ", code, ",")
end
print(io, "\n");
end
print(io, "};\n\n")
end
Expand Down
6 changes: 3 additions & 3 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -687,18 +687,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *b
int idx = starter_property->comb_index;
if (idx < 0x3FF && current_property->comb_issecond) {
int len = starter_property->comb_length;
utf8proc_uint32_t max_second = utf8proc_combinations[idx + len - 1][0];
utf8proc_uint32_t max_second = utf8proc_combinations_second[idx + len - 1];
if (current_char <= max_second) {
// TODO: binary search? arithmetic search?
for (int off = 0; off < len; ++off) {
utf8proc_uint32_t second = utf8proc_combinations[idx + off][0];
utf8proc_uint32_t second = utf8proc_combinations_second[idx + off];
if (current_char < second) {
/* not found */
break;
}
if (current_char == second) {
/* found */
utf8proc_uint32_t composition = utf8proc_combinations[idx + off][1];
utf8proc_uint32_t composition = utf8proc_combinations_combined[idx + off];
*starter = composition;
starter_property = NULL;
break;
Expand Down
10 changes: 6 additions & 4 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,12 @@ typedef struct utf8proc_property_struct {
* combining pair, and for most, there are only a handful of
* possible second characters.
*
* The combining table is stored as `utf8proc_uint32_t
* utf8proc_combinations[][2]`. That is, it contains a pair `(second
* combining character, combined character)` for every character
* that can be a first combining character.
* The combining table is stored as a sparse matrix in the CSR
* (compressed sparse row) format. That is, it is stored as two
* arrays, `utf8proc_uint32_t utf8proc_combinations_second[]` and
* `utf8proc_uint32_t utf8proc_combinations_combined[]`. These
* contain, respectively, the second combining character and the
* combined character of every combining pair.
*
* - `comb_index`: Index into the combining table if this character
* is the first character in a combining pair, else 0x3ff
Expand Down
Loading

0 comments on commit ecd7d1b

Please sign in to comment.