Skip to content

Commit

Permalink
Improve table search speed through lookups
Browse files Browse the repository at this point in the history
Prior to this change table search would have to do a binary search over
about 1000 entries which resulted in around 10 memory loads on average.
In this commit we reduce the search space by doing a pre-lookup in a
generated table to get a smaller (often zero-length) slice of the full
sorted range list. On average this gives us just one entry of the range
list to perform binary search on, which reduces the average number of
memory loads to 2.
  • Loading branch information
indutny committed Jan 30, 2023
1 parent 243af2c commit b4c9ce1
Show file tree
Hide file tree
Showing 2 changed files with 387 additions and 21 deletions.
62 changes: 57 additions & 5 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
pub enum %sCat {
""" % (name, Name, Name))

# We don't want the lookup table to be too large so choose a reasonable
# cutoff. 0x20000 is selected because most of the range table entries are
# within the interval of [0x0, 0x20000]
lookup_value_cutoff = 0x20000

# Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
lookup_table_len = 0x400

lookup_interval = round(lookup_value_cutoff / lookup_table_len)

# Lookup table is a mapping from `character code / lookup_interval` to
# the index in the range table that covers the `character code`.
lookup_table = [0] * lookup_table_len
j = 0
for i in range(0, lookup_table_len):
lookup_from = i * lookup_interval
while j < len(break_table):
(_, entry_to, _) = break_table[j]
if entry_to >= lookup_from:
break
j += 1
lookup_table[i] = j

break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand All @@ -293,19 +316,48 @@ def emit_break_module(f, break_table, break_cats, name):
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
%sC_Any,
)
}
}
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
// Perform a quick O(1) lookup in a precomputed table to determine
// the slice of the range table to search in.
let lookup_interval = 0x%x;
let idx = (c as u32 / lookup_interval) as usize;
let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
// If the `idx` is outside of the precomputed table - use the slice
// starting from the last covered index in the precomputed table and
// ending with the length of the range table.
%d..%d,
|r| (r[0] as usize)..((r[1] + 1) as usize)
);
// Compute pessimistic default lower and upper bounds on the category.
// If character doesn't map to any range and there is no adjacent range
// in the table slice - these bounds has to apply.
let lower = idx as u32 * lookup_interval;
let upper = lower + lookup_interval - 1;
bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
}
""" % (Name, Name, Name[0], name, Name, name))
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))


if len(break_table) <= 0xff:
lookup_type = "u8"
elif len(break_table) <= 0xffff:
lookup_type = "u16"
else:
lookup_type = "u32"

emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
pfun=lambda x: "%d" % x,
is_pub=False, is_const=True)

emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
Expand Down
Loading

0 comments on commit b4c9ce1

Please sign in to comment.