Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve table search speed through lookups #112

Merged
merged 1 commit into from Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
62 changes: 57 additions & 5 deletions scripts/unicode.py
Expand Up @@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
pub enum %sCat {
""" % (name, Name, Name))

# We don't want the lookup table to be too large so choose a reasonable
# cutoff. 0x20000 is selected because most of the range table entries are
# within the interval of [0x0, 0x20000]
lookup_value_cutoff = 0x20000

# Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
lookup_table_len = 0x400

lookup_interval = round(lookup_value_cutoff / lookup_table_len)

# Lookup table is a mapping from `character code / lookup_interval` to
# the index in the range table that covers the `character code`.
lookup_table = [0] * lookup_table_len
j = 0
for i in range(0, lookup_table_len):
lookup_from = i * lookup_interval
while j < len(break_table):
(_, entry_to, _) = break_table[j]
if entry_to >= lookup_from:
break
j += 1
lookup_table[i] = j

break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }

fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand All @@ -293,19 +316,48 @@ def emit_break_module(f, break_table, break_cats, name):
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
%sC_Any,
)
}
}
}

pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
// Perform a quick O(1) lookup in a precomputed table to determine
// the slice of the range table to search in.
let lookup_interval = 0x%x;
let idx = (c as u32 / lookup_interval) as usize;
let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
// If the `idx` is outside of the precomputed table - use the slice
// starting from the last covered index in the precomputed table and
// ending with the length of the range table.
%d..%d,
|r| (r[0] as usize)..((r[1] + 1) as usize)
);

// Compute pessimistic default lower and upper bounds on the category.
// If character doesn't map to any range and there is no adjacent range
// in the table slice - these bounds has to apply.
let lower = idx as u32 * lookup_interval;
let upper = lower + lookup_interval - 1;
bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
}

""" % (Name, Name, Name[0], name, Name, name))
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))


if len(break_table) <= 0xff:
lookup_type = "u8"
elif len(break_table) <= 0xffff:
lookup_type = "u16"
else:
lookup_type = "u32"

emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
pfun=lambda x: "%d" % x,
is_pub=False, is_const=True)

emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
Expand Down