Skip to content

Commit

Permalink
Improve table search speed through lookups
Browse files Browse the repository at this point in the history
Prior to this change, table search had to do a binary search over
about 1000 entries, which resulted in around 10 memory loads on average.
In this commit, we reduce the search space by doing a pre-lookup in a
generated table to get a smaller (often zero-length) slice of the full
sorted range list. On average, this gives us just one entry of the range
list to perform binary search on, which reduces the average number of
memory loads to 2.
  • Loading branch information
indutny committed Jan 28, 2023
1 parent 07e6155 commit 0b2442c
Show file tree
Hide file tree
Showing 2 changed files with 314 additions and 16 deletions.
40 changes: 36 additions & 4 deletions scripts/unicode.py
Expand Up @@ -274,13 +274,29 @@ def emit_break_module(f, break_table, break_cats, name):
pub enum %sCat {
""" % (name, Name, Name))

max_lookup_value = 0x20000
lookup_range = 0x400
lookup_interval = round(max_lookup_value / lookup_range)

lookup_table = [0] * lookup_range
j = 0
for i in range(0, lookup_range):
lookup_from = i * lookup_interval
lookup_to = i * lookup_interval
while j < len(break_table):
(_, entry_to, _) = break_table[j]
if entry_to >= lookup_from:
break
j += 1
lookup_table[i] = j

break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], min: u32) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
Expand All @@ -293,7 +309,7 @@ def emit_break_module(f, break_table, break_cats, name):
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
if idx > 0 { r[idx-1].1 as u32 + 1 } else { min },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
%sC_Any,
)
Expand All @@ -302,10 +318,26 @@ def emit_break_module(f, break_table, break_cats, name):
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
let idx = c as usize / 0x%x;
let r = %s_cat_lookup.get(idx..(idx + 2)).map_or(
%d..%d,
|r| (r[0] as usize)..((r[1] + 1) as usize)
);
bsearch_range_value_table(c, &%s_cat_table[r], idx as u32 * 0x%x)
}
""" % (Name, Name, Name[0], name, Name, name))
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name, lookup_interval))

if len(break_table) <= 0xff:
lookup_type = "u8"
elif len(break_table) <= 0xffff:
lookup_type = "u16"
else:
lookup_type = "u32"

emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
pfun=lambda x: "%d" % x,
is_pub=False, is_const=True)

emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
Expand Down

0 comments on commit 0b2442c

Please sign in to comment.