From f52636c11ae7562b572dcf8f29eb28d3c5f3e299 Mon Sep 17 00:00:00 2001 From: Fedor Indutny <238531+indutny@users.noreply.github.com> Date: Sat, 28 Jan 2023 10:06:19 -0800 Subject: [PATCH] Improve table search speed through lookups Prior to this change table search would have to do a binary search over about 1000 entries which resulted in around 10 memory loads on average. In this commit we reduce the search space by doing a pre-lookup in a generated table to get a smaller (often zero-length) slice of the full sorted range list. On average this gives us just one entry of the range list to perform binary search on, which reduces the average number of memory loads to 2. --- scripts/unicode.py | 82 ++++++++- src/tables.rs | 430 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 486 insertions(+), 26 deletions(-) diff --git a/scripts/unicode.py b/scripts/unicode.py index 7aed85e..c135b20 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name): pub enum %sCat { """ % (name, Name, Name)) + # We don't want the lookup table to be too large so choose a reasonable + # cutoff. 0x20000 is selected because most of the range table entries are + # within the interval of [0x0, 0x20000] + lookup_value_cutoff = 0x20000 + + # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`. + lookup_table_len = 0x400 + + lookup_interval = round(lookup_value_cutoff / lookup_table_len) + + # Lookup table is a mapping from `character code / lookup_interval` to + # the index in the range table that covers the `character code`. + lookup_table = [0] * lookup_table_len + j = 0 + for i in range(0, lookup_table_len): + lookup_from = i * lookup_interval + while j < len(break_table): + (_, entry_to, _) = break_table[j] + if entry_to >= lookup_from: + break + j += 1 + lookup_table[i] = j + break_cats.append("Any") break_cats.sort() for cat in break_cats: f.write((" %sC_" % Name[0]) + cat + ",\n") f.write(""" } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (Option, Option, %sCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -289,12 +312,12 @@ def emit_break_module(f, break_table, break_cats, name): }) { Ok(idx) => { let (lower, upper, cat) = r[idx]; - (lower as u32, upper as u32, cat) + (Some(lower as u32), Some(upper as u32), cat) } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { Some(r[idx-1].1 as u32 + 1) } else { None }, + r.get(idx).map(|c|c.0 as u32 - 1), %sC_Any, ) } @@ -302,10 +325,57 @@ def emit_break_module(f, break_table, break_cats, name): } pub fn %s_category(c: char) -> (u32, u32, %sCat) { - bsearch_range_value_table(c, %s_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x%x; + let idx = (c as u32 / lookup_interval) as usize; + let range = %s_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + %d..%d, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + let (lower, upper, cat) = bsearch_range_value_table(c, &%s_cat_table[range]); + + ( + lower.unwrap_or_else(|| { + if idx == 0 { + 0 + } else { + // Use an entry just before the lookup index to find the lower + // bound for Any category. + let i = *%s_cat_lookup.get(idx - 1).unwrap_or(&%d) as usize; + %s_cat_table[i].1 as u32 + } + }), + upper.unwrap_or_else(|| { + %s_cat_lookup.get(idx + 1).map_or_else(|| { + // If idx was outside of the lookup table - upper bound is either + // already found in the ranges table, or has to be a u32::MAX. + core::u32::MAX + }, |&i| { + // Otherwise use an entry just after the lookup index to find the + // lower bound for Any category. + %s_cat_table[i as usize].0 as u32 + }) + }), + cat + ) } -""" % (Name, Name, Name[0], name, Name, name)) +""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name, name, j, name, name, name)) + + if len(break_table) <= 0xff: + lookup_type = "u8" + elif len(break_table) <= 0xffff: + lookup_type = "u16" + else: + lookup_type = "u32" + + emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type, + pfun=lambda x: "%d" % x, + is_pub=False, is_const=True) emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name, pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]), diff --git a/src/tables.rs b/src/tables.rs index 5a811c9..8ee5bf1 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -365,7 +365,7 @@ pub mod grapheme { GC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (u32, u32, GraphemeCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> (Option, Option, GraphemeCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -374,12 +374,12 @@ pub mod grapheme { }) { Ok(idx) => { let (lower, upper, cat) = r[idx]; - (lower as u32, upper as u32, cat) + (Some(lower as u32), Some(upper as u32), cat) } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { Some(r[idx-1].1 as u32 + 1) } else { None }, + r.get(idx).map(|c|c.0 as u32 - 1), GC_Any, ) } @@ -387,9 +387,112 @@ pub mod grapheme { } pub fn grapheme_category(c: char) -> (u32, u32, GraphemeCat) { - bsearch_range_value_table(c, grapheme_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = grapheme_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 1443..1449, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + let (lower, upper, cat) = bsearch_range_value_table(c, &grapheme_cat_table[range]); + + ( + lower.unwrap_or_else(|| { + if idx == 0 { + 0 + } else { + // Use an entry just before the lookup index to find the lower + // bound for Any category. + let i = *grapheme_cat_lookup.get(idx - 1).unwrap_or(&1443) as usize; + grapheme_cat_table[i].1 as u32 + } + }), + upper.unwrap_or_else(|| { + grapheme_cat_lookup.get(idx + 1).map_or_else(|| { + // If idx was outside of the lookup table - upper bound is either + // already found in the ranges table, or has to be a u32::MAX. + core::u32::MAX + }, |&i| { + // Otherwise use an entry just after the lookup index to find the + // lower bound for Any category. + grapheme_cat_table[i as usize].0 as u32 + }) + }), + cat + ) } + const grapheme_cat_lookup: &'static [u16] = &[ + 0, 5, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 16, 21, 26, 29, 32, 37, 41, 53, 65, 75, 86, 97, + 106, 116, 131, 143, 153, 157, 161, 168, 173, 183, 188, 189, 191, 191, 191, 192, 192, 192, + 192, 192, 192, 192, 192, 198, 206, 209, 211, 219, 219, 232, 233, 242, 258, 262, 270, 270, + 271, 271, 271, 271, 271, 279, 280, 282, 284, 284, 284, 286, 290, 290, 291, 291, 295, 297, + 298, 313, 317, 317, 317, 318, 318, 318, 318, 322, 322, 322, 323, 324, 325, 325, 325, 325, + 325, 328, 329, 329, 329, 329, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, 331, + 331, 331, 331, 333, 335, 335, 335, 342, 347, 351, 360, 369, 379, 379, 386, 395, 405, 413, + 423, 431, 441, 450, 459, 469, 477, 487, 495, 505, 514, 523, 533, 541, 551, 559, 569, 578, + 587, 597, 605, 615, 623, 633, 642, 651, 661, 669, 679, 687, 697, 706, 715, 725, 733, 743, + 751, 761, 770, 779, 789, 797, 807, 815, 825, 834, 843, 853, 861, 871, 879, 889, 898, 907, + 917, 925, 935, 943, 953, 962, 971, 981, 989, 999, 1007, 1017, 1026, 1035, 1045, 1053, 1063, + 1071, 1081, 1090, 1099, 1109, 1117, 1127, 1135, 1145, 1154, 1163, 1173, 1181, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, + 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1186, 1187, 1187, 1187, 1187, 1187, 1187, + 1189, 1190, 1190, 1192, 1192, 1192, 1192, 1193, 1193, 1194, 1195, 1195, 1195, 1195, 1195, + 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1195, 1200, 1201, 1201, 1201, 1201, 1201, + 1202, 1202, 1202, 1204, 1205, 1206, 1212, 1221, 1227, 1236, 1244, 1247, 1260, 1260, 1267, + 1278, 1278, 1286, 1292, 1299, 1303, 1303, 1307, 1307, 1318, 1324, 1333, 1337, 1337, 1337, + 1342, 1349, 1355, 1361, 1361, 1363, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, 1372, + 1372, 1372, 1372, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1375, + 1375, 1375, 1375, 1375, 1375, 1375, 1375, 1376, 1377, 1377, 1377, 1377, 1377, 1377, 1377, + 1377, 1378, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, 1382, + 1382, 1382, 1382, 1382, 1382, 1382, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, + 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, + 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1384, 1386, 1386, + 1386, 1386, 1392, 1395, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, 1396, + 1396, 1396, 1396, 1396, 1396, 1399, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, 1402, + 1402, 1402, 1407, 1408, 1409, 1409, 1409, 1411, 1411, 1411, 1411, 1412, 1412, 1412, 1412, + 1412, 1412, 1412, 1412, 1413, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, 1414, + 1414, 1414, 1414, 1414, 1414, 1415, 1419, 1423, 1428, 1428, 1428, 1430, 1430, 1430, 1431, + 1431, 1432, 1433, 1434, 1435, 1438, 1440, 1442, 1442, 1442, 1443, 1443, 1443, 1443, 1443, + 1443, 1443, 1443, 1443, 1443 + ]; + const grapheme_cat_table: &'static [(char, char, GraphemeCat)] = &[ ('\u{0}', '\u{9}', GC_Control), ('\u{a}', '\u{a}', GC_LF), ('\u{b}', '\u{c}', GC_Control), ('\u{d}', '\u{d}', GC_CR), ('\u{e}', '\u{1f}', GC_Control), ('\u{7f}', '\u{9f}', @@ -1028,7 +1131,7 @@ pub mod word { WC_ZWJ, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (u32, u32, WordCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, WordCat)]) -> (Option, Option, WordCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1037,12 +1140,12 @@ pub mod word { }) { Ok(idx) => { let (lower, upper, cat) = r[idx]; - (lower as u32, upper as u32, cat) + (Some(lower as u32), Some(upper as u32), cat) } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { Some(r[idx-1].1 as u32 + 1) } else { None }, + r.get(idx).map(|c|c.0 as u32 - 1), WC_Any, ) } @@ -1050,9 +1153,106 @@ pub mod word { } pub fn word_category(c: char) -> (u32, u32, WordCat) { - bsearch_range_value_table(c, word_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = word_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 1050..1053, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + let (lower, upper, cat) = bsearch_range_value_table(c, &word_cat_table[range]); + + ( + lower.unwrap_or_else(|| { + if idx == 0 { + 0 + } else { + // Use an entry just before the lookup index to find the lower + // bound for Any category. + let i = *word_cat_lookup.get(idx - 1).unwrap_or(&1050) as usize; + word_cat_table[i].1 as u32 + } + }), + upper.unwrap_or_else(|| { + word_cat_lookup.get(idx + 1).map_or_else(|| { + // If idx was outside of the lookup table - upper bound is either + // already found in the ranges table, or has to be a u32::MAX. + core::u32::MAX + }, |&i| { + // Otherwise use an entry just after the lookup index to find the + // lower bound for Any category. + word_cat_table[i as usize].0 as u32 + }) + }), + cat + ) } + const word_cat_lookup: &'static [u16] = &[ + 0, 14, 22, 22, 22, 22, 24, 30, 36, 36, 38, 43, 55, 66, 78, 83, 93, 104, 111, 121, 143, 162, + 180, 198, 215, 231, 250, 266, 278, 282, 286, 295, 301, 308, 316, 316, 316, 321, 329, 333, + 336, 336, 336, 336, 336, 338, 342, 351, 354, 359, 365, 369, 370, 375, 378, 384, 391, 397, + 409, 409, 411, 411, 411, 420, 430, 449, 451, 464, 465, 465, 465, 465, 465, 465, 466, 466, + 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 466, 470, 476, 486, 487, + 487, 487, 487, 492, 496, 497, 500, 500, 501, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, 502, + 502, 502, 504, 504, 504, 511, 515, 515, 519, 529, 538, 544, 551, 559, 568, 574, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, + 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 578, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, + 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 581, 592, 593, 593, 593, 594, + 597, 609, 611, 620, 628, 634, 635, 636, 637, 637, 640, 644, 648, 648, 652, 655, 662, 662, + 662, 665, 668, 675, 678, 680, 682, 692, 696, 699, 700, 701, 703, 706, 706, 706, 710, 714, + 718, 726, 734, 744, 753, 759, 767, 785, 785, 791, 796, 796, 801, 805, 809, 811, 811, 813, + 815, 828, 835, 844, 848, 848, 848, 854, 857, 869, 875, 875, 877, 885, 886, 886, 886, 886, + 886, 886, 886, 886, 887, 888, 888, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, + 889, 889, 889, 889, 889, 889, 889, 889, 889, 889, 890, 890, 890, 890, 890, 890, 890, 890, + 890, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, + 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, 895, + 895, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, 896, + 896, 899, 903, 908, 909, 909, 909, 909, 909, 910, 910, 913, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, + 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 920, 923, 924, 924, 927, + 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, 927, + 927, 927, 927, 929, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, + 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, 933, + 933, 933, 933, 933, 933, 935, 935, 935, 935, 938, 941, 942, 942, 942, 942, 943, 951, 960, + 960, 960, 964, 968, 973, 973, 973, 973, 973, 976, 979, 979, 979, 979, 979, 979, 979, 979, + 979, 981, 981, 987, 988, 993, 993, 993, 998, 998, 998, 998, 1001, 1001, 1001, 1001, 1001, + 1001, 1005, 1005, 1007, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1039, + 1044, 1044, 1044, 1044, 1044, 1046, 1048, 1048, 1048, 1048, 1049, 1049, 1049, 1049, 1049, + 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1049, 1050, 1050, 1050, 1050, + 1050, 1050, 1050, 1050 + ]; + const word_cat_table: &'static [(char, char, WordCat)] = &[ ('\u{a}', '\u{a}', WC_LF), ('\u{b}', '\u{c}', WC_Newline), ('\u{d}', '\u{d}', WC_CR), ('\u{20}', '\u{20}', WC_WSegSpace), ('\u{22}', '\u{22}', WC_Double_Quote), ('\u{27}', @@ -1530,7 +1730,7 @@ pub mod emoji { EC_Extended_Pictographic, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (u32, u32, EmojiCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, EmojiCat)]) -> (Option, Option, EmojiCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1539,12 +1739,12 @@ pub mod emoji { }) { Ok(idx) => { let (lower, upper, cat) = r[idx]; - (lower as u32, upper as u32, cat) + (Some(lower as u32), Some(upper as u32), cat) } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { Some(r[idx-1].1 as u32 + 1) } else { None }, + r.get(idx).map(|c|c.0 as u32 - 1), EC_Any, ) } @@ -1552,9 +1752,92 @@ pub mod emoji { } pub fn emoji_category(c: char) -> (u32, u32, EmojiCat) { - bsearch_range_value_table(c, emoji_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = emoji_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 77..78, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + let (lower, upper, cat) = bsearch_range_value_table(c, &emoji_cat_table[range]); + + ( + lower.unwrap_or_else(|| { + if idx == 0 { + 0 + } else { + // Use an entry just before the lookup index to find the lower + // bound for Any category. + let i = *emoji_cat_lookup.get(idx - 1).unwrap_or(&77) as usize; + emoji_cat_table[i].1 as u32 + } + }), + upper.unwrap_or_else(|| { + emoji_cat_lookup.get(idx + 1).map_or_else(|| { + // If idx was outside of the lookup table - upper bound is either + // already found in the ranges table, or has to be a u32::MAX. + core::u32::MAX + }, |&i| { + // Otherwise use an entry just after the lookup index to find the + // lower bound for Any category. + emoji_cat_table[i as usize].0 as u32 + }) + }), + cat + ) } + const emoji_cat_lookup: &'static [u8] = &[ + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 8, 8, 8, 10, 14, 14, 15, 15, 19, 21, 22, 37, 41, 41, 41, 42, 42, 42, 42, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 48, 48, 48, 48, 48, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 55, 58, 63, 63, 63, 64, 64, 64, 65, 65, 66, 67, + 68, 69, 72, 74, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77 + ]; + const emoji_cat_table: &'static [(char, char, EmojiCat)] = &[ ('\u{a9}', '\u{a9}', EC_Extended_Pictographic), ('\u{ae}', '\u{ae}', EC_Extended_Pictographic), ('\u{203c}', '\u{203c}', EC_Extended_Pictographic), ('\u{2049}', @@ -1633,7 +1916,7 @@ pub mod sentence { SC_Upper, } - fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (u32, u32, SentenceCat) { + fn bsearch_range_value_table(c: char, r: &'static [(char, char, SentenceCat)]) -> (Option, Option, SentenceCat) { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } @@ -1642,12 +1925,12 @@ pub mod sentence { }) { Ok(idx) => { let (lower, upper, cat) = r[idx]; - (lower as u32, upper as u32, cat) + (Some(lower as u32), Some(upper as u32), cat) } Err(idx) => { ( - if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 }, - r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX), + if idx > 0 { Some(r[idx-1].1 as u32 + 1) } else { None }, + r.get(idx).map(|c|c.0 as u32 - 1), SC_Any, ) } @@ -1655,9 +1938,116 @@ pub mod sentence { } pub fn sentence_category(c: char) -> (u32, u32, SentenceCat) { - bsearch_range_value_table(c, sentence_cat_table) + // Perform a quick O(1) lookup in a precomputed table to determine + // the slice of the range table to search in. + let lookup_interval = 0x80; + let idx = (c as u32 / lookup_interval) as usize; + let range = sentence_cat_lookup.get(idx..(idx + 2)).map_or( + // If the `idx` is outside of the precomputed table - use the slice + // starting from the last covered index in the precomputed table and + // ending with the length of the range table. + 2410..2421, + |r| (r[0] as usize)..((r[1] + 1) as usize) + ); + let (lower, upper, cat) = bsearch_range_value_table(c, &sentence_cat_table[range]); + + ( + lower.unwrap_or_else(|| { + if idx == 0 { + 0 + } else { + // Use an entry just before the lookup index to find the lower + // bound for Any category. + let i = *sentence_cat_lookup.get(idx - 1).unwrap_or(&2410) as usize; + sentence_cat_table[i].1 as u32 + } + }), + upper.unwrap_or_else(|| { + sentence_cat_lookup.get(idx + 1).map_or_else(|| { + // If idx was outside of the lookup table - upper bound is either + // already found in the ranges table, or has to be a u32::MAX. + core::u32::MAX + }, |&i| { + // Otherwise use an entry just after the lookup index to find the + // lower bound for Any category. + sentence_cat_table[i as usize].0 as u32 + }) + }), + cat + ) } + const sentence_cat_lookup: &'static [u16] = &[ + 0, 19, 31, 154, 247, 314, 323, 333, 375, 409, 528, 579, 588, 599, 612, 618, 629, 643, 650, + 661, 683, 702, 720, 738, 755, 771, 790, 806, 818, 825, 840, 850, 856, 871, 882, 882, 882, + 887, 895, 901, 904, 904, 904, 904, 904, 907, 912, 922, 928, 937, 943, 950, 953, 959, 964, + 973, 980, 988, 1000, 1000, 1002, 1130, 1249, 1267, 1288, 1308, 1311, 1336, 1340, 1340, 1340, + 1342, 1342, 1342, 1344, 1344, 1344, 1344, 1344, 1346, 1348, 1348, 1348, 1348, 1351, 1351, + 1351, 1351, 1351, 1369, 1476, 1482, 1492, 1501, 1501, 1501, 1501, 1512, 1517, 1518, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, + 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1521, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, 1522, + 1522, 1522, 1522, 1522, 1525, 1525, 1525, 1580, 1613, 1696, 1769, 1780, 1790, 1797, 1808, + 1819, 1836, 1843, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, 1849, + 1849, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, + 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1852, 1853, 1854, 1864, 1865, 1865, + 1865, 1867, 1870, 1886, 1888, 1905, 1913, 1919, 1920, 1921, 1922, 1922, 1925, 1929, 1933, + 1935, 1939, 1942, 1949, 1949, 1949, 1952, 1957, 1964, 1967, 1969, 1971, 1982, 1986, 1989, + 1990, 1991, 1993, 1996, 1996, 1996, 2000, 2005, 2010, 2019, 2028, 2039, 2051, 2059, 2068, + 2086, 2086, 2093, 2098, 2098, 2105, 2110, 2114, 2119, 2119, 2121, 2124, 2139, 2146, 2156, + 2161, 2161, 2161, 2168, 2171, 2183, 2189, 2189, 2192, 2201, 2202, 2202, 2202, 2202, 2202, + 2202, 2202, 2202, 2203, 2204, 2204, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, + 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2205, 2206, 2206, 2206, + 2206, 2206, 2206, 2206, 2206, 2206, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, + 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, + 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2211, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, + 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2212, 2216, 2221, 2228, 2229, 2229, 2229, + 2229, 2229, 2231, 2232, 2235, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, + 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2242, 2243, 2243, 2243, 2243, 2243, 2243, 2243, + 2243, 2243, 2243, 2244, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, + 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2245, 2248, 2248, + 2248, 2253, 2253, 2253, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, + 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2254, 2256, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, 2261, + 2261, 2263, 2263, 2263, 2263, 2266, 2269, 2270, 2270, 2270, 2270, 2275, 2288, 2300, 2305, + 2310, 2316, 2322, 2330, 2330, 2330, 2330, 2330, 2333, 2337, 2337, 2337, 2337, 2337, 2337, + 2337, 2337, 2337, 2341, 2341, 2347, 2348, 2353, 2353, 2353, 2358, 2358, 2358, 2358, 2361, + 2361, 2361, 2361, 2361, 2361, 2365, 2365, 2367, 2372, 2372, 2372, 2372, 2372, 2372, 2372, + 2372, 2372, 2372, 2400, 2405, 2405, 2405, 2405, 2405, 2407, 2408, 2408, 2408, 2408, 2408, + 2408, 2408, 2408, 2408, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, 2409, + 2410, 2410, 2410, 2410, 2410, 2410, 2410, 2410 + ]; + const sentence_cat_table: &'static [(char, char, SentenceCat)] = &[ ('\u{9}', '\u{9}', SC_Sp), ('\u{a}', '\u{a}', SC_LF), ('\u{b}', '\u{c}', SC_Sp), ('\u{d}', '\u{d}', SC_CR), ('\u{20}', '\u{20}', SC_Sp), ('\u{21}', '\u{21}', SC_STerm), ('\u{22}',