Skip to content

Commit

Permalink
Spuriously treat certain always-wide characters as eligible for emoji…
Browse files Browse the repository at this point in the history
… presentation
  • Loading branch information
Jules-Bertholet committed Mar 5, 2024
1 parent ae0dbd9 commit 7ca0bd6
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 29 deletions.
27 changes: 17 additions & 10 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,17 +413,17 @@ def make_variation_sequence_table(
) -> "tuple[list[int], list[list[int]]]":
"""Generates 2-level lookup table for whether a codepoint might start an emoji presentation sequence.
(Characters that are always wide may be excluded.)
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB."""
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
"""

prefixes_dict = defaultdict(list)
prefixes_dict = defaultdict(set)
for cp in seqs:
prefixes_dict[cp >> 10].append(cp & 0x3FF)
prefixes_dict[cp >> 10].add(cp & 0x3FF)

# We don't strictly need to keep track of characters that are always wide,
# because being in an emoji variation seq won't affect their width.
# So store their info only when it wouldn't inflate the size of the tables.
keys = list(prefixes_dict.keys())
for k in keys:
for k in list(prefixes_dict.keys()):
if all(
map(
lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE,
Expand All @@ -432,7 +432,14 @@ def make_variation_sequence_table(
):
del prefixes_dict[k]

print(prefixes_dict)
indexes = list(prefixes_dict.keys())

# Similarly, we can spuriously return `true` for always-wide characters
# even if not part of a presentation seq; this saves an additional lookup,
# so we should do it where there is no size cost.
for cp, width in enumerate(width_map):
if width == EffectiveWidth.WIDE and (cp >> 10) in indexes:
prefixes_dict[cp >> 10].add(cp & 0x3FF)

leaves = []
for cps in prefixes_dict.values():
Expand All @@ -441,7 +448,7 @@ def make_variation_sequence_table(
idx_in_leaf, bit_shift = divmod(cp, 8)
leaf[idx_in_leaf] |= 1 << bit_shift
leaves.append(leaf)
return (list(prefixes_dict.keys()), leaves)
return (indexes, leaves)


def emit_module(
Expand Down Expand Up @@ -530,12 +537,12 @@ def emit_module(
variation_idx, variation_leaves = variation_table

module.write(
f"""
"""
/// Whether this character forms an [emoji presentation sequence]
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\\u{{FEOF}}'`.
/// Emoji presentation sequences are considered to have width 2.
/// This may spuriously return `false` for all characters that are always wide.
/// This may spuriously return `true` or `false` for characters that are always wide.
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {{
let cp: u32 = c.into();
Expand All @@ -550,7 +557,7 @@ def emit_module(
module.write(f" {msbs} => {i},\n")

module.write(
f""" _ => return false,
""" _ => return false,
}};
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
Expand Down
38 changes: 19 additions & 19 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub mod charwidth {
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\u{FE0F}'`.
/// Emoji presentation sequences are considered to have width 2.
/// This may spuriously return `false` for all characters that are always wide.
/// This may spuriously return `true` or `false` for characters that are always wide.
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {
let cp: u32 = c.into();
Expand Down Expand Up @@ -591,7 +591,7 @@ pub mod charwidth {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0C, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0C, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0xFE,
0x0F, 0x07,
],
Expand Down Expand Up @@ -621,27 +621,27 @@ pub mod charwidth {
],
[
0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x80,
0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xE0,
0x20, 0x10, 0xF2, 0x1F, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21,
0x00, 0x00, 0xC8, 0xCE, 0x80, 0x70, 0x00, 0x00, 0x54, 0x7C, 0xF0, 0xFF, 0x01, 0x20,
0xA8, 0x00,
0x00, 0x00, 0x00, 0x00, 0x03, 0xC0, 0x00, 0x40, 0xFE, 0x07, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0x0F, 0xFF, 0x01, 0x03, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xCF, 0xCE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xB9, 0xFF,
],
[
0x00, 0x01, 0x20, 0x80, 0x40, 0x00, 0x00, 0x80, 0xC6, 0x63, 0x08, 0x00, 0x00, 0x04,
0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x09, 0x88, 0x00, 0x08, 0x00, 0x84,
0x70, 0x3C, 0x80, 0x2E, 0x00, 0x21, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x03, 0x80, 0x3C, 0x01, 0x00, 0x20, 0x01, 0x06, 0x10,
0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0x04, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x20, 0x12, 0x01,
0x00, 0x20, 0x04, 0x16, 0x00, 0xE8, 0x00, 0x00, 0x3F, 0x02, 0x09, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xBF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0x00, 0x7E,
0xFF, 0xFF, 0xFF, 0x80, 0xF9, 0x07, 0x80, 0x3C, 0x61, 0x00, 0x30, 0x01, 0x06, 0x10,
0x1C, 0x00, 0x0E, 0x70, 0x0A, 0x81, 0x08, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0x3F, 0xF8, 0xE7, 0xF0, 0x3F, 0x1A, 0xF9, 0x1F, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x0F,
0x01, 0x00,
],
];
}

0 comments on commit 7ca0bd6

Please sign in to comment.