Skip to content

Commit

Permalink
Add more canonical equivalence tests
Browse files Browse the repository at this point in the history
Test that all canonically equivalent sequences
in Unicode's `NormalizationTest.txt` have the same width.
Currently no changes need to be made to the width logic
to ensure these tests pass. However, Unicode 16
is adding a few new characters that will be problematic
(the Kirat Rai vowel signs:
<https://www.unicode.org/charts/PDF/Unicode-16.0/U160-16D40.pdf>).
Adding this test in advance ensures that we won't forget
to account for these changes when the time comes.
  • Loading branch information
Jules-Bertholet committed Apr 27, 2024
1 parent 8092f84 commit eec13fa
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 14 deletions.
21 changes: 9 additions & 12 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,14 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen tables
run: cd scripts && python3 unicode.py
- name: Diff tables
run: diff src/tables.rs scripts/tables.rs
- name: Build
run: cargo build --verbose
- name: Run tests
Expand All @@ -28,14 +35,4 @@ jobs:
run: cargo fmt --check
- name: Check clippy
run: cargo clippy --lib --tests
regen:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Regen
run: cd scripts && python3 unicode.py
- name: Diff
run: diff src/tables.rs scripts/tables.rs

7 changes: 5 additions & 2 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ class OffsetType(enum.IntEnum):

def fetch_open(filename: str):
"""Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
fetches it from `https://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.
"""
basename = os.path.basename(filename)
if not os.path.exists(basename):
os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
os.system(f"curl -O https://www.unicode.org/Public/UNIDATA/{filename}")
try:
return open(basename, encoding="utf-8")
except OSError:
Expand Down Expand Up @@ -677,6 +677,9 @@ def main(module_filename: str):
emoji_variations = load_variation_sequences()
variation_table = make_variation_sequence_table(emoji_variations, width_map)

# Download normalization test file for use by tests
fetch_open("NormalizationTest.txt")

print("------------------------")
total_size = 0
for i, table in enumerate(tables):
Expand Down
37 changes: 37 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::{
fs::File,
io::{BufRead, BufReader},
};

use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

#[test]
Expand Down Expand Up @@ -149,6 +154,38 @@ fn test_canonical_equivalence() {
}
}

/// Checks that canonically equivalent sequences have the same width.
///
/// Each data line of Unicode's `NormalizationTest.txt` has the form
/// `source;NFC;NFD;NFKC;NFKD; # comment`, where every field is a
/// space-separated list of hexadecimal code points. The source sequence, its
/// NFC form and its NFD form are canonically equivalent, so all three must
/// report the same width. (The NFKC/NFKD columns are only compatibility
/// equivalent and are deliberately not compared here.)
///
/// Requires `NormalizationTest.txt` to be present in the `scripts/` directory.
/// Run the `unicode.py` script to download it.
#[test]
fn test_canonical_equivalence_2() {
    // Decodes one field (space-separated hexadecimal code points) into a `String`.
    fn decode_field(field: &str) -> String {
        field
            .split(' ')
            .map(|hex| {
                let code_point = u32::from_str_radix(hex, 16)
                    .unwrap_or_else(|e| panic!("invalid hex code point {hex:?}: {e}"));
                char::try_from(code_point)
                    .unwrap_or_else(|e| panic!("U+{code_point:04X} is not a scalar value: {e}"))
            })
            .collect()
    }

    let norm_file = BufReader::new(
        File::open("scripts/NormalizationTest.txt")
            .expect("run `unicode.py` first to download `NormalizationTest.txt`"),
    );
    for (index, line) in norm_file.lines().enumerate() {
        let line_no = index + 1;
        let line = line.unwrap_or_else(|e| panic!("failed to read line {line_no}: {e}"));
        // Skip blank lines, `#` comments and `@`-prefixed part headers.
        if line.is_empty() || line.starts_with('#') || line.starts_with('@') {
            continue;
        }

        // Field order in the data file is: source, NFC, NFD, NFKC, NFKD.
        let mut fields = line.split(';');
        let mut next_form = |name: &str| -> String {
            let field = fields
                .next()
                .unwrap_or_else(|| panic!("line {line_no}: missing {name} field"));
            decode_field(field)
        };
        let source = next_form("source");
        let nfc = next_form("NFC");
        let nfd = next_form("NFD");

        assert_eq!(
            source.width(),
            nfc.width(),
            "line {line_no}: width of {source:?} differs from its NFC form {nfc:?}"
        );
        assert_eq!(
            source.width(),
            nfd.width(),
            "line {line_no}: width of {source:?} differs from its NFD form {nfd:?}"
        );
    }
}

#[test]
fn test_emoji_presentation() {
assert_eq!(UnicodeWidthChar::width('\u{0023}'), Some(1));
Expand Down

0 comments on commit eec13fa

Please sign in to comment.