Skip to content

Commit

Permalink
syntax: regenerate tables for Unicode 11
Browse files Browse the repository at this point in the history
This adds `scripts/generate.py`, and uses it to regenerate all tables
with data from Unicode 11.0.0.  This also restores the character tests
that were first added in #400, with a new one for 11.
  • Loading branch information
cuviper authored and BurntSushi committed Jun 12, 2018
1 parent eeffc7f commit 5eaff67
Show file tree
Hide file tree
Showing 12 changed files with 3,723 additions and 3,328 deletions.
11 changes: 9 additions & 2 deletions regex-syntax/src/lib.rs
Expand Up @@ -213,8 +213,15 @@ mod tests {
assert!(is_word_byte(b'a'));
assert!(!is_word_byte(b'-'));

assert!(is_word_character('a'));
assert!(is_word_character('β'));
assert!(is_word_character('a'), "ASCII");
assert!(is_word_character('à'), "Latin-1");
assert!(is_word_character('β'), "Greek");
assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
assert!(!is_word_character('-'));
assert!(!is_word_character('☃'));
}
Expand Down
1 change: 1 addition & 0 deletions regex-syntax/src/unicode.rs
Expand Up @@ -346,6 +346,7 @@ fn ages(canonical_age: &str) -> Result<AgeIter> {
("V8_0", age::V8_0),
("V9_0", age::V9_0),
("V10_0", age::V10_0),
("V11_0", age::V11_0),
];
assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");

Expand Down
43 changes: 37 additions & 6 deletions regex-syntax/src/unicode_tables/age.rs
@@ -1,15 +1,15 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate age tmp/ucd-10.0.0/ --chars
// ucd-generate age tmp/ucd-11.0.0/ --chars
//
// ucd-generate is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
("V10_0", V10_0), ("V1_1", V1_1), ("V2_0", V2_0), ("V2_1", V2_1),
("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2), ("V4_0", V4_0),
("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1), ("V5_2", V5_2),
("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2), ("V6_3", V6_3),
("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0),
("V10_0", V10_0), ("V11_0", V11_0), ("V1_1", V1_1), ("V2_0", V2_0),
("V2_1", V2_1), ("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2),
("V4_0", V4_0), ("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1),
("V5_2", V5_2), ("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2),
("V6_3", V6_3), ("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0),
];

pub const V10_0: &'static [(char, char)] = &[
Expand All @@ -25,6 +25,37 @@ pub const V10_0: &'static [(char, char)] = &[
('🥟', '🥫'), ('🦒', '🦗'), ('🧐', '🧦'), ('𬺰', '𮯠'),
];

// Character ranges listed under the `V11_0` age name (see `BY_NAME`).
//
// Each entry is a `(start, end)` pair of `char`s; single-character entries
// repeat the same value on both sides, so the pairs read as inclusive ranges.
// The entries are written in ascending order.
//
// NOTE(review): per the file header this table is generated output; a
// hand-written comment here would presumably be lost on regeneration, so
// confirm whether it belongs in the generator instead.
pub const V11_0: &'static [(char, char)] = &[
('\u{560}', '\u{560}'), ('\u{588}', '\u{588}'), ('\u{5ef}', '\u{5ef}'),
('\u{7fd}', '\u{7ff}'), ('\u{8d3}', '\u{8d3}'), ('\u{9fe}', '\u{9fe}'),
('\u{a76}', '\u{a76}'), ('\u{c04}', '\u{c04}'), ('\u{c84}', '\u{c84}'),
('\u{1878}', '\u{1878}'), ('\u{1c90}', '\u{1cba}'),
('\u{1cbd}', '\u{1cbf}'), ('\u{2bba}', '\u{2bbc}'),
('\u{2bd3}', '\u{2beb}'), ('\u{2bf0}', '\u{2bfe}'),
('\u{2e4a}', '\u{2e4e}'), ('\u{312f}', '\u{312f}'),
('\u{9feb}', '\u{9fef}'), ('\u{a7af}', '\u{a7af}'),
('\u{a7b8}', '\u{a7b9}'), ('\u{a8fe}', '\u{a8ff}'),
('\u{10a34}', '\u{10a35}'), ('\u{10a48}', '\u{10a48}'),
('\u{10d00}', '\u{10d27}'), ('\u{10d30}', '\u{10d39}'),
('\u{10f00}', '\u{10f27}'), ('\u{10f30}', '\u{10f59}'),
('\u{110cd}', '\u{110cd}'), ('\u{11144}', '\u{11146}'),
('\u{1133b}', '\u{1133b}'), ('\u{1145e}', '\u{1145e}'),
('\u{1171a}', '\u{1171a}'), ('\u{11800}', '\u{1183b}'),
('\u{11a9d}', '\u{11a9d}'), ('\u{11d60}', '\u{11d65}'),
('\u{11d67}', '\u{11d68}'), ('\u{11d6a}', '\u{11d8e}'),
('\u{11d90}', '\u{11d91}'), ('\u{11d93}', '\u{11d98}'),
('\u{11da0}', '\u{11da9}'), ('\u{11ee0}', '\u{11ef8}'),
('\u{16e40}', '\u{16e9a}'), ('\u{187ed}', '\u{187f1}'),
('\u{1d2e0}', '\u{1d2f3}'), ('\u{1d372}', '\u{1d378}'),
('\u{1ec71}', '\u{1ecb4}'), ('\u{1f12f}', '\u{1f12f}'),
('\u{1f6f9}', '\u{1f6f9}'), ('\u{1f7d5}', '\u{1f7d8}'),
('\u{1f94d}', '\u{1f94f}'), ('\u{1f96c}', '\u{1f970}'),
('\u{1f973}', '\u{1f976}'), ('\u{1f97a}', '\u{1f97a}'),
('\u{1f97c}', '\u{1f97f}'), ('\u{1f998}', '\u{1f9a2}'),
('\u{1f9b0}', '\u{1f9b9}'), ('\u{1f9c1}', '\u{1f9c2}'),
('\u{1f9e7}', '\u{1f9ff}'), ('\u{1fa60}', '\u{1fa6d}'),
];

pub const V1_1: &'static [(char, char)] = &[
('\u{0}', 'ǵ'), ('Ǻ', 'ȗ'), ('ɐ', 'ʨ'), ('ʰ', '˞'), ('ˠ', '˩'),
('̀', 'ͅ'), ('͠', '͡'), ('ʹ', '͵'), ('ͺ', 'ͺ'), (';', ';'),
Expand Down
911 changes: 487 additions & 424 deletions regex-syntax/src/unicode_tables/case_folding_simple.rs

Large diffs are not rendered by default.

2,219 changes: 1,141 additions & 1,078 deletions regex-syntax/src/unicode_tables/general_category.rs

Large diffs are not rendered by default.

291 changes: 150 additions & 141 deletions regex-syntax/src/unicode_tables/perl_word.rs

Large diffs are not rendered by default.

2,936 changes: 1,509 additions & 1,427 deletions regex-syntax/src/unicode_tables/property_bool.rs

Large diffs are not rendered by default.

14 changes: 8 additions & 6 deletions regex-syntax/src/unicode_tables/property_names.rs
@@ -1,6 +1,6 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names tmp/ucd-10.0.0/
// ucd-generate property-names tmp/ucd-11.0.0/
//
// ucd-generate is available on crates.io.

Expand Down Expand Up @@ -47,11 +47,13 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"),
("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"),
("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"),
("eastasianwidth", "East_Asian_Width"), ("expandsonnfc", "Expands_On_NFC"),
("expandsonnfd", "Expands_On_NFD"), ("expandsonnfkc", "Expands_On_NFKC"),
("expandsonnfkd", "Expands_On_NFKD"), ("ext", "Extender"),
("extender", "Extender"), ("fcnfkc", "FC_NFKC_Closure"),
("fcnfkcclosure", "FC_NFKC_Closure"),
("eastasianwidth", "East_Asian_Width"),
("equideo", "Equivalent_Unified_Ideograph"),
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"),
("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"),
("ext", "Extender"), ("extender", "Extender"),
("fcnfkc", "FC_NFKC_Closure"), ("fcnfkcclosure", "FC_NFKC_Closure"),
("fullcompositionexclusion", "Full_Composition_Exclusion"),
("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"),
("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"),
Expand Down

0 comments on commit 5eaff67

Please sign in to comment.