Skip to content

Commit

Permalink
Improve unicode char_property_functions macro to support the original…
Browse files Browse the repository at this point in the history
… BY_NAME values generated by `ucd-generate`.

Also export all property names from Unicode (Script).
  • Loading branch information
huacnlee committed Dec 22, 2022
1 parent 57dfab7 commit 89504ed
Showing 1 changed file with 218 additions and 21 deletions.
239 changes: 218 additions & 21 deletions pest/src/unicode/mod.rs
Expand Up @@ -7,23 +7,45 @@

use alloc::boxed::Box;

macro_rules! char_property_functions {
{$(
mod $module:ident;
static $property_names:ident = [$(
$prop:ident,
)*];
)*} => {$(
// Helper macro: declares one property submodule and its public API.
//
// Arguments:
//   $module          - name of the submodule to declare (`mod $module;`).
//   $property_names  - name of the public static that lists the property names.
//   [$prop, ...]     - comma-terminated list of property identifiers.
//
// For each `$prop` it emits `pub fn $prop(c: char) -> bool`, and it emits one
// `pub static $property_names: &[&str]` holding every `$prop` as a string.
macro_rules! property_functions {
($module:ident, $property_names:ident, [$(
$prop:ident,
)*]) => {
// The submodule may contain items that are never referenced from here.
#[allow(unused)]
mod $module;
// Example call of a generated function: `ALPHABETIC('a')`,
// or, from outside this module, `unicode::ALPHABETIC('a')`.
// Each function forwards to `contains_char` on the same-named item in `$module`.
$(pub fn $prop(c: char) -> bool {
self::$module::$prop.contains_char(c)
})*

// The names are produced with `stringify!`, so each entry is the identifier
// text exactly as written in the macro invocation.
pub static $property_names: &[&str] = &[
$(stringify!($prop),)*
];
};
}

// Front-end macro: accepts any number of `mod NAME; static LIST = [...];` groups
// and forwards each group to `property_functions!`.
//
// Two list syntaxes are accepted, one per arm. Every group in a single
// invocation must use the same syntax, because each arm matches the whole input.
macro_rules! char_property_functions {
// Arm 1: property names are listed directly as bare identifiers,
// e.g. `static NAMES = [FOO, BAR,];`.
{$(
mod $module:ident;
static $property_names:ident = [$(
$prop:ident,
)*];
)*} => {$(
property_functions!($module, $property_names, [$(
$prop,
)*]);
)*};
// Arm 2: entries are `(name, IDENT)` tuples, so the `BY_NAME` table generated by
// `ucd-generate` can be pasted in unchanged. The first tuple element (`$_name`)
// is matched but not used; only the identifier is forwarded.
{$(
mod $module:ident;
static $property_names:ident = [$(
($_name:tt, $prop:ident),
)*];
)*} => {$(
property_functions!($module, $property_names, [$(
$prop,
)*]);
)*};
}

Expand All @@ -43,26 +65,201 @@ char_property_functions! {
REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH,
UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START,
];
}

// Declares the `category` and `script` property modules through
// `char_property_functions!`, together with the `CATEGORY_PROPERTY_NAMES` and
// `SCRIPT_PROPERTY_NAMES` lists.
char_property_functions! {
mod category;
// Entries copied from `category::BY_NAME`.
static CATEGORY_PROPERTY_NAMES = [
// NOTE(review): the bare-identifier entries below repeat names that also appear in
// the tuple entries that follow, and a list mixing both forms matches neither arm
// of `char_property_functions!`. They look like the pre-change side of the diff;
// confirm and drop them if so.
CASED_LETTER, CLOSE_PUNCTUATION, CONNECTOR_PUNCTUATION, CONTROL, CURRENCY_SYMBOL,
DASH_PUNCTUATION, DECIMAL_NUMBER, ENCLOSING_MARK, FINAL_PUNCTUATION, FORMAT,
INITIAL_PUNCTUATION, LETTER, LETTER_NUMBER, LINE_SEPARATOR, LOWERCASE_LETTER, MARK,
MATH_SYMBOL, MODIFIER_LETTER, MODIFIER_SYMBOL, NONSPACING_MARK, NUMBER, OPEN_PUNCTUATION,
OTHER, OTHER_LETTER, OTHER_NUMBER, OTHER_PUNCTUATION, OTHER_SYMBOL, PARAGRAPH_SEPARATOR,
PRIVATE_USE, PUNCTUATION, SEPARATOR, SPACE_SEPARATOR, SPACING_MARK, SURROGATE, SYMBOL,
TITLECASE_LETTER, UNASSIGNED, UPPERCASE_LETTER,
// Tuple form: (Unicode property name, identifier of the generated function).
("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL),
("Currency_Symbol", CURRENCY_SYMBOL),
("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER),
("Enclosing_Mark", ENCLOSING_MARK),
("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT),
("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER),
("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR),
("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK),
("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER),
("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK),
("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION),
("Other", OTHER), ("Other_Letter", OTHER_LETTER),
("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION),
("Other_Symbol", OTHER_SYMBOL),
("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE),
("Punctuation", PUNCTUATION), ("Separator", SEPARATOR),
("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK),
("Surrogate", SURROGATE), ("Symbol", SYMBOL),
("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED),
("Uppercase_Letter", UPPERCASE_LETTER),
];

mod script;
// Entries copied from `script::BY_NAME`.
static SCRIPT_PROPERTY_NAMES = [
// NOTE(review): HAN, KATAKANA, HIRAGANA and HANGUL are listed here as bare
// identifiers and again in the tuple entries below; presumably these bare lines
// are the pre-change side of the diff. Confirm before keeping them.
// Chinese
HAN,
// Japanese
KATAKANA, HIRAGANA,
// Korean
HANGUL,
("Adlam", ADLAM),
("Ahom", AHOM),
("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS),
("Arabic", ARABIC),
("Armenian", ARMENIAN),
("Avestan", AVESTAN),
("Balinese", BALINESE),
("Bamum", BAMUM),
("Bassa_Vah", BASSA_VAH),
("Batak", BATAK),
("Bengali", BENGALI),
("Bhaiksuki", BHAIKSUKI),
("Bopomofo", BOPOMOFO),
("Brahmi", BRAHMI),
("Braille", BRAILLE),
("Buginese", BUGINESE),
("Buhid", BUHID),
("Canadian_Aboriginal", CANADIAN_ABORIGINAL),
("Carian", CARIAN),
("Caucasian_Albanian", CAUCASIAN_ALBANIAN),
("Chakma", CHAKMA),
("Cham", CHAM),
("Cherokee", CHEROKEE),
("Chorasmian", CHORASMIAN),
("Common", COMMON),
("Coptic", COPTIC),
("Cuneiform", CUNEIFORM),
("Cypriot", CYPRIOT),
("Cypro_Minoan", CYPRO_MINOAN),
("Cyrillic", CYRILLIC),
("Deseret", DESERET),
("Devanagari", DEVANAGARI),
("Dives_Akuru", DIVES_AKURU),
("Dogra", DOGRA),
("Duployan", DUPLOYAN),
("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
("Elbasan", ELBASAN),
("Elymaic", ELYMAIC),
("Ethiopic", ETHIOPIC),
("Georgian", GEORGIAN),
("Glagolitic", GLAGOLITIC),
("Gothic", GOTHIC),
("Grantha", GRANTHA),
("Greek", GREEK),
("Gujarati", GUJARATI),
("Gunjala_Gondi", GUNJALA_GONDI),
("Gurmukhi", GURMUKHI),
("Han", HAN),
("Hangul", HANGUL),
("Hanifi_Rohingya", HANIFI_ROHINGYA),
("Hanunoo", HANUNOO),
("Hatran", HATRAN),
("Hebrew", HEBREW),
("Hiragana", HIRAGANA),
("Imperial_Aramaic", IMPERIAL_ARAMAIC),
("Inherited", INHERITED),
("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI),
("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN),
("Javanese", JAVANESE),
("Kaithi", KAITHI),
("Kannada", KANNADA),
("Katakana", KATAKANA),
("Kawi", KAWI),
("Kayah_Li", KAYAH_LI),
("Kharoshthi", KHAROSHTHI),
("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
("Khmer", KHMER),
("Khojki", KHOJKI),
("Khudawadi", KHUDAWADI),
("Lao", LAO),
("Latin", LATIN),
("Lepcha", LEPCHA),
("Limbu", LIMBU),
("Linear_A", LINEAR_A),
("Linear_B", LINEAR_B),
("Lisu", LISU),
("Lycian", LYCIAN),
("Lydian", LYDIAN),
("Mahajani", MAHAJANI),
("Makasar", MAKASAR),
("Malayalam", MALAYALAM),
("Mandaic", MANDAIC),
("Manichaean", MANICHAEAN),
("Marchen", MARCHEN),
("Masaram_Gondi", MASARAM_GONDI),
("Medefaidrin", MEDEFAIDRIN),
("Meetei_Mayek", MEETEI_MAYEK),
("Mende_Kikakui", MENDE_KIKAKUI),
("Meroitic_Cursive", MEROITIC_CURSIVE),
("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS),
("Miao", MIAO),
("Modi", MODI),
("Mongolian", MONGOLIAN),
("Mro", MRO),
("Multani", MULTANI),
("Myanmar", MYANMAR),
("Nabataean", NABATAEAN),
("Nag_Mundari", NAG_MUNDARI),
("Nandinagari", NANDINAGARI),
("New_Tai_Lue", NEW_TAI_LUE),
("Newa", NEWA),
("Nko", NKO),
("Nushu", NUSHU),
("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG),
("Ogham", OGHAM),
("Ol_Chiki", OL_CHIKI),
("Old_Hungarian", OLD_HUNGARIAN),
("Old_Italic", OLD_ITALIC),
("Old_North_Arabian", OLD_NORTH_ARABIAN),
("Old_Permic", OLD_PERMIC),
("Old_Persian", OLD_PERSIAN),
("Old_Sogdian", OLD_SOGDIAN),
("Old_South_Arabian", OLD_SOUTH_ARABIAN),
("Old_Turkic", OLD_TURKIC),
("Old_Uyghur", OLD_UYGHUR),
("Oriya", ORIYA),
("Osage", OSAGE),
("Osmanya", OSMANYA),
("Pahawh_Hmong", PAHAWH_HMONG),
("Palmyrene", PALMYRENE),
("Pau_Cin_Hau", PAU_CIN_HAU),
("Phags_Pa", PHAGS_PA),
("Phoenician", PHOENICIAN),
("Psalter_Pahlavi", PSALTER_PAHLAVI),
("Rejang", REJANG),
("Runic", RUNIC),
("Samaritan", SAMARITAN),
("Saurashtra", SAURASHTRA),
("Sharada", SHARADA),
("Shavian", SHAVIAN),
("Siddham", SIDDHAM),
("SignWriting", SIGNWRITING),
("Sinhala", SINHALA),
("Sogdian", SOGDIAN),
("Sora_Sompeng", SORA_SOMPENG),
("Soyombo", SOYOMBO),
("Sundanese", SUNDANESE),
("Syloti_Nagri", SYLOTI_NAGRI),
("Syriac", SYRIAC),
("Tagalog", TAGALOG),
("Tagbanwa", TAGBANWA),
("Tai_Le", TAI_LE),
("Tai_Tham", TAI_THAM),
("Tai_Viet", TAI_VIET),
("Takri", TAKRI),
("Tamil", TAMIL),
("Tangsa", TANGSA),
("Tangut", TANGUT),
("Telugu", TELUGU),
("Thaana", THAANA),
("Thai", THAI),
("Tibetan", TIBETAN),
("Tifinagh", TIFINAGH),
("Tirhuta", TIRHUTA),
("Toto", TOTO),
("Ugaritic", UGARITIC),
("Vai", VAI),
("Vithkuqi", VITHKUQI),
("Wancho", WANCHO),
("Warang_Citi", WARANG_CITI),
("Yezidi", YEZIDI),
("Yi", YI),
("Zanabazar_Square", ZANABAZAR_SQUARE),
];
}

Expand Down

0 comments on commit 89504ed

Please sign in to comment.