From 89504edb72f0a35bff037468a033d5403ba28635 Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Fri, 23 Dec 2022 00:03:29 +0800 Subject: [PATCH] Improve unicode `char_property_functions` macro for supports original BY_NAME values by `ucd-generate` generated. And export all property names from Unicode (Script). --- pest/src/unicode/mod.rs | 239 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 218 insertions(+), 21 deletions(-) diff --git a/pest/src/unicode/mod.rs b/pest/src/unicode/mod.rs index c37efd1d..106b0ec2 100644 --- a/pest/src/unicode/mod.rs +++ b/pest/src/unicode/mod.rs @@ -7,16 +7,13 @@ use alloc::boxed::Box; -macro_rules! char_property_functions { - {$( - mod $module:ident; - static $property_names:ident = [$( - $prop:ident, - )*]; - )*} => {$( +macro_rules! property_functions { + ($module:ident, $property_names:ident, [$( + $prop:ident, + )*]) => { #[allow(unused)] mod $module; - // ALPHABETIC('a') + // unicode::ALPHABETIC('a') $(pub fn $prop(c: char) -> bool { self::$module::$prop.contains_char(c) })* @@ -24,6 +21,31 @@ macro_rules! char_property_functions { pub static $property_names: &[&str] = &[ $(stringify!($prop),)* ]; + }; +} + +macro_rules! char_property_functions { + // For define custom property names + {$( + mod $module:ident; + static $property_names:ident = [$( + $prop:ident, + )*]; + )*} => {$( + property_functions!($module, $property_names, [$( + $prop, + )*]); + )*}; + // For define property by copy BY_NAME values from `ucd-generate` generated. + {$( + mod $module:ident; + static $property_names:ident = [$( + ($_name:tt, $prop:ident), + )*]; + )*} => {$( + property_functions!($module, $property_names, [$( + $prop, + )*]); )*}; } @@ -43,26 +65,201 @@ char_property_functions! { REGIONAL_INDICATOR, SENTENCE_TERMINAL, SOFT_DOTTED, TERMINAL_PUNCTUATION, UNIFIED_IDEOGRAPH, UPPERCASE, VARIATION_SELECTOR, WHITE_SPACE, XID_CONTINUE, XID_START, ]; +} +char_property_functions! { mod category; + // Copy from category::BY_NAME static CATEGORY_PROPERTY_NAMES = [ - CASED_LETTER, CLOSE_PUNCTUATION, CONNECTOR_PUNCTUATION, CONTROL, CURRENCY_SYMBOL, - DASH_PUNCTUATION, DECIMAL_NUMBER, ENCLOSING_MARK, FINAL_PUNCTUATION, FORMAT, - INITIAL_PUNCTUATION, LETTER, LETTER_NUMBER, LINE_SEPARATOR, LOWERCASE_LETTER, MARK, - MATH_SYMBOL, MODIFIER_LETTER, MODIFIER_SYMBOL, NONSPACING_MARK, NUMBER, OPEN_PUNCTUATION, - OTHER, OTHER_LETTER, OTHER_NUMBER, OTHER_PUNCTUATION, OTHER_SYMBOL, PARAGRAPH_SEPARATOR, - PRIVATE_USE, PUNCTUATION, SEPARATOR, SPACE_SEPARATOR, SPACING_MARK, SURROGATE, SYMBOL, - TITLECASE_LETTER, UNASSIGNED, UPPERCASE_LETTER, + ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION), + ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL), + ("Currency_Symbol", CURRENCY_SYMBOL), + ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER), + ("Enclosing_Mark", ENCLOSING_MARK), + ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT), + ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER), + ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR), + ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK), + ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER), + ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK), + ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION), + ("Other", OTHER), ("Other_Letter", OTHER_LETTER), + ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION), + ("Other_Symbol", OTHER_SYMBOL), + ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE), + ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR), + ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK), + ("Surrogate", SURROGATE), ("Symbol", SYMBOL), + ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED), + ("Uppercase_Letter", UPPERCASE_LETTER), ]; mod script; + // Copy from script::BY_NAME static SCRIPT_PROPERTY_NAMES = [ - // Chinese - HAN, - // Japanese - KATAKANA, HIRAGANA, - // Korean - HANGUL, + ("Adlam", ADLAM), + ("Ahom", AHOM), + ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), + ("Arabic", ARABIC), + ("Armenian", ARMENIAN), + ("Avestan", AVESTAN), + ("Balinese", BALINESE), + ("Bamum", BAMUM), + ("Bassa_Vah", BASSA_VAH), + ("Batak", BATAK), + ("Bengali", BENGALI), + ("Bhaiksuki", BHAIKSUKI), + ("Bopomofo", BOPOMOFO), + ("Brahmi", BRAHMI), + ("Braille", BRAILLE), + ("Buginese", BUGINESE), + ("Buhid", BUHID), + ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), + ("Carian", CARIAN), + ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), + ("Chakma", CHAKMA), + ("Cham", CHAM), + ("Cherokee", CHEROKEE), + ("Chorasmian", CHORASMIAN), + ("Common", COMMON), + ("Coptic", COPTIC), + ("Cuneiform", CUNEIFORM), + ("Cypriot", CYPRIOT), + ("Cypro_Minoan", CYPRO_MINOAN), + ("Cyrillic", CYRILLIC), + ("Deseret", DESERET), + ("Devanagari", DEVANAGARI), + ("Dives_Akuru", DIVES_AKURU), + ("Dogra", DOGRA), + ("Duployan", DUPLOYAN), + ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), + ("Elbasan", ELBASAN), + ("Elymaic", ELYMAIC), + ("Ethiopic", ETHIOPIC), + ("Georgian", GEORGIAN), + ("Glagolitic", GLAGOLITIC), + ("Gothic", GOTHIC), + ("Grantha", GRANTHA), + ("Greek", GREEK), + ("Gujarati", GUJARATI), + ("Gunjala_Gondi", GUNJALA_GONDI), + ("Gurmukhi", GURMUKHI), + ("Han", HAN), + ("Hangul", HANGUL), + ("Hanifi_Rohingya", HANIFI_ROHINGYA), + ("Hanunoo", HANUNOO), + ("Hatran", HATRAN), + ("Hebrew", HEBREW), + ("Hiragana", HIRAGANA), + ("Imperial_Aramaic", IMPERIAL_ARAMAIC), + ("Inherited", INHERITED), + ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), + ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), + ("Javanese", JAVANESE), + ("Kaithi", KAITHI), + ("Kannada", KANNADA), + ("Katakana", KATAKANA), + ("Kawi", KAWI), + ("Kayah_Li", KAYAH_LI), + ("Kharoshthi", KHAROSHTHI), + ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), + ("Khmer", KHMER), + ("Khojki", KHOJKI), + ("Khudawadi", KHUDAWADI), + ("Lao", LAO), + ("Latin", LATIN), + ("Lepcha", LEPCHA), + ("Limbu", LIMBU), + ("Linear_A", LINEAR_A), + ("Linear_B", LINEAR_B), + ("Lisu", LISU), + ("Lycian", LYCIAN), + ("Lydian", LYDIAN), + ("Mahajani", MAHAJANI), + ("Makasar", MAKASAR), + ("Malayalam", MALAYALAM), + ("Mandaic", MANDAIC), + ("Manichaean", MANICHAEAN), + ("Marchen", MARCHEN), + ("Masaram_Gondi", MASARAM_GONDI), + ("Medefaidrin", MEDEFAIDRIN), + ("Meetei_Mayek", MEETEI_MAYEK), + ("Mende_Kikakui", MENDE_KIKAKUI), + ("Meroitic_Cursive", MEROITIC_CURSIVE), + ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), + ("Miao", MIAO), + ("Modi", MODI), + ("Mongolian", MONGOLIAN), + ("Mro", MRO), + ("Multani", MULTANI), + ("Myanmar", MYANMAR), + ("Nabataean", NABATAEAN), + ("Nag_Mundari", NAG_MUNDARI), + ("Nandinagari", NANDINAGARI), + ("New_Tai_Lue", NEW_TAI_LUE), + ("Newa", NEWA), + ("Nko", NKO), + ("Nushu", NUSHU), + ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), + ("Ogham", OGHAM), + ("Ol_Chiki", OL_CHIKI), + ("Old_Hungarian", OLD_HUNGARIAN), + ("Old_Italic", OLD_ITALIC), + ("Old_North_Arabian", OLD_NORTH_ARABIAN), + ("Old_Permic", OLD_PERMIC), + ("Old_Persian", OLD_PERSIAN), + ("Old_Sogdian", OLD_SOGDIAN), + ("Old_South_Arabian", OLD_SOUTH_ARABIAN), + ("Old_Turkic", OLD_TURKIC), + ("Old_Uyghur", OLD_UYGHUR), + ("Oriya", ORIYA), + ("Osage", OSAGE), + ("Osmanya", OSMANYA), + ("Pahawh_Hmong", PAHAWH_HMONG), + ("Palmyrene", PALMYRENE), + ("Pau_Cin_Hau", PAU_CIN_HAU), + ("Phags_Pa", PHAGS_PA), + ("Phoenician", PHOENICIAN), + ("Psalter_Pahlavi", PSALTER_PAHLAVI), + ("Rejang", REJANG), + ("Runic", RUNIC), + ("Samaritan", SAMARITAN), + ("Saurashtra", SAURASHTRA), + ("Sharada", SHARADA), + ("Shavian", SHAVIAN), + ("Siddham", SIDDHAM), + ("SignWriting", SIGNWRITING), + ("Sinhala", SINHALA), + ("Sogdian", SOGDIAN), + ("Sora_Sompeng", SORA_SOMPENG), + ("Soyombo", SOYOMBO), + ("Sundanese", SUNDANESE), + ("Syloti_Nagri", SYLOTI_NAGRI), + ("Syriac", SYRIAC), + ("Tagalog", TAGALOG), + ("Tagbanwa", TAGBANWA), + ("Tai_Le", TAI_LE), + ("Tai_Tham", TAI_THAM), + ("Tai_Viet", TAI_VIET), + ("Takri", TAKRI), + ("Tamil", TAMIL), + ("Tangsa", TANGSA), + ("Tangut", TANGUT), + ("Telugu", TELUGU), + ("Thaana", THAANA), + ("Thai", THAI), + ("Tibetan", TIBETAN), + ("Tifinagh", TIFINAGH), + ("Tirhuta", TIRHUTA), + ("Toto", TOTO), + ("Ugaritic", UGARITIC), + ("Vai", VAI), + ("Vithkuqi", VITHKUQI), + ("Wancho", WANCHO), + ("Warang_Citi", WARANG_CITI), + ("Yezidi", YEZIDI), + ("Yi", YI), + ("Zanabazar_Square", ZANABAZAR_SQUARE), ]; }