Skip to content

Commit

Permalink
Rewrite pest_meta::UNICODE_PROPERTY_NAMES to pest::unicode::unicode_p…
Browse files Browse the repository at this point in the history
…roperty_names.

- `pest::unicode::unicode_property_names` will generate property names by using a macro.
- `pest_meta::UNICODE_PROPERTY_NAMES` has been removed.
  • Loading branch information
huacnlee committed Dec 22, 2022
1 parent 524bfd7 commit 57dfab7
Show file tree
Hide file tree
Showing 5 changed files with 7,038 additions and 428 deletions.
4 changes: 2 additions & 2 deletions generator/src/generator.rs
Expand Up @@ -13,9 +13,9 @@ use proc_macro2::TokenStream;
use quote::{ToTokens, TokenStreamExt};
use syn::{self, Generics, Ident};

use pest::unicode::unicode_property_names;
use pest_meta::ast::*;
use pest_meta::optimizer::*;
use pest_meta::UNICODE_PROPERTY_NAMES;

pub fn generate(
name: Ident,
Expand Down Expand Up @@ -153,7 +153,7 @@ fn generate_builtin_rules() -> Vec<(&'static str, TokenStream)> {

let box_ty = box_type();

for property in UNICODE_PROPERTY_NAMES {
for property in unicode_property_names() {
let property_ident: Ident = syn::parse_str(property).unwrap();
// insert manually for #property substitution
builtins.push((property, quote! {
Expand Down
102 changes: 0 additions & 102 deletions meta/src/lib.rs
Expand Up @@ -67,105 +67,3 @@ pub fn parse_and_optimize(

Ok((defaults, optimizer::optimize(ast)))
}

/// Upper-case names of Unicode properties, as a flat list of string literals.
///
/// The entries are laid out in three groups, each introduced by a marker
/// comment below: binary properties, categories, and scripts.
#[doc(hidden)]
pub static UNICODE_PROPERTY_NAMES: &[&str] = &[
/* BINARY */
"ALPHABETIC",
"BIDI_CONTROL",
"CASE_IGNORABLE",
"CASED",
"CHANGES_WHEN_CASEFOLDED",
"CHANGES_WHEN_CASEMAPPED",
"CHANGES_WHEN_LOWERCASED",
"CHANGES_WHEN_TITLECASED",
"CHANGES_WHEN_UPPERCASED",
"DASH",
"DEFAULT_IGNORABLE_CODE_POINT",
"DEPRECATED",
"DIACRITIC",
"EXTENDER",
"GRAPHEME_BASE",
"GRAPHEME_EXTEND",
"GRAPHEME_LINK",
"HEX_DIGIT",
"HYPHEN",
"IDS_BINARY_OPERATOR",
"IDS_TRINARY_OPERATOR",
"ID_CONTINUE",
"ID_START",
"IDEOGRAPHIC",
"JOIN_CONTROL",
"LOGICAL_ORDER_EXCEPTION",
"LOWERCASE",
"MATH",
"NONCHARACTER_CODE_POINT",
"OTHER_ALPHABETIC",
"OTHER_DEFAULT_IGNORABLE_CODE_POINT",
"OTHER_GRAPHEME_EXTEND",
"OTHER_ID_CONTINUE",
"OTHER_ID_START",
"OTHER_LOWERCASE",
"OTHER_MATH",
"OTHER_UPPERCASE",
"PATTERN_SYNTAX",
"PATTERN_WHITE_SPACE",
"PREPENDED_CONCATENATION_MARK",
"QUOTATION_MARK",
"RADICAL",
"REGIONAL_INDICATOR",
"SENTENCE_TERMINAL",
"SOFT_DOTTED",
"TERMINAL_PUNCTUATION",
"UNIFIED_IDEOGRAPH",
"UPPERCASE",
"VARIATION_SELECTOR",
"WHITE_SPACE",
"XID_CONTINUE",
"XID_START",
/* CATEGORY */
"CASED_LETTER",
"CLOSE_PUNCTUATION",
"CONNECTOR_PUNCTUATION",
"CONTROL",
"CURRENCY_SYMBOL",
"DASH_PUNCTUATION",
"DECIMAL_NUMBER",
"ENCLOSING_MARK",
"FINAL_PUNCTUATION",
"FORMAT",
"INITIAL_PUNCTUATION",
"LETTER",
"LETTER_NUMBER",
"LINE_SEPARATOR",
"LOWERCASE_LETTER",
"MARK",
"MATH_SYMBOL",
"MODIFIER_LETTER",
"MODIFIER_SYMBOL",
"NONSPACING_MARK",
"NUMBER",
"OPEN_PUNCTUATION",
"OTHER",
"OTHER_LETTER",
"OTHER_NUMBER",
"OTHER_PUNCTUATION",
"OTHER_SYMBOL",
"PARAGRAPH_SEPARATOR",
"PRIVATE_USE",
"PUNCTUATION",
"SEPARATOR",
"SPACE_SEPARATOR",
"SPACING_MARK",
"SURROGATE",
"SYMBOL",
"TITLECASE_LETTER",
"UNASSIGNED",
"UPPERCASE_LETTER",
/* Script */
"CJK",
"HAN",
"HANGUL",
"HIRAGANA",
"KATAKANA",
];
4 changes: 2 additions & 2 deletions meta/src/validator.rs
Expand Up @@ -15,10 +15,10 @@ use std::collections::{HashMap, HashSet};

use pest::error::{Error, ErrorVariant, InputLocation};
use pest::iterators::Pairs;
use pest::unicode::unicode_property_names;
use pest::Span;

use crate::parser::{ParserExpr, ParserNode, ParserRule, Rule};
use crate::UNICODE_PROPERTY_NAMES;

static RUST_KEYWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
[
Expand Down Expand Up @@ -66,7 +66,7 @@ static BUILTINS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
]
.iter()
.cloned()
.chain(UNICODE_PROPERTY_NAMES.iter().cloned())
.chain(unicode_property_names())
.collect::<HashSet<&str>>()
});

Expand Down
39 changes: 25 additions & 14 deletions pest/src/unicode/mod.rs
Expand Up @@ -10,21 +10,26 @@ use alloc::boxed::Box;
macro_rules! char_property_functions {
{$(
mod $module:ident;
[$(
static $property_names:ident = [$(
$prop:ident,
)*];
)*} => {$(
#[allow(unused)]
mod $module;
// ALPHABETIC('a')
$(pub fn $prop(c: char) -> bool {
self::$module::$prop.contains_char(c)
})*

pub static $property_names: &[&str] = &[
$(stringify!($prop),)*
];
)*};
}

char_property_functions! {
mod binary;
[
static BINARY_PROPERTY_NAMES = [
// ASCII_HEX_DIGIT, // let this one be stripped out -- the full trie is wasteful for ASCII
ALPHABETIC, BIDI_CONTROL, CASE_IGNORABLE, CASED, CHANGES_WHEN_CASEFOLDED,
CHANGES_WHEN_CASEMAPPED, CHANGES_WHEN_LOWERCASED, CHANGES_WHEN_TITLECASED,
Expand All @@ -40,7 +45,7 @@ char_property_functions! {
];

mod category;
[
static CATEGORY_PROPERTY_NAMES = [
CASED_LETTER, CLOSE_PUNCTUATION, CONNECTOR_PUNCTUATION, CONTROL, CURRENCY_SYMBOL,
DASH_PUNCTUATION, DECIMAL_NUMBER, ENCLOSING_MARK, FINAL_PUNCTUATION, FORMAT,
INITIAL_PUNCTUATION, LETTER, LETTER_NUMBER, LINE_SEPARATOR, LOWERCASE_LETTER, MARK,
Expand All @@ -51,11 +56,26 @@ char_property_functions! {
];

mod script;
[
HAN, KATAKANA, HIRAGANA, HANGUL,
static SCRIPT_PROPERTY_NAMES = [
// Chinese
HAN,
// Japanese
KATAKANA, HIRAGANA,
// Korean
HANGUL,
];
}

/// Returns a boxed iterator over the Unicode property names.
///
/// Names are yielded group by group: every entry of `BINARY_PROPERTY_NAMES`,
/// then every entry of `CATEGORY_PROPERTY_NAMES`, then every entry of
/// `SCRIPT_PROPERTY_NAMES`, each in its table's declaration order.
pub fn unicode_property_names() -> Box<dyn Iterator<Item = &'static str>> {
    Box::new(
        BINARY_PROPERTY_NAMES
            .iter()
            // `iter()` yields `&&'static str`; `copied()` strips the outer
            // reference so callers receive plain `&'static str` items.
            .copied()
            .chain(CATEGORY_PROPERTY_NAMES.iter().copied())
            .chain(SCRIPT_PROPERTY_NAMES.iter().copied()),
    )
}

pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
for property in binary::BY_NAME {
if name == property.0.to_uppercase() {
Expand All @@ -75,14 +95,5 @@ pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
}
}

if name == "CJK" {
return Some(Box::new(|c| {
script::HAN.contains_char(c)
|| script::HANGUL.contains_char(c)
|| script::KATAKANA.contains_char(c)
|| script::HIRAGANA.contains_char(c)
}));
}

None
}

0 comments on commit 57dfab7

Please sign in to comment.