Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Unicode Script into built-in rules. #751

Merged
merged 6 commits into from Dec 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions derive/tests/grammar.pest
Expand Up @@ -63,6 +63,11 @@ newline = { NEWLINE+ }
unicode = { XID_START ~ XID_CONTINUE* }
SYMBOL = { "shadows builtin" }

han = { HAN+ }
hangul = { HANGUL+ }
hiragana = { HIRAGANA+ }
arabic = { ARABIC+ }

WHITESPACE = _{ " " }
COMMENT = _{ "$"+ }

Expand Down
4 changes: 2 additions & 2 deletions generator/src/generator.rs
Expand Up @@ -13,9 +13,9 @@ use proc_macro2::TokenStream;
use quote::{ToTokens, TokenStreamExt};
use syn::{self, Generics, Ident};

use pest::unicode::unicode_property_names;
use pest_meta::ast::*;
use pest_meta::optimizer::*;
use pest_meta::UNICODE_PROPERTY_NAMES;

pub fn generate(
name: Ident,
Expand Down Expand Up @@ -153,7 +153,7 @@ fn generate_builtin_rules() -> Vec<(&'static str, TokenStream)> {

let box_ty = box_type();

for property in UNICODE_PROPERTY_NAMES {
for property in unicode_property_names() {
let property_ident: Ident = syn::parse_str(property).unwrap();
// insert manually for #property substitution
builtins.push((property, quote! {
Expand Down
99 changes: 5 additions & 94 deletions meta/src/lib.rs
Expand Up @@ -20,9 +20,11 @@
#[macro_use]
extern crate pest;

use once_cell::sync::Lazy;
use std::fmt::Display;

use pest::error::Error;
use pest::unicode::unicode_property_names;

pub mod ast;
pub mod optimizer;
Expand Down Expand Up @@ -69,97 +71,6 @@ pub fn parse_and_optimize(
}

#[doc(hidden)]
pub static UNICODE_PROPERTY_NAMES: &[&str] = &[
tomtau marked this conversation as resolved.
Show resolved Hide resolved
/* BINARY */
"ALPHABETIC",
"BIDI_CONTROL",
"CASE_IGNORABLE",
"CASED",
"CHANGES_WHEN_CASEFOLDED",
"CHANGES_WHEN_CASEMAPPED",
"CHANGES_WHEN_LOWERCASED",
"CHANGES_WHEN_TITLECASED",
"CHANGES_WHEN_UPPERCASED",
"DASH",
"DEFAULT_IGNORABLE_CODE_POINT",
"DEPRECATED",
"DIACRITIC",
"EXTENDER",
"GRAPHEME_BASE",
"GRAPHEME_EXTEND",
"GRAPHEME_LINK",
"HEX_DIGIT",
"HYPHEN",
"IDS_BINARY_OPERATOR",
"IDS_TRINARY_OPERATOR",
"ID_CONTINUE",
"ID_START",
"IDEOGRAPHIC",
"JOIN_CONTROL",
"LOGICAL_ORDER_EXCEPTION",
"LOWERCASE",
"MATH",
"NONCHARACTER_CODE_POINT",
"OTHER_ALPHABETIC",
"OTHER_DEFAULT_IGNORABLE_CODE_POINT",
"OTHER_GRAPHEME_EXTEND",
"OTHER_ID_CONTINUE",
"OTHER_ID_START",
"OTHER_LOWERCASE",
"OTHER_MATH",
"OTHER_UPPERCASE",
"PATTERN_SYNTAX",
"PATTERN_WHITE_SPACE",
"PREPENDED_CONCATENATION_MARK",
"QUOTATION_MARK",
"RADICAL",
"REGIONAL_INDICATOR",
"SENTENCE_TERMINAL",
"SOFT_DOTTED",
"TERMINAL_PUNCTUATION",
"UNIFIED_IDEOGRAPH",
"UPPERCASE",
"VARIATION_SELECTOR",
"WHITE_SPACE",
"XID_CONTINUE",
"XID_START",
/* CATEGORY */
"CASED_LETTER",
"CLOSE_PUNCTUATION",
"CONNECTOR_PUNCTUATION",
"CONTROL",
"CURRENCY_SYMBOL",
"DASH_PUNCTUATION",
"DECIMAL_NUMBER",
"ENCLOSING_MARK",
"FINAL_PUNCTUATION",
"FORMAT",
"INITIAL_PUNCTUATION",
"LETTER",
"LETTER_NUMBER",
"LINE_SEPARATOR",
"LOWERCASE_LETTER",
"MARK",
"MATH_SYMBOL",
"MODIFIER_LETTER",
"MODIFIER_SYMBOL",
"NONSPACING_MARK",
"NUMBER",
"OPEN_PUNCTUATION",
"OTHER",
"OTHER_LETTER",
"OTHER_NUMBER",
"OTHER_PUNCTUATION",
"OTHER_SYMBOL",
"PARAGRAPH_SEPARATOR",
"PRIVATE_USE",
"PUNCTUATION",
"SEPARATOR",
"SPACE_SEPARATOR",
"SPACING_MARK",
"SURROGATE",
"SYMBOL",
"TITLECASE_LETTER",
"UNASSIGNED",
"UPPERCASE_LETTER",
];
/// Names produced by `pest::unicode::unicode_property_names`, gathered into a
/// `Vec` the first time this static is dereferenced.
///
/// Deprecated: call `pest::unicode::unicode_property_names` directly instead.
#[deprecated(note = "use `pest::unicode::unicode_property_names` instead")]
pub static UNICODE_PROPERTY_NAMES: Lazy<Vec<&str>> = Lazy::new(|| {
    // The target collection type is fixed by the static's declared type.
    unicode_property_names().collect()
});
4 changes: 2 additions & 2 deletions meta/src/validator.rs
Expand Up @@ -15,10 +15,10 @@ use std::collections::{HashMap, HashSet};

use pest::error::{Error, ErrorVariant, InputLocation};
use pest::iterators::Pairs;
use pest::unicode::unicode_property_names;
use pest::Span;

use crate::parser::{ParserExpr, ParserNode, ParserRule, Rule};
use crate::UNICODE_PROPERTY_NAMES;

static RUST_KEYWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
[
Expand Down Expand Up @@ -66,7 +66,7 @@ static BUILTINS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
]
.iter()
.cloned()
.chain(UNICODE_PROPERTY_NAMES.iter().cloned())
.chain(unicode_property_names())
.collect::<HashSet<&str>>()
});

Expand Down
2 changes: 1 addition & 1 deletion pest/src/unicode/binary.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC),
Expand Down
2 changes: 1 addition & 1 deletion pest/src/unicode/category.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
Expand Down