Skip to content

Commit

Permalink
Add CJK unicode into built-in rules.
Browse files Browse the repository at this point in the history
Make this change to add `CJK`, `HAN`, `HANGUL`, `KATAKANA`, `HIRAGANA` to built-in rules.

https://unicode.org/faq/han_cjk.html

- Chinese - `HAN`
- Japanese - `KATAKANA`, `HIRAGANA`
- Korean - `HANGUL`

So we can easily match CJK characters.
  • Loading branch information
huacnlee committed Dec 22, 2022
1 parent 2c47201 commit 524bfd7
Show file tree
Hide file tree
Showing 6 changed files with 358 additions and 2 deletions.
6 changes: 6 additions & 0 deletions meta/src/lib.rs
Expand Up @@ -162,4 +162,10 @@ pub static UNICODE_PROPERTY_NAMES: &[&str] = &[
"TITLECASE_LETTER",
"UNASSIGNED",
"UPPERCASE_LETTER",
/* Script */
"CJK",
"HAN",
"HANGUL",
"HIRAGANA",
"KATAKANA",
];
2 changes: 1 addition & 1 deletion pest/src/unicode/binary.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC),
Expand Down
2 changes: 1 addition & 1 deletion pest/src/unicode/category.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
Expand Down
20 changes: 20 additions & 0 deletions pest/src/unicode/mod.rs
Expand Up @@ -49,6 +49,11 @@ char_property_functions! {
PRIVATE_USE, PUNCTUATION, SEPARATOR, SPACE_SEPARATOR, SPACING_MARK, SURROGATE, SYMBOL,
TITLECASE_LETTER, UNASSIGNED, UPPERCASE_LETTER,
];

mod script;
[
HAN, KATAKANA, HIRAGANA, HANGUL,
];
}

pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
Expand All @@ -64,5 +69,20 @@ pub fn by_name(name: &str) -> Option<Box<dyn Fn(char) -> bool>> {
}
}

for property in script::BY_NAME {
if name == property.0.to_uppercase() {
return Some(Box::new(move |c| property.1.contains_char(c)));
}
}

if name == "CJK" {
return Some(Box::new(|c| {
script::HAN.contains_char(c)
|| script::HANGUL.contains_char(c)
|| script::KATAKANA.contains_char(c)
|| script::HIRAGANA.contains_char(c)
}));
}

None
}

0 comments on commit 524bfd7

Please sign in to comment.