From 0b098ced2f243b5fd7297422fb5e5bc28db5a475 Mon Sep 17 00:00:00 2001 From: snsmac Date: Mon, 1 Aug 2022 20:34:32 +0200 Subject: [PATCH] Syntax: Make \p{Sc} work 'sc' refers to the 'Currency_Symbol' general category, but is also the abbreviation for the 'Script' property. Fixes #835 Related #719 b1489c8 --- regex-syntax/src/unicode.rs | 68 +++++++++++++++++++------------------ tests/unicode.rs | 1 + 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 70d5954b7..040532289 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -80,11 +80,11 @@ impl fmt::Display for UnicodeWordError { /// This returns an error if the Unicode case folding tables are not available. pub fn simple_fold( c: char, -) -> FoldResult, Option>> { +) -> FoldResult, Option>> { #[cfg(not(feature = "unicode-case"))] fn imp( _: char, - ) -> FoldResult, Option>> + ) -> FoldResult, Option>> { use std::option::IntoIter; Err::, _>, _>(CaseFoldError(())) @@ -93,7 +93,7 @@ pub fn simple_fold( #[cfg(feature = "unicode-case")] fn imp( c: char, - ) -> FoldResult, Option>> + ) -> FoldResult, Option>> { use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; @@ -220,7 +220,7 @@ impl<'a> ClassQuery<'a> { let canon_val = match canonical_value(vals, &property_value) { None => { - return Err(Error::PropertyValueNotFound) + return Err(Error::PropertyValueNotFound); } Some(canon_val) => canon_val, }; @@ -243,7 +243,9 @@ impl<'a> ClassQuery<'a> { // a general category. (Currently, we don't even support the // 'Case_Folding' property. But if we do in the future, users will be // required to spell it out.) - if norm != "cf" { + // 'sc' refers to the 'Currency_Symbol' general category, but is also + // the abbreviation for the 'Script' property. + if norm != "cf" && norm != "sc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } @@ -462,24 +464,24 @@ fn canonical_script(normalized_value: &str) -> Result> { /// If the property names data is not available, then an error is returned. fn canonical_prop(normalized_name: &str) -> Result> { #[cfg(not(any( - feature = "unicode-age", - feature = "unicode-bool", - feature = "unicode-gencat", - feature = "unicode-perl", - feature = "unicode-script", - feature = "unicode-segment", + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", )))] fn imp(_: &str) -> Result> { Err(Error::PropertyNotFound) } #[cfg(any( - feature = "unicode-age", - feature = "unicode-bool", - feature = "unicode-gencat", - feature = "unicode-perl", - feature = "unicode-script", - feature = "unicode-segment", + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", ))] fn imp(name: &str) -> Result> { use crate::unicode_tables::property_names::PROPERTY_NAMES; @@ -519,24 +521,24 @@ fn property_values( canonical_property_name: &'static str, ) -> Result> { #[cfg(not(any( - feature = "unicode-age", - feature = "unicode-bool", - feature = "unicode-gencat", - feature = "unicode-perl", - feature = "unicode-script", - feature = "unicode-segment", + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", )))] fn imp(_: &'static str) -> Result> { Err(Error::PropertyValueNotFound) } #[cfg(any( - feature = "unicode-age", - feature = "unicode-bool", - feature = "unicode-gencat", - feature = "unicode-perl", - feature = "unicode-script", - feature = "unicode-segment", + feature = "unicode-age", + feature = "unicode-bool", + feature = "unicode-gencat", + feature = "unicode-perl", + feature = "unicode-script", + feature = "unicode-segment", ))] fn imp(name: &'static str) -> Result> { use crate::unicode_tables::property_values::PROPERTY_VALUES; @@ -569,15 +571,15 @@ fn property_set( /// /// If the given age value isn't valid or if the data isn't available, then an /// error is returned instead. -fn ages(canonical_age: &str) -> Result> { +fn ages(canonical_age: &str) -> Result> { #[cfg(not(feature = "unicode-age"))] - fn imp(_: &str) -> Result> { + fn imp(_: &str) -> Result> { use std::option::IntoIter; Err::, _>(Error::PropertyNotFound) } #[cfg(feature = "unicode-age")] - fn imp(canonical_age: &str) -> Result> { + fn imp(canonical_age: &str) -> Result> { use crate::unicode_tables::age; const AGES: &[(&str, Range)] = &[ @@ -878,7 +880,7 @@ mod tests { }; #[cfg(feature = "unicode-case")] - fn simple_fold_ok(c: char) -> impl Iterator { + fn simple_fold_ok(c: char) -> impl Iterator { simple_fold(c).unwrap().unwrap() } diff --git a/tests/unicode.rs b/tests/unicode.rs index 9b3228624..748bbb79c 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -77,6 +77,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); // See: https://github.com/rust-lang/regex/issues/719 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1))); mat!( uni_class_gencat_initial_punctuation, r"\p{Initial_Punctuation}",