Skip to content

Commit

Permalink
syntax: \p{Sc} should map to \p{Currency_Symbol}
Browse files Browse the repository at this point in the history
'sc' is the abbreviation for the 'Currency_Symbol' general category, but it
is also the abbreviation for the 'Script' property. As a result, during the
canonicalization process 'sc' was normalized to 'Script' before it could be
checked as a general category. We fix this by special-casing 'sc'.

See also #719

Fixes #835, #899
  • Loading branch information
snsmac authored and BurntSushi committed Mar 4, 2023
1 parent 56a3d80 commit a009f89
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 2 deletions.
9 changes: 7 additions & 2 deletions regex-syntax/src/unicode.rs
Expand Up @@ -188,7 +188,7 @@ impl<'a> ClassQuery<'a> {
fn canonicalize(&self) -> Result<CanonicalClassQuery> {
match *self {
ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
ClassQuery::Binary(name) => self.canonical_binary(name),
ClassQuery::Binary(name) => std::dbg!(self.canonical_binary(name)),
ClassQuery::ByValue { property_name, property_value } => {
let property_name = symbolic_name_normalize(property_name);
let property_value = symbolic_name_normalize(property_value);
Expand Down Expand Up @@ -243,7 +243,12 @@ impl<'a> ClassQuery<'a> {
// a general category. (Currently, we don't even support the
// 'Case_Folding' property. But if we do in the future, users will be
// required to spell it out.)
if norm != "cf" {
//
// Also 'sc' refers to the 'Currency_Symbol' general category, but is
// also the abbreviation for the 'Script' property. So we avoid calling
// 'canonical_prop' for it too, which would erroneously normalize it
// to 'Script'.
if norm != "cf" && norm != "sc" {
if let Some(canon) = canonical_prop(&norm)? {
return Ok(CanonicalClassQuery::Binary(canon));
}
Expand Down
1 change: 1 addition & 0 deletions tests/unicode.rs
Expand Up @@ -77,6 +77,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
// See: https://github.com/rust-lang/regex/issues/719
mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
// 'Sc' abbreviates the 'Currency_Symbol' general category, but it is also
// the abbreviation for the 'Script' property. This checks that \p{Sc}
// resolves to the general category and therefore matches '$'.
// See: https://github.com/rust-lang/regex/issues/835
mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1)));
mat!(
uni_class_gencat_initial_punctuation,
r"\p{Initial_Punctuation}",
Expand Down

0 comments on commit a009f89

Please sign in to comment.