Skip to content

Commit

Permalink
syntax: fix ascii class union bug
Browse files Browse the repository at this point in the history
This fixes a bug in how ASCII class unioning was implemented. Namely, it
previously and erroneously unioned together two classes and then applied
negation/case-folding based on the most recently added class, even if
the class added previously wasn't negated. So for example, given the
regex '[[:alnum:][:^ascii:]]', this would initialize the class with
'[:alnum:]', then add all '[:^ascii:]' codepoints and then negate the
entire thing because of the negation in '[:^ascii:]'. Negating the
entire thing is clearly wrong and not the intended semantics.

We fix this by applying negation/case-folding only to the class we're
dealing with, and then we union it with whatever existing class we're
building.

Fixes #680
  • Loading branch information
BurntSushi committed May 17, 2022
1 parent b537286 commit 5a65e9a
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 11 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
TBD
===

* [BUG #680](https://github.com/rust-lang/regex/issues/680):
Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.


1.5.5 (2022-03-08)
==================
This release fixes a security bug in the regex compiler. This bug permits a
Expand Down
61 changes: 50 additions & 11 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
}
ast::ClassSetItem::Ascii(ref x) => {
if self.flags().unicode() {
let xcls = self.hir_ascii_unicode_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_unicode();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassUnicodeRange::new(s, e));
}
self.unicode_fold_and_negate(
&x.span, x.negated, &mut cls,
)?;
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
} else {
let xcls = self.hir_ascii_byte_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_bytes();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
}
self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
cls.union(&xcls);
self.push(HirFrame::ClassBytes(cls));
}
}
Expand Down Expand Up @@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
result
}

/// Translates a single ASCII class AST node (e.g. `[:alnum:]`) into a
/// standalone Unicode character class.
///
/// The class is seeded with the ranges reported by `ascii_class` for the
/// node's kind and then handed to `unicode_fold_and_negate` together with the
/// node's span and `negated` flag. Because the result is built in isolation,
/// folding and negation affect only this class and not any class the caller
/// later unions it with.
///
/// Any error returned by `unicode_fold_and_negate` is propagated.
fn hir_ascii_unicode_class(
    &self,
    ast: &ast::ClassAscii,
) -> Result<hir::ClassUnicode> {
    let ranges = ascii_class(&ast.kind)
        .iter()
        .map(|&(start, end)| hir::ClassUnicodeRange::new(start, end));
    let mut class = hir::ClassUnicode::new(ranges);
    self.unicode_fold_and_negate(&ast.span, ast.negated, &mut class)?;
    Ok(class)
}

/// Translates a single ASCII class AST node (e.g. `[:alnum:]`) into a
/// standalone byte-oriented character class.
///
/// The class is seeded with the ranges reported by `ascii_class` for the
/// node's kind and then handed to `bytes_fold_and_negate` together with the
/// node's span and `negated` flag. Because the result is built in isolation,
/// folding and negation affect only this class and not any class the caller
/// later unions it with.
///
/// Any error returned by `bytes_fold_and_negate` is propagated.
fn hir_ascii_byte_class(
    &self,
    ast: &ast::ClassAscii,
) -> Result<hir::ClassBytes> {
    // NOTE(review): the `as u8` casts are presumably lossless because the
    // ranges come from `ascii_class` -- confirm every endpoint is <= 0x7F.
    let ranges = ascii_class(&ast.kind)
        .iter()
        .map(|&(start, end)| hir::ClassBytesRange::new(start as u8, end as u8));
    let mut class = hir::ClassBytes::new(ranges);
    self.bytes_fold_and_negate(&ast.span, ast.negated, &mut class)?;
    Ok(class)
}

fn hir_perl_unicode_class(
&self,
ast_class: &ast::ClassPerl,
Expand Down Expand Up @@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
class: &mut hir::ClassBytes,
) -> Result<()> {
// Note that we must apply case folding before negation!
// Consider `(?i)[^x]`. If we applied negation field, then
// Consider `(?i)[^x]`. If we applied negation first, then
// the result would be the character class that matched any
// Unicode scalar value.
if self.flags().case_insensitive() {
Expand Down Expand Up @@ -1943,6 +1963,25 @@ mod tests {
);
}

#[test]
fn class_ascii_multiple() {
    // Regression test for https://github.com/rust-lang/regex/issues/680
    //
    // A bracketed class holding a non-negated ASCII class followed by a
    // negated one must translate to the union of the two, with negation
    // applied only to the second.
    let alnum = ascii_class(&ast::ClassAsciiKind::Alnum);

    // Unicode mode: alnum plus every scalar value above the ASCII range.
    let expected_unicode = hir_union(
        hir_uclass(alnum),
        hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
    );
    assert_eq!(t("[[:alnum:][:^ascii:]]"), expected_unicode);

    // Byte mode: alnum plus every byte above the ASCII range.
    let expected_bytes = hir_union(
        hir_bclass_from_char(alnum),
        hir_bclass(&[(0x80, 0xFF)]),
    );
    assert_eq!(t_bytes("(?-u)[[:alnum:][:^ascii:]]"), expected_bytes);
}

#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl() {
Expand Down

0 comments on commit 5a65e9a

Please sign in to comment.