Fix for rust-lang/regex issue #680: `[[:alnum:][:^ascii:]]` dropped `[:alnum:]`.

Before this patch, an ASCII class item pushed its ranges into the enclosing
class and then ran case folding and negation over that whole accumulated class.
After it, each ASCII class item is built as its own class, folded and negated
on its own, and only then unioned into the enclosing class.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8bf571caf..961b5abb8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+TBD
+===
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+  Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+
+
 1.5.5 (2022-03-08)
 ==================
 This releases fixes a security bug in the regex compiler. This bug permits a
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index 99c949302..a7ae9bc38 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
             }
             ast::ClassSetItem::Ascii(ref x) => {
                 if self.flags().unicode() {
+                    let xcls = self.hir_ascii_unicode_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassUnicodeRange::new(s, e));
-                    }
-                    self.unicode_fold_and_negate(
-                        &x.span, x.negated, &mut cls,
-                    )?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassUnicode(cls));
                 } else {
+                    let xcls = self.hir_ascii_byte_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
-                    }
-                    self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassBytes(cls));
                 }
             }
@@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         result
     }
 
+    fn hir_ascii_unicode_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassUnicode> {
+        let mut cls = hir::ClassUnicode::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
+        );
+        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
+    fn hir_ascii_byte_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassBytes> {
+        let mut cls = hir::ClassBytes::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
+        );
+        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
     fn hir_perl_unicode_class(
         &self,
         ast_class: &ast::ClassPerl,
@@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         class: &mut hir::ClassBytes,
     ) -> Result<()> {
         // Note that we must apply case folding before negation!
-        // Consider `(?i)[^x]`. If we applied negation field, then
+        // Consider `(?i)[^x]`. If we applied negation first, then
         // the result would be the character class that matched any
         // Unicode scalar value.
         if self.flags().case_insensitive() {
@@ -1943,6 +1963,25 @@ mod tests {
         );
     }
 
+    #[test]
+    fn class_ascii_multiple() {
+        // See: https://github.com/rust-lang/regex/issues/680
+        assert_eq!(
+            t("[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+            ),
+        );
+        assert_eq!(
+            t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_bclass(&[(0x80, 0xFF)]),
+            ),
+        );
+    }
+
     #[test]
     #[cfg(feature = "unicode-perl")]
     fn class_perl() {