Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix ascii class bug and 'Hir::is_match_empty' bug #860

Merged
merged 2 commits into from May 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,13 @@
TBD
===
The below are changes for the next release, which is to be determined.

* [BUG #680](https://github.com/rust-lang/regex/issues/680):
Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
* [BUG #859](https://github.com/rust-lang/regex/issues/859):
Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.


1.5.5 (2022-03-08)
==================
This release fixes a security bug in the regex compiler. This bug permits a
Expand Down
14 changes: 9 additions & 5 deletions regex-syntax/src/hir/mod.rs
Expand Up @@ -334,9 +334,13 @@ impl Hir {
info.set_any_anchored_end(false);
info.set_literal(false);
info.set_alternation_literal(false);
// A negated word boundary matches the empty string, but a normal
// word boundary does not!
info.set_match_empty(word_boundary.is_negated());
// A negated word boundary matches '', so that's fine. But \b does not
// match '', so why do we say it can match the empty string? Well,
// because, if you search for \b against 'a', it will report [0, 0) and
// [1, 1) as matches, and both of those matches correspond to the empty
// string. Thus, only *certain* empty strings match \b, which similarly
// applies to \B.
info.set_match_empty(true);
// Negated ASCII word boundaries can match invalid UTF-8.
if let WordBoundary::AsciiNegate = word_boundary {
info.set_always_utf8(false);
Expand Down Expand Up @@ -661,8 +665,8 @@ impl Hir {
/// Return true if and only if the empty string is part of the language
/// matched by this regular expression.
///
/// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
/// but not `a`, `a+` or `\b`.
/// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
/// and `\B`, but not `a` or `a+`.
pub fn is_match_empty(&self) -> bool {
// Plain accessor: the answer is read from the `info` attached to this
// expression rather than being computed here.
self.info.is_match_empty()
}
Expand Down
68 changes: 55 additions & 13 deletions regex-syntax/src/hir/translate.rs
Expand Up @@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
}
ast::ClassSetItem::Ascii(ref x) => {
if self.flags().unicode() {
let xcls = self.hir_ascii_unicode_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_unicode();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassUnicodeRange::new(s, e));
}
self.unicode_fold_and_negate(
&x.span, x.negated, &mut cls,
)?;
cls.union(&xcls);
self.push(HirFrame::ClassUnicode(cls));
} else {
let xcls = self.hir_ascii_byte_class(x)?;
let mut cls = self.pop().unwrap().unwrap_class_bytes();
for &(s, e) in ascii_class(&x.kind) {
cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
}
self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
cls.union(&xcls);
self.push(HirFrame::ClassBytes(cls));
}
}
Expand Down Expand Up @@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
result
}

/// Translates a bracketed ASCII class item (e.g. `[:alnum:]`) into a
/// standalone Unicode class.
///
/// The ranges come from `ascii_class` for the item's kind. The resulting
/// class is then handed to `unicode_fold_and_negate` together with the
/// item's span and negation flag, and any error from that step is
/// propagated to the caller.
fn hir_ascii_unicode_class(
    &self,
    ast: &ast::ClassAscii,
) -> Result<hir::ClassUnicode> {
    let ranges = ascii_class(&ast.kind)
        .iter()
        .map(|&(start, end)| hir::ClassUnicodeRange::new(start, end));
    let mut class = hir::ClassUnicode::new(ranges);
    self.unicode_fold_and_negate(&ast.span, ast.negated, &mut class)?;
    Ok(class)
}

/// Translates a bracketed ASCII class item (e.g. `[:alnum:]`) into a
/// standalone byte class.
///
/// The ranges come from `ascii_class` for the item's kind, with each
/// endpoint narrowed to a `u8`. The resulting class is then handed to
/// `bytes_fold_and_negate` together with the item's span and negation
/// flag, and any error from that step is propagated to the caller.
fn hir_ascii_byte_class(
    &self,
    ast: &ast::ClassAscii,
) -> Result<hir::ClassBytes> {
    // NOTE(review): the ASCII class tables are expected to contain only
    // ASCII code points, so the narrowing casts should be lossless.
    let ranges = ascii_class(&ast.kind).iter().map(|&(start, end)| {
        hir::ClassBytesRange::new(start as u8, end as u8)
    });
    let mut class = hir::ClassBytes::new(ranges);
    self.bytes_fold_and_negate(&ast.span, ast.negated, &mut class)?;
    Ok(class)
}

fn hir_perl_unicode_class(
&self,
ast_class: &ast::ClassPerl,
Expand Down Expand Up @@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
class: &mut hir::ClassBytes,
) -> Result<()> {
// Note that we must apply case folding before negation!
// Consider `(?i)[^x]`. If we applied negation field, then
// Consider `(?i)[^x]`. If we applied negation first, then
// the result would be the character class that matched any
// Unicode scalar value.
if self.flags().case_insensitive() {
Expand Down Expand Up @@ -1943,6 +1963,25 @@ mod tests {
);
}

#[test]
fn class_ascii_multiple() {
    // Regression test for https://github.com/rust-lang/regex/issues/680:
    // a class containing `[:alnum:]` followed by `[:^ascii:]` must
    // translate to the union of both items.

    // Unicode mode.
    let got_unicode = t("[[:alnum:][:^ascii:]]");
    let want_unicode = hir_union(
        hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
        hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
    );
    assert_eq!(got_unicode, want_unicode);

    // Byte mode (Unicode disabled).
    let got_bytes = t_bytes("(?-u)[[:alnum:][:^ascii:]]");
    let want_bytes = hir_union(
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
        hir_bclass(&[(0x80, 0xFF)]),
    );
    assert_eq!(got_bytes, want_bytes);
}

#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl() {
Expand Down Expand Up @@ -3100,6 +3139,9 @@ mod tests {
assert!(t(r"\pL*").is_match_empty());
assert!(t(r"a*|b").is_match_empty());
assert!(t(r"b|a*").is_match_empty());
assert!(t(r"a|").is_match_empty());
assert!(t(r"|a").is_match_empty());
assert!(t(r"a||b").is_match_empty());
assert!(t(r"a*a?(abcd)*").is_match_empty());
assert!(t(r"^").is_match_empty());
assert!(t(r"$").is_match_empty());
Expand All @@ -3109,6 +3151,8 @@ mod tests {
assert!(t(r"\z").is_match_empty());
assert!(t(r"\B").is_match_empty());
assert!(t_bytes(r"(?-u)\B").is_match_empty());
assert!(t(r"\b").is_match_empty());
assert!(t(r"(?-u)\b").is_match_empty());

// Negative examples.
assert!(!t(r"a+").is_match_empty());
Expand All @@ -3118,8 +3162,6 @@ mod tests {
assert!(!t(r"a{1,10}").is_match_empty());
assert!(!t(r"b|a").is_match_empty());
assert!(!t(r"a*a+(abcd)*").is_match_empty());
assert!(!t(r"\b").is_match_empty());
assert!(!t(r"(?-u)\b").is_match_empty());
}

#[test]
Expand Down