rust-lang · BurntSushi · May 18, 2022 · May 17, 2022 · May 17, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+TBD
+===
+The below are changes for the next release, which is to be determined.
+
+* [BUG #680](https://github.com/rust-lang/regex/issues/680):
+  Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class.
+* [BUG #859](https://github.com/rust-lang/regex/issues/859):
+  Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`.
+
+
 1.5.5 (2022-03-08)
 ==================
 This releases fixes a security bug in the regex compiler. This bug permits a

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
@@ -334,9 +334,13 @@ impl Hir {
         info.set_any_anchored_end(false);
         info.set_literal(false);
         info.set_alternation_literal(false);
-        // A negated word boundary matches the empty string, but a normal
-        // word boundary does not!
-        info.set_match_empty(word_boundary.is_negated());
+        // A negated word boundary matches '', so that's fine. But \b does not
+        // match \b, so why do we say it can match the empty string? Well,
+        // because, if you search for \b against 'a', it will report [0, 0) and
+        // [1, 1) as matches, and both of those matches correspond to the empty
+        // string. Thus, only *certain* empty strings match \b, which similarly
+        // applies to \B.
+        info.set_match_empty(true);
         // Negated ASCII word boundaries can match invalid UTF-8.
         if let WordBoundary::AsciiNegate = word_boundary {
             info.set_always_utf8(false);
@@ -661,8 +665,8 @@ impl Hir {
     /// Return true if and only if the empty string is part of the language
     /// matched by this regular expression.
     ///
-    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
-    /// but not `a`, `a+` or `\b`.
+    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
+    /// and `\B`, but not `a` or `a+`.
     pub fn is_match_empty(&self) -> bool {
         self.info.is_match_empty()
     }

diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
@@ -434,20 +434,14 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
             }
             ast::ClassSetItem::Ascii(ref x) => {
                 if self.flags().unicode() {
+                    let xcls = self.hir_ascii_unicode_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassUnicodeRange::new(s, e));
-                    }
-                    self.unicode_fold_and_negate(
-                        &x.span, x.negated, &mut cls,
-                    )?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassUnicode(cls));
                 } else {
+                    let xcls = self.hir_ascii_byte_class(x)?;
                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
-                    for &(s, e) in ascii_class(&x.kind) {
-                        cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
-                    }
-                    self.bytes_fold_and_negate(&x.span, x.negated, &mut cls)?;
+                    cls.union(&xcls);
                     self.push(HirFrame::ClassBytes(cls));
                 }
             }
@@ -853,6 +847,32 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         result
     }
 
+    fn hir_ascii_unicode_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassUnicode> {
+        let mut cls = hir::ClassUnicode::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)),
+        );
+        self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
+    fn hir_ascii_byte_class(
+        &self,
+        ast: &ast::ClassAscii,
+    ) -> Result<hir::ClassBytes> {
+        let mut cls = hir::ClassBytes::new(
+            ascii_class(&ast.kind)
+                .iter()
+                .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)),
+        );
+        self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
+        Ok(cls)
+    }
+
     fn hir_perl_unicode_class(
         &self,
         ast_class: &ast::ClassPerl,
@@ -948,7 +968,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         class: &mut hir::ClassBytes,
     ) -> Result<()> {
         // Note that we must apply case folding before negation!
-        // Consider `(?i)[^x]`. If we applied negation field, then
+        // Consider `(?i)[^x]`. If we applied negation first, then
         // the result would be the character class that matched any
         // Unicode scalar value.
         if self.flags().case_insensitive() {
@@ -1943,6 +1963,25 @@ mod tests {
         );
     }
 
+    #[test]
+    fn class_ascii_multiple() {
+        // See: https://github.com/rust-lang/regex/issues/680
+        assert_eq!(
+            t("[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
+            ),
+        );
+        assert_eq!(
+            t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
+            hir_union(
+                hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)),
+                hir_bclass(&[(0x80, 0xFF)]),
+            ),
+        );
+    }
+
     #[test]
     #[cfg(feature = "unicode-perl")]
     fn class_perl() {
@@ -3100,6 +3139,9 @@ mod tests {
         assert!(t(r"\pL*").is_match_empty());
         assert!(t(r"a*|b").is_match_empty());
         assert!(t(r"b|a*").is_match_empty());
+        assert!(t(r"a|").is_match_empty());
+        assert!(t(r"|a").is_match_empty());
+        assert!(t(r"a||b").is_match_empty());
         assert!(t(r"a*a?(abcd)*").is_match_empty());
         assert!(t(r"^").is_match_empty());
         assert!(t(r"$").is_match_empty());
@@ -3109,6 +3151,8 @@ mod tests {
         assert!(t(r"\z").is_match_empty());
         assert!(t(r"\B").is_match_empty());
         assert!(t_bytes(r"(?-u)\B").is_match_empty());
+        assert!(t(r"\b").is_match_empty());
+        assert!(t(r"(?-u)\b").is_match_empty());
 
         // Negative examples.
         assert!(!t(r"a+").is_match_empty());
@@ -3118,8 +3162,6 @@ mod tests {
         assert!(!t(r"a{1,10}").is_match_empty());
         assert!(!t(r"b|a").is_match_empty());
         assert!(!t(r"a*a+(abcd)*").is_match_empty());
-        assert!(!t(r"\b").is_match_empty());
-        assert!(!t(r"(?-u)\b").is_match_empty());
     }
 
     #[test]