From fc06d1a7eae683f1cff474e2158df3b5e265a4e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 28 Apr 2018 12:59:47 -0400 Subject: [PATCH] regex: disable octal syntax by default This commit disables octal syntax by default, which will permit us to produce useful error messages if a user tries to invoke a backreference. This commit adds a new `octal` method to RegexBuilder and RegexSetBuilder which permits callers to re-enable octal syntax. See #457 --- src/exec.rs | 6 +----- src/lib.rs | 5 +++-- src/re_builder.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ tests/test_default.rs | 10 ++++++++++ 4 files changed, 56 insertions(+), 7 deletions(-) diff --git a/src/exec.rs b/src/exec.rs index 73f9550970..9133a0ddd4 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -218,11 +218,7 @@ impl ExecBuilder { for pat in &self.options.pats { let mut parser = ParserBuilder::new() - // TODO(burntsushi): Disable octal in regex 1.0. Nobody - // uses it, and we'll get better error messages when - // someone tries to use a backreference. Provide a new - // opt-in toggle for it though. - .octal(true) + .octal(self.options.octal) .case_insensitive(self.options.case_insensitive) .multi_line(self.options.multi_line) .dot_matches_new_line(self.options.dot_matches_new_line) diff --git a/src/lib.rs b/src/lib.rs index f82eb9c1ce..7d768eea76 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -445,7 +445,7 @@ assert_eq!(&cap[0], "abc"); \n new line \r carriage return \v vertical tab (\x0B) -\123 octal character code (up to three digits) +\123 octal character code (up to three digits) (when enabled) \x7F hex character code (exactly two digits) \x{10FFFF} any hex character code corresponding to a Unicode code point \u007F hex character code (exactly four digits) @@ -619,7 +619,8 @@ determine whether a byte is a word byte or not. 5. Hexadecimal notation can be used to specify arbitrary bytes instead of Unicode codepoints. 
For example, in ASCII compatible mode, `\xFF` matches the literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that -matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation. +matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when +enabled. 6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value. When the `s` flag is enabled, `.` matches any byte. diff --git a/src/re_builder.rs b/src/re_builder.rs index ae60ecffa4..ca3d62536a 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -22,6 +22,7 @@ pub struct RegexOptions { pub swap_greed: bool, pub ignore_whitespace: bool, pub unicode: bool, + pub octal: bool, } impl Default for RegexOptions { @@ -37,6 +38,7 @@ impl Default for RegexOptions { swap_greed: false, ignore_whitespace: false, unicode: true, + octal: false, } } } @@ -142,6 +144,26 @@ impl RegexBuilder { self } + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.0.octal = yes; + self + } + /// Set the approximate size limit of the compiled regular expression. 
/// /// This roughly corresponds to the number of bytes occupied by a single @@ -283,6 +305,26 @@ impl RegexSetBuilder { self } + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.0.octal = yes; + self + } + /// Set the approximate size limit of the compiled regular expression. /// /// This roughly corresponds to the number of bytes occupied by a single diff --git a/tests/test_default.rs b/tests/test_default.rs index e6cf92fa2e..0f8a0c2dba 100644 --- a/tests/test_default.rs +++ b/tests/test_default.rs @@ -75,3 +75,13 @@ fn disallow_non_utf8() { assert!(regex::Regex::new(r"(?-u)[\xFF]").is_err()); assert!(regex::Regex::new(r"(?-u)☃").is_err()); } + +#[test] +fn disallow_octal() { + assert!(regex::Regex::new(r"\0").is_err()); +} + +#[test] +fn allow_octal() { + assert!(regex::RegexBuilder::new(r"\0").octal(true).build().is_ok()); +}