diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index f5b4548b23..55c5f79898 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -98,12 +98,13 @@ fn is_hex(c: char) -> bool { /// Returns true if the given character is a valid in a capture group name. /// /// If `first` is true, then `c` is treated as the first character in the -/// group name (which is not allowed to be a digit). +/// group name (which must be alphabetic or underscore). fn is_capture_char(c: char, first: bool) -> bool { c == '_' - || (!first && c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z') + || (!first + && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) + || ('A' <= c && c <= 'Z') + || ('a' <= c && c <= 'z') } /// A builder for a regular expression parser. @@ -3851,6 +3852,45 @@ bar })) ); + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..10), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + }), + ast: Box::new(lit('z', 8)), + })) + ); + + assert_eq!( + parser("(?Pz)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..11), + kind: ast::GroupKind::CaptureName(ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + }), + ast: Box::new(lit('z', 9)), + })) + ); + assert_eq!( parser("(?P<").parse().unwrap_err(), TestError { diff --git a/src/expand.rs b/src/expand.rs index 528f55e717..fd2ab03acb 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -24,7 +24,7 @@ pub fn expand_str( continue; } debug_assert!(!replacement.is_empty()); - let cap_ref = match find_cap_ref(replacement) { + let cap_ref = match find_cap_ref(replacement.as_bytes()) { 
Some(cap_ref) => cap_ref, None => { dst.push_str("$"); @@ -125,19 +125,15 @@ impl From for Ref<'static> { /// starting at the beginning of `replacement`. /// /// If no such valid reference could be found, None is returned. -fn find_cap_ref>( - replacement: &T, -) -> Option { +fn find_cap_ref(replacement: &[u8]) -> Option { let mut i = 0; let rep: &[u8] = replacement.as_ref(); if rep.len() <= 1 || rep[0] != b'$' { return None; } - let mut brace = false; i += 1; if rep[i] == b'{' { - brace = true; - i += 1; + return find_cap_ref_braced(rep, i + 1); } let mut cap_end = i; while rep.get(cap_end).map_or(false, is_valid_cap_letter) { @@ -151,12 +147,6 @@ fn find_cap_ref>( // check with either unsafe or by parsing the number straight from &[u8]. let cap = str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name"); - if brace { - if !rep.get(cap_end).map_or(false, |&b| b == b'}') { - return None; - } - cap_end += 1; - } Some(CaptureRef { cap: match cap.parse::() { Ok(i) => Ref::Number(i as usize), @@ -166,6 +156,31 @@ fn find_cap_ref>( }) } +fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option { + let start = i; + while rep.get(i).map_or(false, |&b| b != b'}') { + i += 1; + } + if !rep.get(i).map_or(false, |&b| b == b'}') { + return None; + } + // When looking at braced names, we don't put any restrictions on the name, + // so it's possible it could be invalid UTF-8. But a capture group name + // can never be invalid UTF-8, so if we have invalid UTF-8, then we can + // safely return None. + let cap = match str::from_utf8(&rep[start..i]) { + Err(_) => return None, + Ok(cap) => cap, + }; + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: i + 1, + }) +} + /// Returns true if and only if the given byte is allowed in a capture name. 
fn is_valid_cap_letter(b: &u8) -> bool { match *b { @@ -182,13 +197,13 @@ mod tests { ($name:ident, $text:expr) => { #[test] fn $name() { - assert_eq!(None, find_cap_ref($text)); + assert_eq!(None, find_cap_ref($text.as_bytes())); } }; ($name:ident, $text:expr, $capref:expr) => { #[test] fn $name() { - assert_eq!(Some($capref), find_cap_ref($text)); + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); } }; } @@ -204,7 +219,8 @@ mod tests { find!(find_cap_ref3, "$0", c!(0, 2)); find!(find_cap_ref4, "$5", c!(5, 2)); find!(find_cap_ref5, "$10", c!(10, 3)); - // see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers + // See https://github.com/rust-lang/regex/pull/585 + // for more on characters following numbers find!(find_cap_ref6, "$42a", c!("42a", 4)); find!(find_cap_ref7, "${42}a", c!(42, 5)); find!(find_cap_ref8, "${42"); @@ -217,4 +233,6 @@ mod tests { find!(find_cap_ref15, "$1_$2", c!("1_", 3)); find!(find_cap_ref16, "$x-$y", c!("x", 2)); find!(find_cap_ref17, "$x_$y", c!("x_", 3)); + find!(find_cap_ref18, "${#}", c!("#", 4)); + find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); } diff --git a/src/lib.rs b/src/lib.rs index 5534179764..bdcebd44b3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -365,7 +365,7 @@ $ the end of text (or end-of-line with multi-line mode)
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
+(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index 69f0b335de..b5a7219d79 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -930,17 +930,22 @@ impl<'t> Captures<'t> {
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of valid UTF-8 bytes up to the first `}` is
+    /// permitted. If the sequence does not refer to a capture group in the
+    /// corresponding regex, then it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index 685d4046de..c23b7719bc 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -947,17 +947,22 @@ impl<'t> Captures<'t> {
     /// Expands all instances of `$name` in `replacement` to the corresponding
     /// capture group `name`, and writes them to the `dst` buffer given.
     ///
-    /// `name` may be an integer corresponding to the index of the
-    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// `name` may be an integer corresponding to the index of the capture
+    /// group (counted by order of opening parenthesis where `0` is the
     /// entire match) or it can be a name (consisting of letters, digits or
     /// underscores) corresponding to a named capture group.
     ///
     /// If `name` isn't a valid capture group (whether the name doesn't exist
     /// or isn't a valid index), then it is replaced with the empty string.
     ///
-    /// The longest possible name is used. e.g., `$1a` looks up the capture
-    /// group named `1a` and not the capture group at index `1`. To exert more
-    /// precise control over the name, use braces, e.g., `${1}a`.
+    /// The longest possible name consisting of the characters `[_0-9A-Za-z]`
+    /// is used. e.g., `$1a` looks up the capture group named `1a` and not the
+    /// capture group at index `1`. To exert more precise control over the
+    /// name, or to refer to a capture group name that uses characters outside
+    /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
+    /// using braces, any sequence of characters up to the first `}` is
+    /// permitted. If the sequence does not refer to a capture group in the
+    /// corresponding regex, then it is replaced with an empty string.
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &str, dst: &mut String) {
diff --git a/tests/api.rs b/tests/api.rs
index 0d4962cc9f..c7250a8a3a 100644
--- a/tests/api.rs
+++ b/tests/api.rs
@@ -195,6 +195,18 @@ expand!(
 );
 expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
 
+expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
+expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
+expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
+expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
+expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
+expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
+expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
+expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
+expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
+expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
+expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
+
 split!(
     split1,
     r"(?-u)\s+",