Skip to content

Commit

Permalink
capture: support [, ] and . in capture group names
Browse files Browse the repository at this point in the history
This slightly expands the set of characters allowed in capture group
names to be `[][_0-9A-Za-z.]` from `[_0-9A-Za-z]`.

This required some delicacy in order to avoid replacement strings like
`$Z[` from referring to invalid capture group names where the intent was
to refer to the capture group named `Z`. That is, in order to use `[`,
`]` or `.` in a capture group name, one must use the explicit brace
syntax: `${Z[}`. We clarify the docs around this issue.

Regrettably, we are not much closer to handling #595. In order to
support, say, all Unicode word characters, our replacement parser would
need to become UTF-8 aware on `&[u8]`. But std makes this difficult and
I would prefer not to add another dependency on ad hoc UTF-8 decoding or
a dependency on another crate.

Closes #649
  • Loading branch information
bruceg authored and BurntSushi committed Oct 12, 2020
1 parent 96456dd commit e1e3692
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 31 deletions.
48 changes: 44 additions & 4 deletions regex-syntax/src/ast/parse.rs
Expand Up @@ -98,12 +98,13 @@ fn is_hex(c: char) -> bool {
/// Returns true if the given character is valid in a capture group name.
///
/// If `first` is true, then `c` is treated as the first character in the
/// group name (which is not allowed to be a digit).
/// group name (which must be alphabetic or underscore).
fn is_capture_char(c: char, first: bool) -> bool {
c == '_'
|| (!first && c >= '0' && c <= '9')
|| (c >= 'a' && c <= 'z')
|| (c >= 'A' && c <= 'Z')
|| (!first
&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
|| ('A' <= c && c <= 'Z')
|| ('a' <= c && c <= 'z')
}

/// A builder for a regular expression parser.
Expand Down Expand Up @@ -3851,6 +3852,45 @@ bar
}))
);

assert_eq!(
parser("(?P<a_1>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..7),
name: s("a_1"),
index: 1,
}),
ast: Box::new(lit('z', 8)),
}))
);

assert_eq!(
parser("(?P<a.1>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..10),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..7),
name: s("a.1"),
index: 1,
}),
ast: Box::new(lit('z', 8)),
}))
);

assert_eq!(
parser("(?P<a[1]>z)").parse(),
Ok(Ast::Group(ast::Group {
span: span(0..11),
kind: ast::GroupKind::CaptureName(ast::CaptureName {
span: span(4..8),
name: s("a[1]"),
index: 1,
}),
ast: Box::new(lit('z', 9)),
}))
);

assert_eq!(
parser("(?P<").parse().unwrap_err(),
TestError {
Expand Down
50 changes: 34 additions & 16 deletions src/expand.rs
Expand Up @@ -24,7 +24,7 @@ pub fn expand_str(
continue;
}
debug_assert!(!replacement.is_empty());
let cap_ref = match find_cap_ref(replacement) {
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
Some(cap_ref) => cap_ref,
None => {
dst.push_str("$");
Expand Down Expand Up @@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> {
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
replacement: &T,
) -> Option<CaptureRef> {
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
let mut i = 0;
let rep: &[u8] = replacement.as_ref();
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
let mut brace = false;
i += 1;
if rep[i] == b'{' {
brace = true;
i += 1;
return find_cap_ref_braced(rep, i + 1);
}
let mut cap_end = i;
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
Expand All @@ -151,12 +147,6 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
// check with either unsafe or by parsing the number straight from &[u8].
let cap =
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
if brace {
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
return None;
}
cap_end += 1;
}
Some(CaptureRef {
cap: match cap.parse::<u32>() {
Ok(i) => Ref::Number(i as usize),
Expand All @@ -166,6 +156,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
})
}

/// Parses the braced form of a capture reference, i.e. the `name}` tail of
/// a `${name}` replacement.
///
/// `i` is the offset in `rep` where the name starts (the caller passes the
/// position just past the opening `{`). The name extends up to, but does not
/// include, the first `}` found at or after `i`.
///
/// Returns `None` when there is no closing `}` or when the bytes between the
/// braces are not valid UTF-8. On success, `end` is the offset one past the
/// closing brace, and `cap` is a numeric reference if the name parses as a
/// `u32`, otherwise a named reference.
fn find_cap_ref_braced(rep: &[u8], i: usize) -> Option<CaptureRef> {
    // Locate the closing brace relative to the start of the name. An
    // out-of-range start or an unterminated brace both yield None.
    let name_len = rep.get(i..)?.iter().position(|&byte| byte == b'}')?;
    let close = i + name_len;
    // Braced names are otherwise unrestricted, so the bytes may be invalid
    // UTF-8. A real capture group name is always valid UTF-8, which means an
    // invalid sequence cannot refer to any group and None is the right answer.
    let name = str::from_utf8(&rep[i..close]).ok()?;
    let cap = if let Ok(index) = name.parse::<u32>() {
        Ref::Number(index as usize)
    } else {
        Ref::Named(name)
    };
    Some(CaptureRef { cap, end: close + 1 })
}

/// Returns true if and only if the given byte is allowed in a capture name.
fn is_valid_cap_letter(b: &u8) -> bool {
match *b {
Expand All @@ -182,13 +197,13 @@ mod tests {
($name:ident, $text:expr) => {
#[test]
fn $name() {
assert_eq!(None, find_cap_ref($text));
assert_eq!(None, find_cap_ref($text.as_bytes()));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
assert_eq!(Some($capref), find_cap_ref($text));
assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
}
};
}
Expand All @@ -204,7 +219,8 @@ mod tests {
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
// see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
// See https://github.com/rust-lang/regex/pull/585
// for more on characters following numbers
find!(find_cap_ref6, "$42a", c!("42a", 4));
find!(find_cap_ref7, "${42}a", c!(42, 5));
find!(find_cap_ref8, "${42");
Expand All @@ -217,4 +233,6 @@ mod tests {
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
find!(find_cap_ref16, "$x-$y", c!("x", 2));
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
find!(find_cap_ref18, "${#}", c!("#", 4));
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
}
2 changes: 1 addition & 1 deletion src/lib.rs
Expand Up @@ -365,7 +365,7 @@ $ the end of text (or end-of-line with multi-line mode)
<pre class="rust">
(exp) numbered capture group (indexed by opening parenthesis)
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
(?P&lt;name&gt;exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
(?:exp) non-capturing group
(?flags) set flags within current group
(?flags:exp) set flags for exp (non-capturing)
Expand Down
15 changes: 10 additions & 5 deletions src/re_bytes.rs
Expand Up @@ -930,17 +930,22 @@ impl<'t> Captures<'t> {
/// Expands all instances of `$name` in `replacement` to the corresponding
/// capture group `name`, and writes them to the `dst` buffer given.
///
/// `name` may be an integer corresponding to the index of the
/// capture group (counted by order of opening parenthesis where `0` is the
/// `name` may be an integer corresponding to the index of the capture
/// group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
/// The longest possible name is used. e.g., `$1a` looks up the capture
/// group named `1a` and not the capture group at index `1`. To exert more
/// precise control over the name, use braces, e.g., `${1}a`.
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
/// capture group at index `1`. To exert more precise control over the
/// name, or to refer to a capture group name that uses characters outside
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
/// using braces, any sequence of valid UTF-8 bytes is permitted. If the
/// sequence does not refer to a capture group name in the corresponding
/// regex, then it is replaced with an empty string.
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
Expand Down
15 changes: 10 additions & 5 deletions src/re_unicode.rs
Expand Up @@ -947,17 +947,22 @@ impl<'t> Captures<'t> {
/// Expands all instances of `$name` in `replacement` to the corresponding
/// capture group `name`, and writes them to the `dst` buffer given.
///
/// `name` may be an integer corresponding to the index of the
/// capture group (counted by order of opening parenthesis where `0` is the
/// `name` may be an integer corresponding to the index of the capture
/// group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
/// The longest possible name is used. e.g., `$1a` looks up the capture
/// group named `1a` and not the capture group at index `1`. To exert more
/// precise control over the name, use braces, e.g., `${1}a`.
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
/// capture group at index `1`. To exert more precise control over the
/// name, or to refer to a capture group name that uses characters outside
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
/// using braces, any sequence of characters is permitted. If the sequence
/// does not refer to a capture group name in the corresponding regex, then
/// it is replaced with an empty string.
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &str, dst: &mut String) {
Expand Down
12 changes: 12 additions & 0 deletions tests/api.rs
Expand Up @@ -195,6 +195,18 @@ expand!(
);
expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");

expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");

split!(
split1,
r"(?-u)\s+",
Expand Down

0 comments on commit e1e3692

Please sign in to comment.