Skip to content

Commit

Permalink
exec: fix a bug in capture with match
Browse files Browse the repository at this point in the history
When performing "EndText" matching, it is necessary to check whether
the current position matches the input text length. However, when
capturing a submatch using the matching result of DFA, "EndText"
matching wasn't actually performed correctly because the input text is
sliced.

By applying this patch we specify the match end position by the
argument "end", not using slice when performing capture with the
matching result of DFA.

Fixes #557, Closes #561
  • Loading branch information
pipopa authored and BurntSushi committed Mar 30, 2019
1 parent 6152de4 commit 3563d73
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 46 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ Performance improvements:
* [OPT #566](https://github.com/rust-lang/regex/pull/566):
Upgrades `aho-corasick` to 0.7 and uses it for `foo|bar|...|quux` regexes.

Bug fixes:

* [BUG #557](https://github.com/rust-lang/regex/issues/557):
Fix a bug where captures could lead to an incorrect match.


1.1.2 (2019-02-27)
==================
Expand Down
7 changes: 4 additions & 3 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
slots: &'s mut [Slot],
input: I,
start: usize,
end: usize,
) -> bool {
let mut cache = cache.borrow_mut();
let cache = &mut cache.backtrack;
Expand All @@ -109,7 +110,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
slots: slots,
m: cache,
};
b.exec_(start)
b.exec_(start, end)
}

/// Clears the cache such that the backtracking engine can be executed
Expand Down Expand Up @@ -147,7 +148,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {

/// Start backtracking at the given position in the input, but also look
/// for literal prefixes.
fn exec_(&mut self, mut at: InputAt) -> bool {
fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
self.clear();
// If this is an anchored regex at the beginning of the input, then
// we're either already done or we only need to try backtracking once.
Expand All @@ -170,7 +171,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
if matched && self.prog.matches.len() == 1 {
return true;
}
if at.is_end() {
if at.pos() == end {
break;
}
at = self.input.at(at.next_pos());
Expand Down
98 changes: 57 additions & 41 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

use std::cell::RefCell;
use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
Expand Down Expand Up @@ -589,7 +588,8 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
match self.ro.match_type {
MatchType::Literal(ty) => {
self.find_literals(ty, text, start).and_then(|(s, e)| {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(
MatchNfaType::Auto, slots, text, s, e)
})
}
MatchType::Dfa => {
Expand All @@ -598,17 +598,21 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
} else {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(
MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
dfa::Result::Quit => {
self.captures_nfa(slots, text, start)
}
}
}
}
MatchType::DfaAnchoredReverse => {
match self.find_dfa_anchored_reverse(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(
MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
Expand All @@ -617,14 +621,15 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
MatchType::DfaSuffix => {
match self.find_dfa_reverse_suffix(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
self.captures_nfa_type(
MatchNfaType::Auto, slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
MatchType::Nfa(ty) => {
self.captures_nfa_type(ty, slots, text, start)
self.captures_nfa_type(ty, slots, text, start, text.len())
}
MatchType::Nothing => None,
MatchType::DfaMany => {
Expand Down Expand Up @@ -867,7 +872,7 @@ impl<'c> ExecNoSync<'c> {
text: &[u8],
start: usize,
) -> bool {
self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
}

/// Finds the shortest match using an NFA.
Expand All @@ -883,7 +888,15 @@ impl<'c> ExecNoSync<'c> {
start: usize,
) -> Option<usize> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
if self.exec_nfa(
ty,
&mut [false],
&mut slots,
true,
text,
start,
text.len()
) {
slots[1]
} else {
None
Expand All @@ -898,7 +911,15 @@ impl<'c> ExecNoSync<'c> {
start: usize,
) -> Option<(usize, usize)> {
let mut slots = [None, None];
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
if self.exec_nfa(
ty,
&mut [false],
&mut slots,
false,
text,
start,
text.len()
) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
Expand All @@ -908,26 +929,6 @@ impl<'c> ExecNoSync<'c> {
}
}

/// Like find_nfa, but fills in captures and restricts the search space
/// using previously found match information.
///
/// `slots` should have length equal to `2 * nfa.captures.len()`.
fn captures_nfa_with_match(
&self,
slots: &mut [Slot],
text: &[u8],
match_start: usize,
match_end: usize,
) -> Option<(usize, usize)> {
// We can't use match_end directly, because we may need to examine one
// "character" after the end of a match for lookahead operators. We
// need to move two characters beyond the end, since some look-around
// operations may falsely assume a premature end of text otherwise.
let e = cmp::min(
next_utf8(text, next_utf8(text, match_end)), text.len());
self.captures_nfa(slots, &text[..e], match_start)
}

/// Like find_nfa, but fills in captures.
///
/// `slots` should have length equal to `2 * nfa.captures.len()`.
Expand All @@ -937,7 +938,8 @@ impl<'c> ExecNoSync<'c> {
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
self.captures_nfa_type(
MatchNfaType::Auto, slots, text, start, text.len())
}

/// Like captures_nfa, but allows specification of type of NFA engine.
Expand All @@ -947,8 +949,9 @@ impl<'c> ExecNoSync<'c> {
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize,
) -> Option<(usize, usize)> {
if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
Expand All @@ -966,6 +969,7 @@ impl<'c> ExecNoSync<'c> {
quit_after_match: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
use self::MatchNfaType::*;
if let Auto = ty {
Expand All @@ -977,10 +981,10 @@ impl<'c> ExecNoSync<'c> {
}
match ty {
Auto => unreachable!(),
Backtrack => self.exec_backtrack(matches, slots, text, start),
Backtrack => self.exec_backtrack(matches, slots, text, start, end),
PikeVM => {
self.exec_pikevm(
matches, slots, quit_after_match, text, start)
matches, slots, quit_after_match, text, start, end)
}
}
}
Expand All @@ -993,6 +997,7 @@ impl<'c> ExecNoSync<'c> {
quit_after_match: bool,
text: &[u8],
start: usize,
end: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
pikevm::Fsm::exec(
Expand All @@ -1002,7 +1007,8 @@ impl<'c> ExecNoSync<'c> {
slots,
quit_after_match,
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
start,
end)
} else {
pikevm::Fsm::exec(
&self.ro.nfa,
Expand All @@ -1011,7 +1017,8 @@ impl<'c> ExecNoSync<'c> {
slots,
quit_after_match,
CharInput::new(text),
start)
start,
end)
}
}

Expand All @@ -1022,6 +1029,7 @@ impl<'c> ExecNoSync<'c> {
slots: &mut [Slot],
text: &[u8],
start: usize,
end: usize,
) -> bool {
if self.ro.nfa.uses_bytes() {
backtrack::Bounded::exec(
Expand All @@ -1030,15 +1038,17 @@ impl<'c> ExecNoSync<'c> {
matches,
slots,
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
start,
end)
} else {
backtrack::Bounded::exec(
&self.ro.nfa,
self.cache,
matches,
slots,
CharInput::new(text),
start)
start,
end)
}
}

Expand Down Expand Up @@ -1082,11 +1092,15 @@ impl<'c> ExecNoSync<'c> {
&mut [],
false,
text,
start)
start,
text.len())
}
}
}
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
Nfa(ty) => {
self.exec_nfa(
ty, matches, &mut [], false, text, start, text.len())
}
Nothing => false,
}
}
Expand Down Expand Up @@ -1118,7 +1132,9 @@ impl Exec {
/// Get a searcher that isn't Sync.
#[inline(always)] // reduces constant overhead
pub fn searcher(&self) -> ExecNoSync {
let create = || Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)));
let create = || {
Box::new(RefCell::new(ProgramCacheInner::new(&self.ro)))
};
ExecNoSync {
ro: &self.ro, // a clone is too expensive here! (and not needed)
cache: self.cache.get_or(create),
Expand Down
5 changes: 4 additions & 1 deletion src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ impl<'r, I: Input> Fsm<'r, I> {
quit_after_match: bool,
input: I,
start: usize,
end: usize,
) -> bool {
let mut cache = cache.borrow_mut();
let cache = &mut cache.pikevm;
Expand All @@ -124,6 +125,7 @@ impl<'r, I: Input> Fsm<'r, I> {
slots,
quit_after_match,
at,
end,
)
}

Expand All @@ -135,6 +137,7 @@ impl<'r, I: Input> Fsm<'r, I> {
slots: &mut [Slot],
quit_after_match: bool,
mut at: InputAt,
end: usize,
) -> bool {
let mut matched = false;
let mut all_matched = false;
Expand Down Expand Up @@ -212,7 +215,7 @@ impl<'r, I: Input> Fsm<'r, I> {
}
}
}
if at.is_end() {
if at.pos() == end {
break;
}
at = at_next;
Expand Down
7 changes: 6 additions & 1 deletion tests/regression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,13 @@ ismatch!(reverse_suffix2, r"\d\d\d000", "153.230000\n", true);
matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10));

// See: https://github.com/rust-lang/regex/issues/334
mat!(captures_after_dfa_premature_end, r"a(b*(X|$))?", "abcbX",
// See: https://github.com/rust-lang/regex/issues/557
mat!(captures_after_dfa_premature_end1, r"a(b*(X|$))?", "abcbX",
Some((0, 1)), None, None);
mat!(captures_after_dfa_premature_end2, r"a(bc*(X|$))?", "abcbX",
Some((0, 1)), None, None);
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz",
Some((0, 0)));

// See: https://github.com/rust-lang/regex/issues/437
ismatch!(
Expand Down

0 comments on commit 3563d73

Please sign in to comment.