Skip to content

Commit

Permalink
syntax: fix literal extraction for 'ab??'
Browse files Browse the repository at this point in the history
Previously, 'ab??' returned [Complete(ab), Complete(a)], but the order
matters here because the repetition is non-greedy: 'a' must be preferred
over 'ab'. The correct result is [Complete(a), Complete(ab)].

Instead of trying to actually fix literal extraction (which is a mess),
we just rewrite 'ab?' (and 'ab??') as 'ab*'. 'ab*' still produces
literals in the incorrect order, i.e., [Cut(ab), Complete(a)], but since
one is cut we are guaranteed that the regex engine will be called to
confirm the match. In so doing, it will correctly report 'a' as a match
for 'ab??' in 'ab'.

Fixes #862
  • Loading branch information
BurntSushi committed May 20, 2022
1 parent 88a2a62 commit 04dc15a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 14 deletions.
33 changes: 19 additions & 14 deletions regex-syntax/src/hir/literal/mod.rs
Expand Up @@ -735,18 +735,18 @@ fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
lits: &mut Literals,
mut f: F,
) {
let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
lits3.set_limit_size(lits.limit_size() / 2);
f(e, &mut lits3);

if lits3.is_empty() || !lits2.cross_product(&lits3) {
lits.cut();
return;
}
lits2.add(Literal::empty());
if !lits.union(lits2) {
lits.cut();
}
f(
&Hir::repetition(hir::Repetition {
kind: hir::RepetitionKind::ZeroOrMore,
// FIXME: Our literal extraction doesn't care about greediness.
// Which is partially why we're treating 'e?' as 'e*'. Namely,
// 'ab??' yields [Complete(ab), Complete(a)], but it should yield
// [Complete(a), Complete(ab)] because of the non-greediness.
greedy: true,
hir: Box::new(e.clone()),
}),
lits,
);
}

fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
Expand Down Expand Up @@ -1141,6 +1141,11 @@ mod tests {
test_lit!(pfx_group1, prefixes, "(a)", M("a"));
test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
// FIXME: This should return [M("a"), M("ab")] because of the non-greedy
// repetition. As a work-around, we rewrite ab?? as the greedy ab*, and
// thus we get a cut literal.
test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
Expand Down Expand Up @@ -1249,8 +1254,8 @@ mod tests {
pfx_crazy1,
prefixes,
r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
C("Mo\\'am"),
C("Mu\\'am"),
C("Mo\\'"),
C("Mu\\'"),
C("Moam"),
C("Muam")
);
Expand Down
3 changes: 3 additions & 0 deletions tests/regression.rs
Expand Up @@ -217,3 +217,6 @@ matiter!(
// https://en.wikipedia.org/wiki/Je_(Cyrillic)
ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));

// See: https://github.com/rust-lang/regex/issues/862
mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));

0 comments on commit 04dc15a

Please sign in to comment.