From 1c19619672c2ef16dc9f64fec38af5719c4ec06c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 20 May 2022 13:30:04 -0400 Subject: [PATCH] syntax: fix literal extraction for 'ab??' Previously, 'ab??' returned [Complete(ab), Complete(a)], but the order matters here because of greediness. The correct result is [Complete(a), Complete(ab)]. Instead of trying to actually fix literal extraction (which is a mess), we just rewrite 'ab?' (and 'ab??') as 'ab*'. 'ab*' still produces literals in the incorrect order, i.e., [Cut(ab), Complete(a)], but since one is cut we are guaranteed that the regex engine will be called to confirm the match. In so doing, it will correctly report 'a' as a match for 'ab??' in 'ab'. Fixes #862 --- CHANGELOG.md | 2 ++ regex-syntax/src/hir/literal/mod.rs | 33 +++++++++++++++++------------ tests/regression.rs | 3 +++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd612e72ac..fee39b5e9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ The below are changes for the next release, which is to be determined. Fixes a bug where `[[:alnum:][:^ascii:]]` dropped `[:alnum:]` from the class. * [BUG #859](https://github.com/rust-lang/regex/issues/859): Fixes a bug where `Hir::is_match_empty` returned `false` for `\b`. +* [BUG #862](https://github.com/rust-lang/regex/issues/862): + Fixes a bug where 'ab??' matches 'ab' instead of 'a' in 'ab'. 1.5.5 (2022-03-08) diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 25ee88b065..1e66d2cc3d 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -735,18 +735,18 @@ fn repeat_zero_or_one_literals( lits: &mut Literals, mut f: F, ) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } + f( + &Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + // FIXME: Our literal extraction doesn't care about greediness. + // Which is partially why we're treating 'e?' as 'e*'. Namely, + // 'ab??' yields [Complete(ab), Complete(a)], but it should yield + // [Complete(a), Complete(ab)] because of the non-greediness. + greedy: true, + hir: Box::new(e.clone()), + }), + lits, + ); } fn repeat_zero_or_more_literals( @@ -1141,6 +1141,11 @@ mod tests { test_lit!(pfx_group1, prefixes, "(a)", M("a")); test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); + test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); + // FIXME: This should return [M("a"), M("ab")] because of the non-greedy + // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get + // a cut literal. + test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); @@ -1249,8 +1254,8 @@ mod tests { pfx_crazy1, prefixes, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'am"), - C("Mu\\'am"), + C("Mo\\'"), + C("Mu\\'"), C("Moam"), C("Muam") ); diff --git a/tests/regression.rs b/tests/regression.rs index 44b90832b8..e8b2525385 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -217,3 +217,6 @@ matiter!( // https://en.wikipedia.org/wiki/Je_(Cyrillic) ismatch!(empty_group_match, r"()Ј01", "zЈ01", true); matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5)); + +// See: https://github.com/rust-lang/regex/issues/862 +mat!(non_greedy_question_literal, r"ab??", "ab", Some((0, 1)));