From 036ce80c93591b1416556bd142aa1d6749db6d91 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 1 May 2021 07:22:49 -0400 Subject: [PATCH] compiler: fix lazy DFA false quits on ASCII text One of the things the lazy DFA can't handle is Unicode word boundaries, since it requires multi-byte look-around. However, it turns out that on pure ASCII text, Unicode word boundaries are equivalent to ASCII word boundaries. So the DFA has a heuristic: it treats Unicode word boundaries as ASCII boundaries until it sees a non-ASCII byte. When it does, it quits, and some other (slower) regex engine needs to take over. In a bug report against ripgrep[1], it was discovered that the lazy DFA was quitting and falling back to a slower engine even though the haystack was pure ASCII. It turned out that our equivalence byte class optimization was at fault. Namely, a '{' (which appears very frequently in the input) was being grouped in with other non-ASCII bytes. So whenever the DFA saw it, it treated it as a non-ASCII byte and thus stopped. The fix for this is simple: when we see a Unicode word boundary in the compiler, we set a boundary on our byte classes such that ASCII bytes are guaranteed to be in a different class from non-ASCII bytes. And indeed, this fixes the performance problem reported in [1]. [1] - https://github.com/BurntSushi/ripgrep/issues/1860 --- src/compile.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/compile.rs b/src/compile.rs index 8904f1516..9a2ed5e92 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -318,6 +318,13 @@ impl Compiler { } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. 
This in turn may cause the lazy DFA to falsely quit + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); self.c_empty_look(prog::EmptyLook::WordBoundary) } WordBoundary(hir::WordBoundary::UnicodeNegate) => { @@ -330,6 +337,8 @@ impl Compiler { } self.compiled.has_unicode_word_boundary = true; self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); self.c_empty_look(prog::EmptyLook::NotWordBoundary) } WordBoundary(hir::WordBoundary::Ascii) => {