From 37ed947a3650d84a6814e69e60e97ec4e5df4908 Mon Sep 17 00:00:00 2001
From: Iskander Sharipov <i.sharipov@corp.vk.com>
Date: Fri, 15 Oct 2021 01:36:22 +0300
Subject: [PATCH] ruleguard/textmatch: an abstraction on top of regexp for
 performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`textmatch.Compile()` takes a regexp pattern and tries to recognize
it, returning the matcher that can match the input strings faster
than real `*regexp.Regexp` would. If it can't recognize the pattern,
it returns a normal `*regexp.Regexp`.

Right now we only optimize the simplest patterns, but it's a
first step to prove that we can still use regexp in ruleguard
rules and avoid big performance losses.

```
name                       old time/op    new time/op    delta
Match/^\p{Lu}_0-8             153ns ± 4%      11ns ± 1%  -92.81%  (p=0.008 n=5+5)
Match/^\p{Lu}_1-8             140ns ± 2%      11ns ± 0%  -92.13%  (p=0.008 n=5+5)
Match/^\p{Ll}_0-8             152ns ± 1%      11ns ± 1%  -92.77%  (p=0.008 n=5+5)
Match/^\p{Ll}_1-8             140ns ± 2%      11ns ± 3%  -92.04%  (p=0.008 n=5+5)
Match/foo$_0-8                174ns ± 1%      13ns ± 1%  -92.26%  (p=0.008 n=5+5)
Match/foo$_1-8               83.4ns ± 2%    13.4ns ± 6%  -83.96%  (p=0.008 n=5+5)
Match/^foo_0-8                135ns ± 0%      10ns ± 1%  -92.33%  (p=0.016 n=4+5)
Match/^foo_1-8                108ns ± 4%      11ns ± 4%  -89.78%  (p=0.008 n=5+5)
Match/simpleIdent_0-8         243ns ± 2%      18ns ± 1%  -92.51%  (p=0.008 n=5+5)
Match/simpleIdent_1-8        92.7ns ± 1%    26.5ns ± 1%  -71.43%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_0-8    1.59µs ± 2%    0.02µs ± 1%  -98.86%  (p=0.008 n=5+5)
Match/.*simpleIdent.*_1-8    1.70µs ± 1%    0.03µs ± 1%  -98.46%  (p=0.008 n=5+5)
Match/simpleIdent_0#01-8      237ns ± 1%      14ns ± 1%  -94.03%  (p=0.008 n=5+5)
Match/simpleIdent_1#01-8      247ns ± 1%      24ns ± 3%  -90.42%  (p=0.008 n=5+5)
[Geo mean]                    211ns           15ns       -93.00%
```
---
 ruleguard/filters.go                  |   8 +-
 ruleguard/ir_loader.go                |   5 +-
 ruleguard/textmatch/compile.go        |  84 +++++++++++++
 ruleguard/textmatch/matchers.go       |  72 +++++++++++
 ruleguard/textmatch/textmatch.go      |  26 ++++
 ruleguard/textmatch/textmatch_test.go | 167 ++++++++++++++++++++++++++
 6 files changed, 356 insertions(+), 6 deletions(-)
 create mode 100644 ruleguard/textmatch/compile.go
 create mode 100644 ruleguard/textmatch/matchers.go
 create mode 100644 ruleguard/textmatch/textmatch.go
 create mode 100644 ruleguard/textmatch/textmatch_test.go

diff --git a/ruleguard/filters.go b/ruleguard/filters.go
index 525458da..9bf50dab 100644
--- a/ruleguard/filters.go
+++ b/ruleguard/filters.go
@@ -6,12 +6,12 @@ import (
 	"go/token"
 	"go/types"
 	"path/filepath"
-	"regexp"
 
 	"github.com/quasilyte/go-ruleguard/internal/gogrep"
 	"github.com/quasilyte/go-ruleguard/internal/xtypes"
 	"github.com/quasilyte/go-ruleguard/nodetag"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
 	}
 }
 
-func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		pkgPath := params.ctx.Pkg.Path()
 		if re.MatchString(pkgPath) {
@@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
 	}
 }
 
-func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
+func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
 	return func(params *filterParams) matchFilterResult {
 		if re.MatchString(filepath.Base(params.filename)) {
 			return filterSuccess
@@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
 	}
 }
 
-func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
+func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
 	// TODO(quasilyte): add variadic support.
 	return func(params *filterParams) matchFilterResult {
 		if re.Match(params.nodeText(params.subNode(varname))) {
diff --git a/ruleguard/ir_loader.go b/ruleguard/ir_loader.go
index fa7bf163..f4d10411 100644
--- a/ruleguard/ir_loader.go
+++ b/ruleguard/ir_loader.go
@@ -16,6 +16,7 @@ import (
 	"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
 	"github.com/quasilyte/go-ruleguard/ruleguard/ir"
 	"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
+	"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
 	"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
 )
 
@@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
 	return iface, nil
 }
 
-func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
+func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
 	patternString := l.unwrapStringExpr(filter)
 	if patternString == "" {
 		return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
 	}
-	re, err := regexp.Compile(patternString)
+	re, err := textmatch.Compile(patternString)
 	if err != nil {
 		return nil, l.errorf(filter.Line, err, "compile regexp")
 	}
diff --git a/ruleguard/textmatch/compile.go b/ruleguard/textmatch/compile.go
new file mode 100644
index 00000000..d320bf88
--- /dev/null
+++ b/ruleguard/textmatch/compile.go
@@ -0,0 +1,95 @@
+package textmatch
+
+import (
+	"regexp"
+	"regexp/syntax"
+	"unicode"
+)
+
+// compile returns a Pattern for the regexp source s.
+//
+// It first tries to recognize s as one of the shapes handled by
+// compileOptimized and otherwise falls back to regexp.Compile.
+// On failure the returned Pattern is a true nil interface value.
+func compile(s string) (Pattern, error) {
+	reSyntax, err := syntax.Parse(s, syntax.Perl)
+	if err == nil {
+		if optimized := compileOptimized(s, reSyntax); optimized != nil {
+			return optimized, nil
+		}
+	}
+	// Do not `return regexp.Compile(s)` directly: on error that would wrap
+	// a nil *regexp.Regexp in a non-nil Pattern interface value.
+	re, err := regexp.Compile(s)
+	if err != nil {
+		return nil, err
+	}
+	return re, nil
+}
+
+// compileOptimized returns a specialized matcher when the parsed pattern re
+// (or, for the `^\p{Lu}` / `^\p{Ll}` cases, the source text s) has one of the
+// recognized shapes. It returns nil when no fast path applies, so that the
+// caller can fall back to the regexp engine.
+func compileOptimized(s string, re *syntax.Regexp) Pattern {
+	// .*
+	isAny := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
+	}
+	// "literal" that must match exactly as written. A literal flagged with
+	// FoldCase (e.g. under `(?i)`) matches case-insensitively, which the
+	// byte-wise matchers below can't express, so it is not optimized.
+	isLit := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpLiteral && re.Flags&syntax.FoldCase == 0
+	}
+	// ^
+	isBegin := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpBeginText
+	}
+	// $
+	isEnd := func(re *syntax.Regexp) bool {
+		return re.Op == syntax.OpEndText
+	}
+
+	// TODO: analyze what kind of regexps people use in rules
+	// more often and optimize those as well.
+
+	// lit => strings.Contains($input, lit)
+	if isLit(re) {
+		return &containsLiteralMatcher{value: newInputValue(string(re.Rune))}
+	}
+
+	if re.Op == syntax.OpConcat && len(re.Sub) == 2 {
+		switch {
+		case isBegin(re.Sub[0]) && isLit(re.Sub[1]):
+			// `^` lit => strings.HasPrefix($input, lit)
+			return &prefixLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
+		case isLit(re.Sub[0]) && isEnd(re.Sub[1]):
+			// lit `$` => strings.HasSuffix($input, lit)
+			return &suffixLiteralMatcher{value: newInputValue(string(re.Sub[0].Rune))}
+		}
+	}
+
+	if re.Op == syntax.OpConcat && len(re.Sub) == 3 {
+		switch {
+		case isAny(re.Sub[0]) && isLit(re.Sub[1]) && isAny(re.Sub[2]):
+			// `.*` lit `.*` => strings.Contains($input, lit)
+			return &containsLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
+		case isBegin(re.Sub[0]) && isLit(re.Sub[1]) && isEnd(re.Sub[2]):
+			// `^` lit `$` => $input == lit
+			return &eqLiteralMatcher{value: newInputValue(string(re.Sub[1].Rune))}
+		}
+	}
+
+	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
+	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
+	switch s {
+	case `^\p{Lu}`:
+		return &prefixRunePredMatcher{pred: unicode.IsUpper}
+	case `^\p{Ll}`:
+		return &prefixRunePredMatcher{pred: unicode.IsLower}
+	}
+
+	// Can't optimize.
+	return nil
+}
diff --git a/ruleguard/textmatch/matchers.go b/ruleguard/textmatch/matchers.go
new file mode 100644
index 00000000..2f68c9ae
--- /dev/null
+++ b/ruleguard/textmatch/matchers.go
@@ -0,0 +1,110 @@
+package textmatch
+
+import (
+	"bytes"
+	"strings"
+	"unicode/utf8"
+)
+
+// inputValue stores one literal as both a string and a []byte.
+//
+// Keeping the two forms side by side lets MatchString and Match
+// compare against the literal without converting it on every call.
+type inputValue struct {
+	s string
+	b []byte
+}
+
+// newInputValue returns an inputValue holding s and its []byte conversion.
+func newInputValue(s string) inputValue {
+	var v inputValue
+	v.s = s
+	v.b = []byte(s)
+	return v
+}
+
+// containsLiteralMatcher matches inputs that contain the literal anywhere.
+type containsLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s contains the literal.
+func (cm *containsLiteralMatcher) MatchString(s string) bool {
+	lit := cm.value.s
+	return strings.Contains(s, lit)
+}
+
+// Match reports whether b contains the literal.
+func (cm *containsLiteralMatcher) Match(b []byte) bool {
+	lit := cm.value.b
+	return bytes.Contains(b, lit)
+}
+
+// prefixLiteralMatcher matches inputs that start with the literal.
+type prefixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s starts with the literal.
+func (pm *prefixLiteralMatcher) MatchString(s string) bool {
+	lit := pm.value.s
+	return strings.HasPrefix(s, lit)
+}
+
+// Match reports whether b starts with the literal.
+func (pm *prefixLiteralMatcher) Match(b []byte) bool {
+	lit := pm.value.b
+	return bytes.HasPrefix(b, lit)
+}
+
+// suffixLiteralMatcher matches inputs that end with the literal.
+type suffixLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s ends with the literal.
+func (sm *suffixLiteralMatcher) MatchString(s string) bool {
+	lit := sm.value.s
+	return strings.HasSuffix(s, lit)
+}
+
+// Match reports whether b ends with the literal.
+func (sm *suffixLiteralMatcher) Match(b []byte) bool {
+	lit := sm.value.b
+	return bytes.HasSuffix(b, lit)
+}
+
+// eqLiteralMatcher matches inputs that are exactly equal to the literal.
+type eqLiteralMatcher struct {
+	value inputValue
+}
+
+// MatchString reports whether s equals the literal.
+func (em *eqLiteralMatcher) MatchString(s string) bool {
+	lit := em.value.s
+	return s == lit
+}
+
+// Match reports whether b equals the literal.
+func (em *eqLiteralMatcher) Match(b []byte) bool {
+	lit := em.value.b
+	return bytes.Equal(b, lit)
+}
+
+// prefixRunePredMatcher matches inputs whose first rune, as decoded by
+// unicode/utf8, satisfies pred.
+type prefixRunePredMatcher struct {
+	pred func(rune) bool
+}
+
+// MatchString applies pred to the first rune decoded from s.
+func (rm *prefixRunePredMatcher) MatchString(s string) bool {
+	first, _ := utf8.DecodeRuneInString(s)
+	return rm.pred(first)
+}
+
+// Match applies pred to the first rune decoded from b.
+func (rm *prefixRunePredMatcher) Match(b []byte) bool {
+	first, _ := utf8.DecodeRune(b)
+	return rm.pred(first)
+}
diff --git a/ruleguard/textmatch/textmatch.go b/ruleguard/textmatch/textmatch.go
new file mode 100644
index 00000000..a3787e2c
--- /dev/null
+++ b/ruleguard/textmatch/textmatch.go
@@ -0,0 +1,26 @@
+package textmatch
+
+import "regexp"
+
+// Pattern is a compiled matcher: a *regexp.Regexp or one of the optimized matchers.
+type Pattern interface {
+	MatchString(s string) bool // intended to agree with (*regexp.Regexp).MatchString
+	Match(b []byte) bool       // intended to agree with (*regexp.Regexp).Match
+}
+
+// Compile parses a regular expression and returns a compiled
+// pattern that can match inputs described by the regexp.
+//
+// Semantically it is close to regexp.Compile, but it
+// recognizes some common patterns and creates
+// a more optimized matcher for them.
+func Compile(re string) (Pattern, error) {
+	return compile(re)
+}
+
+// IsRegexp tells whether p is backed by a *regexp.Regexp.
+// A false result means p is some other Pattern, i.e. an optimized matcher.
+func IsRegexp(p Pattern) bool {
+	_, backedByRegexp := p.(*regexp.Regexp)
+	return backedByRegexp
+}
diff --git a/ruleguard/textmatch/textmatch_test.go b/ruleguard/textmatch/textmatch_test.go
new file mode 100644
index 00000000..dfdf1a2a
--- /dev/null
+++ b/ruleguard/textmatch/textmatch_test.go
@@ -0,0 +1,167 @@
+package textmatch
+
+import (
+	"fmt"
+	"regexp"
+	"testing"
+)
+
+// TestCompileAndRun checks that every listed pattern compiles to the expected
+// optimized matcher and that this matcher agrees with regexp on all inputs.
+func TestCompileAndRun(t *testing.T) {
+	tests := []struct {
+		re              string
+		expectedMatcher string
+	}{
+		{`foo`, `containsLiteralMatcher`},
+		{`.*foo.*`, `containsLiteralMatcher`},
+		{`^foo$`, `eqLiteralMatcher`},
+		{`^foo`, `prefixLiteralMatcher`},
+		{`foo$`, `suffixLiteralMatcher`},
+
+		{`^\p{Lu}`, `prefixRunePredMatcher`},
+		{`^\p{Ll}`, `prefixRunePredMatcher`},
+	}
+
+	inputs := make([]string, 0, 7*len(inputStrings)) // 7 variants per base string
+	for _, s := range inputStrings {
+		inputs = append(inputs,
+			s, s+" "+s, s+"_"+s,
+			" "+s, s+" ", " "+s+" ",
+			"\n"+s+"\n",
+		)
+	}
+
+	for _, test := range tests {
+		p, err := Compile(test.re)
+		if err != nil {
+			t.Fatal(err)
+		}
+		wantMatcher := `*textmatch.` + test.expectedMatcher
+		if IsRegexp(p) {
+			t.Errorf("`%s` is not optimized (want %s)", test.re, wantMatcher)
+			continue
+		}
+		haveMatcher := fmt.Sprintf("%T", p)
+		if haveMatcher != wantMatcher {
+			t.Errorf("`%s` matcher is %s, want %s", test.re, haveMatcher, wantMatcher)
+			continue
+		}
+		re, err := regexp.Compile(test.re)
+		if err != nil {
+			t.Fatal(err)
+		}
+		for _, input := range inputs {
+			have := p.MatchString(input)
+			want := re.MatchString(input)
+			if have != want {
+				t.Errorf("`%s` invalid MatchString() result on %q (want %v)", test.re, input, want)
+				break
+			}
+			have = p.Match([]byte(input))
+			want = re.Match([]byte(input))
+			if have != want {
+				t.Errorf("`%s` invalid Match() result on %q (want %v)", test.re, input, want)
+				break
+			}
+		}
+	}
+}
+
+// BenchmarkMatch runs MatchString for every pattern/input pair twice:
+// once on a *regexp.Regexp (the "_re" sub-benchmark) and once on the
+// Pattern returned by Compile (the "_opt" sub-benchmark).
+func BenchmarkMatch(b *testing.B) {
+	cases := []struct {
+		re     string
+		inputs []string
+	}{
+		{
+			`^\p{Lu}`,
+			[]string{
+				`Foo`,
+				`foo`,
+			},
+		},
+		{
+			`^\p{Ll}`,
+			[]string{
+				`foo`,
+				`Foo`,
+			},
+		},
+		{
+			`foo$`,
+			[]string{
+				`   foo`,
+				`bar`,
+			},
+		},
+		{
+			`^foo`,
+			[]string{
+				`foo`,
+				`   bar`,
+			},
+		},
+		{
+			`.*simpleIdent.*`,
+			[]string{
+				`text simpleIdent other text`,
+				`text without matching ident`,
+			},
+		},
+		{
+			`simpleIdent`,
+			[]string{
+				`simpleIdent`,
+				`text without simpleIdent`,
+			},
+		},
+	}
+
+	for _, tc := range cases {
+		std, err := regexp.Compile(tc.re)
+		if err != nil {
+			b.Fatal(err)
+		}
+		opt, err := Compile(tc.re)
+		if err != nil {
+			b.Fatal(err)
+		}
+		if IsRegexp(opt) {
+			b.Fatalf("`%s` is not optimized", tc.re)
+		}
+		for idx, input := range tc.inputs {
+			// Baseline: the standard regexp engine.
+			b.Run(fmt.Sprintf("%s_%d_re", tc.re, idx), func(b *testing.B) {
+				for n := 0; n < b.N; n++ {
+					_ = std.MatchString(input)
+				}
+			})
+			// Candidate: whatever Compile produced for the same pattern.
+			b.Run(fmt.Sprintf("%s_%d_opt", tc.re, idx), func(b *testing.B) {
+				for n := 0; n < b.N; n++ {
+					_ = opt.MatchString(input)
+				}
+			})
+		}
+	}
+}
+
+var inputStrings = []string{ // base inputs; TestCompileAndRun pads each one
+	``,
+	"\x00",
+	`foo`,
+	`foo2`,
+	`_foo`,
+	`foobarfoo`,
+	`Foo`,
+	`FOO`,
+	`bar_baz`,
+	`2493`,
+	"some longer text fragment (foo)",
+	"multi\nline\ntext\nfragment", // newline-separated words
+	"foo\nbar\n(foo)\n\n",
+	"ƇƉ", // non-ASCII upper-case letters
+}