From 37ed947a3650d84a6814e69e60e97ec4e5df4908 Mon Sep 17 00:00:00 2001 From: Iskander Sharipov Date: Fri, 15 Oct 2021 01:36:22 +0300 Subject: [PATCH] ruleguard/textmatch: an abstraction on top of regexp for performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `textmatch.Compile()` takes a regexp pattern and tries to recognize it, returning the matcher that can match the input strings faster than a real `*regexp.Regexp` would. If it can't recognize the pattern, it returns a normal `*regexp.Regexp`. Right now we only optimize the simplest patterns, but it's a first step to prove that we can still use regexp in ruleguard rules and avoid big performance losses. ``` name old time/op new time/op delta Match/^\p{Lu}_0-8 153ns ± 4% 11ns ± 1% -92.81% (p=0.008 n=5+5) Match/^\p{Lu}_1-8 140ns ± 2% 11ns ± 0% -92.13% (p=0.008 n=5+5) Match/^\p{Ll}_0-8 152ns ± 1% 11ns ± 1% -92.77% (p=0.008 n=5+5) Match/^\p{Ll}_1-8 140ns ± 2% 11ns ± 3% -92.04% (p=0.008 n=5+5) Match/foo$_0-8 174ns ± 1% 13ns ± 1% -92.26% (p=0.008 n=5+5) Match/foo$_1-8 83.4ns ± 2% 13.4ns ± 6% -83.96% (p=0.008 n=5+5) Match/^foo_0-8 135ns ± 0% 10ns ± 1% -92.33% (p=0.016 n=4+5) Match/^foo_1-8 108ns ± 4% 11ns ± 4% -89.78% (p=0.008 n=5+5) Match/simpleIdent_0-8 243ns ± 2% 18ns ± 1% -92.51% (p=0.008 n=5+5) Match/simpleIdent_1-8 92.7ns ± 1% 26.5ns ± 1% -71.43% (p=0.008 n=5+5) Match/.*simpleIdent.*_0-8 1.59µs ± 2% 0.02µs ± 1% -98.86% (p=0.008 n=5+5) Match/.*simpleIdent.*_1-8 1.70µs ± 1% 0.03µs ± 1% -98.46% (p=0.008 n=5+5) Match/simpleIdent_0#01-8 237ns ± 1% 14ns ± 1% -94.03% (p=0.008 n=5+5) Match/simpleIdent_1#01-8 247ns ± 1% 24ns ± 3% -90.42% (p=0.008 n=5+5) [Geo mean] 211ns 15ns -93.00% ``` --- ruleguard/filters.go | 8 +- ruleguard/ir_loader.go | 5 +- ruleguard/textmatch/compile.go | 84 +++++++++++++ ruleguard/textmatch/matchers.go | 72 +++++++++++ ruleguard/textmatch/textmatch.go | 26 ++++ ruleguard/textmatch/textmatch_test.go | 167 ++++++++++++++++++++++++++ 6 files changed, 
356 insertions(+), 6 deletions(-) create mode 100644 ruleguard/textmatch/compile.go create mode 100644 ruleguard/textmatch/matchers.go create mode 100644 ruleguard/textmatch/textmatch.go create mode 100644 ruleguard/textmatch/textmatch_test.go diff --git a/ruleguard/filters.go b/ruleguard/filters.go index 525458da..9bf50dab 100644 --- a/ruleguard/filters.go +++ b/ruleguard/filters.go @@ -6,12 +6,12 @@ import ( "go/token" "go/types" "path/filepath" - "regexp" "github.com/quasilyte/go-ruleguard/internal/gogrep" "github.com/quasilyte/go-ruleguard/internal/xtypes" "github.com/quasilyte/go-ruleguard/nodetag" "github.com/quasilyte/go-ruleguard/ruleguard/quasigo" + "github.com/quasilyte/go-ruleguard/ruleguard/textmatch" "github.com/quasilyte/go-ruleguard/ruleguard/typematch" ) @@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc { } } -func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc { +func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc { return func(params *filterParams) matchFilterResult { pkgPath := params.ctx.Pkg.Path() if re.MatchString(pkgPath) { @@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc { } } -func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc { +func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc { return func(params *filterParams) matchFilterResult { if re.MatchString(filepath.Base(params.filename)) { return filterSuccess @@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt } } -func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc { +func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc { // TODO(quasilyte): add variadic support. 
// ---------------------------------------------------------------------------
// File: ruleguard/textmatch/compile.go
// ---------------------------------------------------------------------------

// compile turns the regexp source s into a Pattern.
//
// It parses s with syntax.Perl flags and offers the parsed tree to
// compileOptimized. If parsing fails, or no specialized matcher applies,
// the result (and any error) comes straight from regexp.Compile(s).
func compile(s string) (Pattern, error) {
	if reSyntax, err := syntax.Parse(s, syntax.Perl); err == nil {
		if optimized := compileOptimized(s, reSyntax); optimized != nil {
			return optimized, nil
		}
	}
	return regexp.Compile(s)
}

// compileOptimized returns a specialized matcher for the few pattern shapes
// listed in its body, or nil when the pattern has to be handled by regexp.
//
// s is the original pattern text and re is its parsed form.
//
// A literal is only treated as optimizable when it can be compared byte for
// byte. Case-insensitive literals (syntax.FoldCase, e.g. `(?i)foo`) and
// literals containing utf8.RuneError are left to regexp: plain strings/bytes
// comparisons reproduce neither case folding nor regexp's treatment of
// invalid UTF-8 input bytes as U+FFFD.
func compileOptimized(s string, re *syntax.Regexp) Pattern {
	// .*
	isAny := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
	}
	// "literal" that can be matched with an exact byte comparison.
	isLit := func(re *syntax.Regexp) bool {
		if re.Op != syntax.OpLiteral || re.Flags&syntax.FoldCase != 0 {
			return false
		}
		for _, r := range re.Rune {
			if r == utf8.RuneError {
				return false
			}
		}
		return true
	}
	// ^
	isBegin := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpBeginText
	}
	// $
	isEnd := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpEndText
	}
	// litValue extracts the text of a literal node.
	litValue := func(re *syntax.Regexp) inputValue {
		return newInputValue(string(re.Rune))
	}

	// TODO: analyze what kind of regexps people use in rules
	// more often and optimize those as well.

	switch {
	case isLit(re):
		// lit => strings.Contains($input, lit)
		return &containsLiteralMatcher{value: litValue(re)}

	case re.Op == syntax.OpConcat && len(re.Sub) == 2:
		first, second := re.Sub[0], re.Sub[1]
		switch {
		case isBegin(first) && isLit(second):
			// `^` lit => strings.HasPrefix($input, lit)
			return &prefixLiteralMatcher{value: litValue(second)}
		case isLit(first) && isEnd(second):
			// lit `$` => strings.HasSuffix($input, lit)
			return &suffixLiteralMatcher{value: litValue(first)}
		}

	case re.Op == syntax.OpConcat && len(re.Sub) == 3:
		first, mid, last := re.Sub[0], re.Sub[1], re.Sub[2]
		switch {
		case isAny(first) && isLit(mid) && isAny(last):
			// `.*` lit `.*` => strings.Contains($input, lit)
			return &containsLiteralMatcher{value: litValue(mid)}
		case isBegin(first) && isLit(mid) && isEnd(last):
			// `^` lit `$` => $input == lit
			return &eqLiteralMatcher{value: litValue(mid)}
		}
	}

	// These two are recognized by their exact source text rather than
	// by the shape of the parsed tree.
	//
	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
	switch s {
	case `^\p{Lu}`:
		return &prefixRunePredMatcher{pred: unicode.IsUpper}
	case `^\p{Ll}`:
		return &prefixRunePredMatcher{pred: unicode.IsLower}
	}

	// Can't optimize.
	return nil
}

// ---------------------------------------------------------------------------
// File: ruleguard/textmatch/matchers.go
// ---------------------------------------------------------------------------

// Every specialized matcher must satisfy Pattern.
var (
	_ Pattern = (*containsLiteralMatcher)(nil)
	_ Pattern = (*prefixLiteralMatcher)(nil)
	_ Pattern = (*suffixLiteralMatcher)(nil)
	_ Pattern = (*eqLiteralMatcher)(nil)
	_ Pattern = (*prefixRunePredMatcher)(nil)
)

// inputValue is a wrapper for string|[]byte.
//
// We hold both values to avoid string->[]byte and vice versa
// conversions when doing Match and MatchString.
type inputValue struct {
	s string
	b []byte
}

// newInputValue stores s together with its []byte copy.
func newInputValue(s string) inputValue {
	return inputValue{s: s, b: []byte(s)}
}

// containsLiteralMatcher reports whether the input contains value.
type containsLiteralMatcher struct{ value inputValue }

func (m *containsLiteralMatcher) MatchString(s string) bool {
	return strings.Contains(s, m.value.s)
}

func (m *containsLiteralMatcher) Match(b []byte) bool {
	return bytes.Contains(b, m.value.b)
}

// prefixLiteralMatcher reports whether the input starts with value.
type prefixLiteralMatcher struct{ value inputValue }

func (m *prefixLiteralMatcher) MatchString(s string) bool {
	return strings.HasPrefix(s, m.value.s)
}

func (m *prefixLiteralMatcher) Match(b []byte) bool {
	return bytes.HasPrefix(b, m.value.b)
}

// suffixLiteralMatcher reports whether the input ends with value.
type suffixLiteralMatcher struct{ value inputValue }

func (m *suffixLiteralMatcher) MatchString(s string) bool {
	return strings.HasSuffix(s, m.value.s)
}

func (m *suffixLiteralMatcher) Match(b []byte) bool {
	return bytes.HasSuffix(b, m.value.b)
}

// eqLiteralMatcher reports whether the input is exactly value.
type eqLiteralMatcher struct{ value inputValue }

func (m *eqLiteralMatcher) MatchString(s string) bool {
	return m.value.s == s
}

func (m *eqLiteralMatcher) Match(b []byte) bool {
	return bytes.Equal(m.value.b, b)
}

// prefixRunePredMatcher applies pred to the first rune decoded from the input.
//
// For an empty or invalid input the decoded rune is utf8.RuneError, so the
// outcome is whatever pred reports for that rune.
type prefixRunePredMatcher struct{ pred func(rune) bool }

func (m *prefixRunePredMatcher) MatchString(s string) bool {
	r, _ := utf8.DecodeRuneInString(s)
	return m.pred(r)
}

func (m *prefixRunePredMatcher) Match(b []byte) bool {
	r, _ := utf8.DecodeRune(b)
	return m.pred(r)
}

// ---------------------------------------------------------------------------
// File: ruleguard/textmatch/textmatch.go
// ---------------------------------------------------------------------------

// Pattern is a compiled regular expression.
type Pattern interface {
	MatchString(s string) bool
	Match(b []byte) bool
}

// Compile parses a regular expression and returns a compiled
// pattern that can match inputs described by the regexp.
//
// Semantically it's close to the regexp.Compile, but
// it does recognize some common patterns and creates
// a more optimized matcher for them.
func Compile(re string) (Pattern, error) {
	return compile(re)
}

// IsRegexp reports whether p is implemented using regexp.
// False means that the underlying matcher is something optimized.
func IsRegexp(p Pattern) bool {
	_, ok := p.(*regexp.Regexp)
	return ok
}

// ---------------------------------------------------------------------------
// File: ruleguard/textmatch/textmatch_test.go
// ---------------------------------------------------------------------------

// TestCompileAndRun checks that every listed pattern compiles to the expected
// specialized matcher and that the matcher agrees with regexp on all inputs.
// It also checks that case-insensitive variants are not optimized.
func TestCompileAndRun(t *testing.T) {
	tests := []struct {
		re              string
		expectedMatcher string
	}{
		{`foo`, `containsLiteralMatcher`},
		{`.*foo.*`, `containsLiteralMatcher`},
		{`^foo$`, `eqLiteralMatcher`},
		{`^foo`, `prefixLiteralMatcher`},
		{`foo$`, `suffixLiteralMatcher`},

		{`^\p{Lu}`, `prefixRunePredMatcher`},
		{`^\p{Ll}`, `prefixRunePredMatcher`},
	}

	// Patterns that resemble the optimized shapes but carry the
	// case-insensitive flag; they must keep using regexp.
	fallbackTests := []string{
		`(?i)foo`,
		`(?i).*foo.*`,
		`(?i)^foo$`,
		`(?i)^foo`,
		`(?i)foo$`,
	}

	// Seven variations are generated per input string.
	inputs := make([]string, 0, len(inputStrings)*7)
	for _, s := range inputStrings {
		inputs = append(inputs,
			s,
			s+" "+s,
			s+"_"+s,
			" "+s,
			s+" ",
			" "+s+" ",
			"\n"+s+"\n",
		)
	}

	for _, test := range tests {
		p, err := Compile(test.re)
		if err != nil {
			t.Fatal(err)
		}
		wantMatcher := `*textmatch.` + test.expectedMatcher
		if IsRegexp(p) {
			t.Errorf("`%s` is not optimized (want %s)", test.re, wantMatcher)
			continue
		}
		haveMatcher := fmt.Sprintf("%T", p)
		if haveMatcher != wantMatcher {
			t.Errorf("`%s` matcher is %s, want %s", test.re, haveMatcher, wantMatcher)
			continue
		}
		re, err := regexp.Compile(test.re)
		if err != nil {
			t.Fatal(err)
		}
		for _, input := range inputs {
			have := p.MatchString(input)
			want := re.MatchString(input)
			if have != want {
				t.Errorf("`%s` invalid MatchString() result on %q (want %v)", test.re, input, want)
				break
			}
			have = p.Match([]byte(input))
			want = re.Match([]byte(input))
			if have != want {
				t.Errorf("`%s` invalid Match() result on %q (want %v)", test.re, input, want)
				break
			}
		}
	}

	for _, pat := range fallbackTests {
		p, err := Compile(pat)
		if err != nil {
			t.Fatal(err)
		}
		if !IsRegexp(p) {
			t.Errorf("`%s` should not be optimized (have %T)", pat, p)
		}
	}
}

// BenchmarkMatch runs MatchString for both the plain regexp ("_re") and the
// optimized pattern ("_opt") on every input of every test entry.
func BenchmarkMatch(b *testing.B) {
	tests := []struct {
		re     string
		inputs []string
	}{
		{
			`^\p{Lu}`,
			[]string{
				`Foo`,
				`foo`,
			},
		},

		{
			`^\p{Ll}`,
			[]string{
				`foo`,
				`Foo`,
			},
		},

		{
			`foo$`,
			[]string{
				` foo`,
				`bar`,
			},
		},

		{
			`^foo`,
			[]string{
				`foo`,
				` bar`,
			},
		},

		{
			`.*simpleIdent.*`,
			[]string{
				`text simpleIdent other text`,
				`text without matching ident`,
			},
		},

		{
			`simpleIdent`,
			[]string{
				`simpleIdent`,
				`text without simpleIdent`,
			},
		},
	}

	for _, test := range tests {
		re, err := regexp.Compile(test.re)
		if err != nil {
			b.Fatal(err)
		}
		pat, err := Compile(test.re)
		if err != nil {
			b.Fatal(err)
		}
		if IsRegexp(pat) {
			b.Fatalf("`%s` is not optimized", test.re)
		}
		for i, input := range test.inputs {
			b.Run(fmt.Sprintf("%s_%d_re", test.re, i), func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					_ = re.MatchString(input)
				}
			})
			b.Run(fmt.Sprintf("%s_%d_opt", test.re, i), func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					_ = pat.MatchString(input)
				}
			})
		}
	}
}

// inputStrings are the base inputs that TestCompileAndRun expands
// into several padded and duplicated variations.
var inputStrings = []string{
	``,
	"\x00",
	`foo`,
	`foo2`,
	`_foo`,
	`foobarfoo`,
	`Foo`,
	`FOO`,
	`bar_baz`,
	`2493`,
	"some longer text fragment (foo)",
	"multi\nline\ntext\nfragment",
	"foo\nbar\n(foo)\n\n",
	"ƇƉ",
}