Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ruleguard/textmatch: an abstraction on top of regexp for performance #281

Merged
merged 1 commit into from Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions ruleguard/filters.go
Expand Up @@ -6,12 +6,12 @@ import (
"go/token"
"go/types"
"path/filepath"
"regexp"

"github.com/quasilyte/go-ruleguard/internal/gogrep"
"github.com/quasilyte/go-ruleguard/internal/xtypes"
"github.com/quasilyte/go-ruleguard/nodetag"
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
)

Expand Down Expand Up @@ -76,7 +76,7 @@ func makeFileImportsFilter(src, pkgPath string) filterFunc {
}
}

func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
func makeFilePkgPathMatchesFilter(src string, re textmatch.Pattern) filterFunc {
return func(params *filterParams) matchFilterResult {
pkgPath := params.ctx.Pkg.Path()
if re.MatchString(pkgPath) {
Expand All @@ -86,7 +86,7 @@ func makeFilePkgPathMatchesFilter(src string, re *regexp.Regexp) filterFunc {
}
}

func makeFileNameMatchesFilter(src string, re *regexp.Regexp) filterFunc {
func makeFileNameMatchesFilter(src string, re textmatch.Pattern) filterFunc {
return func(params *filterParams) matchFilterResult {
if re.MatchString(filepath.Base(params.filename)) {
return filterSuccess
Expand Down Expand Up @@ -373,7 +373,7 @@ func makeTextFilter(src, varname string, op token.Token, rhsVarname string) filt
}
}

func makeTextMatchesFilter(src, varname string, re *regexp.Regexp) filterFunc {
func makeTextMatchesFilter(src, varname string, re textmatch.Pattern) filterFunc {
// TODO(quasilyte): add variadic support.
return func(params *filterParams) matchFilterResult {
if re.Match(params.nodeText(params.subNode(varname))) {
Expand Down
5 changes: 3 additions & 2 deletions ruleguard/ir_loader.go
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/quasilyte/go-ruleguard/ruleguard/goutil"
"github.com/quasilyte/go-ruleguard/ruleguard/ir"
"github.com/quasilyte/go-ruleguard/ruleguard/quasigo"
"github.com/quasilyte/go-ruleguard/ruleguard/textmatch"
"github.com/quasilyte/go-ruleguard/ruleguard/typematch"
)

Expand Down Expand Up @@ -409,12 +410,12 @@ func (l *irLoader) unwrapInterfaceExpr(filter ir.FilterExpr) (*types.Interface,
return iface, nil
}

func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (*regexp.Regexp, error) {
func (l *irLoader) unwrapRegexpExpr(filter ir.FilterExpr) (textmatch.Pattern, error) {
patternString := l.unwrapStringExpr(filter)
if patternString == "" {
return nil, l.errorf(filter.Line, nil, "expected a non-empty regexp pattern argument")
}
re, err := regexp.Compile(patternString)
re, err := textmatch.Compile(patternString)
if err != nil {
return nil, l.errorf(filter.Line, err, "compile regexp")
}
Expand Down
84 changes: 84 additions & 0 deletions ruleguard/textmatch/compile.go
@@ -0,0 +1,84 @@
package textmatch

import (
"regexp"
"regexp/syntax"
"unicode"
)

// compile builds a Pattern for the regular expression s.
//
// It first tries to recognize s as one of the simple shapes handled by
// compileOptimized; when that is not possible, it falls back to
// regexp.Compile.
//
// On failure the returned Pattern is always a nil interface value.
func compile(s string) (Pattern, error) {
	// A parse error is deliberately not reported here: regexp.Compile
	// below is given the same pattern and returns the error to the caller.
	if reSyntax, err := syntax.Parse(s, syntax.Perl); err == nil {
		if optimized := compileOptimized(s, reSyntax); optimized != nil {
			return optimized, nil
		}
	}

	re, err := regexp.Compile(s)
	if err != nil {
		// Return a literal nil instead of forwarding the nil *regexp.Regexp:
		// a nil pointer stored in the Pattern interface compares as non-nil.
		return nil, err
	}
	return re, nil
}

// compileOptimized tries to recognize a few simple regexp shapes and
// returns a specialized Pattern for them.
//
// s is the original pattern text and re is its parsed syntax tree.
// A nil result means that no optimization applies and the caller
// should fall back to a general-purpose regexp.
func compileOptimized(s string, re *syntax.Regexp) Pattern {
	// .*
	isAny := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpStar && re.Sub[0].Op == syntax.OpAnyCharNotNL
	}
	// "literal"
	//
	// Case-insensitive literals, like the one in `(?i)foo`, are rejected:
	// the parser marks them with the FoldCase flag, and a byte-for-byte
	// comparison against the input would miss inputs that differ only in case.
	isLit := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpLiteral && re.Flags&syntax.FoldCase == 0
	}
	// ^
	isBegin := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpBeginText
	}
	// $
	isEnd := func(re *syntax.Regexp) bool {
		return re.Op == syntax.OpEndText
	}
	// lit converts the runes of a literal node into a matcher operand.
	lit := func(re *syntax.Regexp) inputValue {
		return newInputValue(string(re.Rune))
	}

	// TODO: analyze what kind of regexps people use in rules
	// more often and optimize those as well.

	// lit => strings.Contains($input, lit)
	if isLit(re) {
		return &containsLiteralMatcher{value: lit(re)}
	}

	// The concatenation shapes below are mutually exclusive,
	// so the order of the cases does not matter.
	if re.Op == syntax.OpConcat {
		switch sub := re.Sub; len(sub) {
		case 2:
			switch {
			// `^` lit => strings.HasPrefix($input, lit)
			case isBegin(sub[0]) && isLit(sub[1]):
				return &prefixLiteralMatcher{value: lit(sub[1])}
			// lit `$` => strings.HasSuffix($input, lit)
			case isLit(sub[0]) && isEnd(sub[1]):
				return &suffixLiteralMatcher{value: lit(sub[0])}
			}
		case 3:
			switch {
			// `.*` lit `.*` => strings.Contains($input, lit)
			case isAny(sub[0]) && isLit(sub[1]) && isAny(sub[2]):
				return &containsLiteralMatcher{value: lit(sub[1])}
			// `^` lit `$` => $input == lit
			case isBegin(sub[0]) && isLit(sub[1]) && isEnd(sub[2]):
				return &eqLiteralMatcher{value: lit(sub[1])}
			}
		}
	}

	// `^\p{Lu}` => prefixRunePredMatcher:unicode.IsUpper
	// `^\p{Ll}` => prefixRunePredMatcher:unicode.IsLower
	switch s {
	case `^\p{Lu}`:
		return &prefixRunePredMatcher{pred: unicode.IsUpper}
	case `^\p{Ll}`:
		return &prefixRunePredMatcher{pred: unicode.IsLower}
	}

	// Can't optimize.
	return nil
}
72 changes: 72 additions & 0 deletions ruleguard/textmatch/matchers.go
@@ -0,0 +1,72 @@
package textmatch

import (
"bytes"
"strings"
"unicode/utf8"
)

// inputValue keeps one piece of text in both of its forms:
// as a string and as a byte slice.
//
// Storing both avoids string<->[]byte conversions
// inside MatchString and Match.
type inputValue struct {
	// s is the text as a string.
	s string
	// b is the same text as a byte slice.
	b []byte
}

// newInputValue returns an inputValue holding s and its []byte copy.
func newInputValue(s string) inputValue {
	var v inputValue
	v.s = s
	v.b = []byte(s)
	return v
}

// containsLiteralMatcher matches inputs that contain
// the literal stored in value.
type containsLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s contains the literal.
func (m *containsLiteralMatcher) MatchString(s string) bool {
	needle := m.value.s
	return strings.Contains(s, needle)
}

// Match reports whether b contains the literal.
func (m *containsLiteralMatcher) Match(b []byte) bool {
	needle := m.value.b
	return bytes.Contains(b, needle)
}

// prefixLiteralMatcher matches inputs that start with
// the literal stored in value.
type prefixLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s begins with the literal.
func (m *prefixLiteralMatcher) MatchString(s string) bool {
	prefix := m.value.s
	return strings.HasPrefix(s, prefix)
}

// Match reports whether b begins with the literal.
func (m *prefixLiteralMatcher) Match(b []byte) bool {
	prefix := m.value.b
	return bytes.HasPrefix(b, prefix)
}

// suffixLiteralMatcher matches inputs that end with
// the literal stored in value.
type suffixLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s ends with the literal.
func (m *suffixLiteralMatcher) MatchString(s string) bool {
	suffix := m.value.s
	return strings.HasSuffix(s, suffix)
}

// Match reports whether b ends with the literal.
func (m *suffixLiteralMatcher) Match(b []byte) bool {
	suffix := m.value.b
	return bytes.HasSuffix(b, suffix)
}

// eqLiteralMatcher matches inputs that are exactly equal to
// the literal stored in value.
type eqLiteralMatcher struct {
	value inputValue
}

// MatchString reports whether s is equal to the literal.
func (m *eqLiteralMatcher) MatchString(s string) bool {
	want := m.value.s
	return s == want
}

// Match reports whether b is equal to the literal.
func (m *eqLiteralMatcher) Match(b []byte) bool {
	want := m.value.b
	return bytes.Equal(b, want)
}

// prefixRunePredMatcher matches inputs whose first decoded rune
// satisfies pred.
type prefixRunePredMatcher struct {
	pred func(rune) bool
}

// MatchString decodes the first rune of s and reports
// whether pred accepts it.
func (m *prefixRunePredMatcher) MatchString(s string) bool {
	first, _ := utf8.DecodeRuneInString(s)
	accepted := m.pred(first)
	return accepted
}

// Match decodes the first rune of b and reports
// whether pred accepts it.
func (m *prefixRunePredMatcher) Match(b []byte) bool {
	first, _ := utf8.DecodeRune(b)
	accepted := m.pred(first)
	return accepted
}
26 changes: 26 additions & 0 deletions ruleguard/textmatch/textmatch.go
@@ -0,0 +1,26 @@
package textmatch

import "regexp"

// Pattern is a compiled regular expression that can be
// matched against byte slices and strings.
type Pattern interface {
	// Match reports whether b matches the pattern.
	Match(b []byte) bool

	// MatchString reports whether s matches the pattern.
	MatchString(s string) bool
}

// Compile parses the regular expression re and returns a compiled
// pattern that can match the inputs described by it.
//
// Semantically it is close to regexp.Compile, but some common
// patterns are recognized and get a more optimized matcher.
func Compile(re string) (Pattern, error) {
	pattern, err := compile(re)
	return pattern, err
}

// IsRegexp reports whether p is backed by a *regexp.Regexp.
// A false result means that p is some other, optimized matcher
// (or that p is nil).
func IsRegexp(p Pattern) bool {
	switch p.(type) {
	case *regexp.Regexp:
		return true
	default:
		return false
	}
}