Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash based lookup implementation #143

Merged
merged 3 commits into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 5 additions & 3 deletions cmd/gen/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ import (
"context"
"fmt"
"go/format"
"io/ioutil"
"log"
"net/http"
"strings"
"text/template"
"time"

"github.com/google/go-github/github"
"github.com/weppos/publicsuffix-go/publicsuffix"
"io/ioutil"
"log"
)

const (
Expand All @@ -39,7 +39,9 @@ func init() {
{ {{$r.Type}}, "{{$r.Value}}", {{$r.Length}}, {{$r.Private}} },
{{end}}
}
DefaultList.rules = r[:]
for i := range r {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason of this change? Is it because of #141?

DefaultList.AddRule(&r[i])
}
}

`
Expand Down
149 changes: 66 additions & 83 deletions publicsuffix/publicsuffix.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
"io"
"net/http/cookiejar"
"os"
"regexp"
"strings"

"golang.org/x/net/idna"
Expand Down Expand Up @@ -80,13 +79,14 @@ type FindOptions struct {
// List represents a Public Suffix List.
type List struct {
// rules is kept private because you should not access rules directly
// for lookup optimization the list will not be guaranteed to be a simple slice forever
rules []Rule
rules map[string]*Rule
}

// NewList creates a new empty list.
func NewList() *List {
return &List{}
return &List{
rules: map[string]*Rule{},
}
}

// NewListFromString parses a string that represents a Public Suffix source
Expand Down Expand Up @@ -132,7 +132,7 @@ func (l *List) LoadFile(path string, options *ParserOption) ([]Rule, error) {
// The list may be optimized internally for lookups, therefore the algorithm
// will decide the best position for the new rule.
func (l *List) AddRule(r *Rule) error {
l.rules = append(l.rules, *r)
l.rules[r.Value] = r
return nil
}

Expand Down Expand Up @@ -195,43 +195,23 @@ Scanning:

// Find and returns the most appropriate rule for the domain name.
func (l *List) Find(name string, options *FindOptions) *Rule {
var bestRule *Rule

if options == nil {
options = DefaultFindOptions
}

for _, r := range l.selectRules(name, options) {
if r.Type == ExceptionType {
return &r
}
if bestRule == nil || bestRule.Length < r.Length {
bestRule = &r
}
}

if bestRule != nil {
return bestRule
}

return options.DefaultRule
}

func (l *List) selectRules(name string, options *FindOptions) []Rule {
var found []Rule

// In this phase the search is a simple sequential scan
for _, rule := range l.rules {
if !rule.Match(name) {
continue
for {
rule, ok := l.rules[name]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a subtle but important implication in this change, and I want to add a note here mostly as a reminder.
This change will assume that *.foo and foo won't exist in the same definition.

Please see the main ticket for further information.

if ok && (!options.IgnorePrivate || !rule.Private) {
return rule
}
if options.IgnorePrivate && rule.Private {
continue
i := strings.IndexRune(name, '.')
if i < 0 {
return options.DefaultRule
}
found = append(found, rule)
name = name[i+1:]
}

return found
return nil
}

// NewRule parses the rule content, creates and returns a Rule.
Expand Down Expand Up @@ -309,36 +289,46 @@ func (r *Rule) Match(name string) bool {

// Decompose takes a name as input and decomposes it into a tuple of <TRD+SLD, TLD>,
// according to the rule definition and type.
func (r *Rule) Decompose(name string) [2]string {
var parts []string

func (r *Rule) Decompose(name string) (result [2]string) {
if r == DefaultRule {
i := strings.LastIndex(name, ".")
if i < 0 {
return
}
result[0], result[1] = name[:i], name[i+1:]
return
}
switch r.Type {
case NormalType:
name = strings.TrimSuffix(name, r.Value)
if len(name) == 0 {
return
}
result[0], result[1] = name[:len(name)-1], r.Value
case WildcardType:
parts = append([]string{`.*?`}, r.parts()...)
default:
parts = r.parts()
}

suffix := strings.Join(parts, `\.`)
re := regexp.MustCompile(fmt.Sprintf(`^(.+)\.(%s)$`, suffix))

matches := re.FindStringSubmatch(name)
if len(matches) < 3 {
return [2]string{"", ""}
}

return [2]string{matches[1], matches[2]}
}

func (r *Rule) parts() []string {
labels := Labels(r.Value)
if r.Type == ExceptionType {
return labels[1:]
}
if r.Type == WildcardType && r.Value == "" {
return []string{}
name := strings.TrimSuffix(name, r.Value)
if len(name) == 0 {
return
}
name = name[:len(name)-1]
i := strings.LastIndex(name, ".")
if i < 0 {
return
}
result[0], result[1] = name[:i], name[i+1:]+"."+r.Value
case ExceptionType:
i := strings.IndexRune(r.Value, '.')
if i < 0 {
return
}
suffix := r.Value[i+1:]
name = strings.TrimSuffix(name, suffix)
if len(name) == 0 {
return
}
result[0], result[1] = name[:len(name)-1], suffix
}
return labels
return
}

// Labels decomposes given domain name into labels,
Expand Down Expand Up @@ -432,7 +422,6 @@ func DomainFromListWithOptions(l *List, name string, options *FindOptions) (stri
if err != nil {
return "", err
}

return dn.SLD + "." + dn.TLD, nil
}

Expand All @@ -458,44 +447,38 @@ func ParseFromListWithOptions(l *List, name string, options *FindOptions) (*Doma
}

r := l.Find(n, options)
if tld := r.Decompose(n)[1]; tld == "" {
parts := r.Decompose(n)
left, tld := parts[0], parts[1]
if tld == "" {
return nil, fmt.Errorf("%s is a suffix", n)
}

dn := &DomainName{Rule: r}
dn.TLD, dn.SLD, dn.TRD = decompose(r, n)
dn := &DomainName{
Rule: r,
TLD: tld,
}
if i := strings.LastIndex(left, "."); i < 0 {
dn.SLD = left
} else {
dn.TRD = left[:i]
dn.SLD = left[i+1:]
}
return dn, nil
}

func normalize(name string) (string, error) {
ret := strings.ToLower(name)

if ret == "" {
return "", fmt.Errorf("Name is blank")
return "", fmt.Errorf("name is blank")
}
if ret[0] == '.' {
return "", fmt.Errorf("Name %s starts with a dot", ret)
return "", fmt.Errorf("name %s starts with a dot", ret)
}

return ret, nil
}

func decompose(r *Rule, name string) (tld, sld, trd string) {
parts := r.Decompose(name)
left, tld := parts[0], parts[1]

dot := strings.LastIndex(left, ".")
if dot == -1 {
sld = left
trd = ""
} else {
sld = left[dot+1:]
trd = left[0:dot]
}

return
}

// ToASCII is a wrapper for idna.ToASCII.
//
// This wrapper exists because idna.ToASCII backward-compatibility was broken twice in few months
Expand Down
49 changes: 43 additions & 6 deletions publicsuffix/publicsuffix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package publicsuffix
import (
"reflect"
"testing"

xlib "golang.org/x/net/publicsuffix"
)

func TestNewListFromString(t *testing.T) {
Expand Down Expand Up @@ -42,7 +44,7 @@ blogspot.com
testRules = []Rule{}
for _, rule := range rules {
if rule.Private == false {
testRules = append(testRules, rule)
testRules = append(testRules, *rule)
}
}
if want, got := 2, len(testRules); want != got {
Expand All @@ -53,7 +55,7 @@ blogspot.com
testRules = []Rule{}
for _, rule := range rules {
if rule.Private == true {
testRules = append(testRules, rule)
testRules = append(testRules, *rule)
}
}
if want, got := 1, len(testRules); want != got {
Expand Down Expand Up @@ -141,7 +143,7 @@ func TestNewListFromFile(t *testing.T) {
testRules = []Rule{}
for _, rule := range rules {
if rule.Private == false {
testRules = append(testRules, rule)
testRules = append(testRules, *rule)
}
}
if want, got := 2, len(testRules); want != got {
Expand All @@ -152,7 +154,7 @@ func TestNewListFromFile(t *testing.T) {
testRules = []Rule{}
for _, rule := range rules {
if rule.Private == true {
testRules = append(testRules, rule)
testRules = append(testRules, *rule)
}
}
if want, got := 1, len(testRules); want != got {
Expand All @@ -173,8 +175,10 @@ func TestListAddRule(t *testing.T) {
if list.Size() != 1 {
t.Fatalf("List should have 1 rule, got %v", list.Size())
}
if got := &list.rules[0]; !reflect.DeepEqual(rule, got) {
t.Fatalf("List[0] expected to be %v, got %v", rule, got)
for _, got := range list.rules {
if !reflect.DeepEqual(rule, got) {
t.Fatalf("List[0] expected to be %v, got %v", rule, got)
}
}
}

Expand Down Expand Up @@ -439,3 +443,36 @@ func TestCookieJarList(t *testing.T) {
}
}
}

var benchmarkTestCases = map[string]string{
"example.com": "example.com",
"example.id.au": "example.id.au",
"www.ck": "www.ck",
"foo.bar.xn--55qx5d.cn": "bar.xn--55qx5d.cn",
"a.b.c.minami.fukuoka.jp": "c.minami.fukuoka.jp",
"posts-and-telecommunications.museum": "",
"www.example.pvt.k12.ma.us": "example.pvt.k12.ma.us",
"many.lol": "many.lol",
"the.russian.for.moscow.is.xn--80adxhks": "is.xn--80adxhks",
"blah.blah.s3-us-west-1.amazonaws.com": "blah.s3-us-west-1.amazonaws.com",
"thing.dyndns.org": "thing.dyndns.org",
"nosuchtld": "",
}

func benchmarkDomain(b *testing.B, domainFunc func(string) (string, error)) {
var got string
for i := 0; i < b.N; i++ {
for input := range benchmarkTestCases {
got, _ = domainFunc(input)
}
}
_ = got
}

func BenchmarkDomain(b *testing.B) {
benchmarkDomain(b, Domain)
}

func BenchmarkXNet(b *testing.B) {
benchmarkDomain(b, xlib.EffectiveTLDPlusOne)
}
4 changes: 3 additions & 1 deletion publicsuffix/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -8626,5 +8626,7 @@ func init() {
{1, "now.sh", 2, true},
{1, "zone.id", 2, true},
}
DefaultList.rules = r[:]
for i := range r {
DefaultList.AddRule(&r[i])
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason of this change? Is it because of #141?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, nevermind. I actually figured it out, we need to pass via AddRule because the internal rule is no longer a list.

My bad, sorry for the stupid question.

I need to determine the cost of this operation. It could be possible that it's more efficient to compile directly the r in the appropriate form and then assign.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't speak with absolute certainty, but my gut feeling is that this implementation eases the load on the garbage collector because nothing in this list should ever be garbage collected which means it's better to have all of them in a single giant array rather than allocating memory for each individually on the heap. I might be wrong and it might be worth benchmarking.

}
}