-
-
Notifications
You must be signed in to change notification settings - Fork 32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hash based lookup implementation #143
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,6 @@ import ( | |
"io" | ||
"net/http/cookiejar" | ||
"os" | ||
"regexp" | ||
"strings" | ||
|
||
"golang.org/x/net/idna" | ||
|
@@ -80,13 +79,14 @@ type FindOptions struct { | |
// List represents a Public Suffix List. | ||
type List struct { | ||
// rules is kept private because you should not access rules directly | ||
// for lookup optimization the list will not be guaranteed to be a simple slice forever | ||
rules []Rule | ||
rules map[string]*Rule | ||
} | ||
|
||
// NewList creates a new empty list. | ||
func NewList() *List { | ||
return &List{} | ||
return &List{ | ||
rules: map[string]*Rule{}, | ||
} | ||
} | ||
|
||
// NewListFromString parses a string that represents a Public Suffix source | ||
|
@@ -132,7 +132,7 @@ func (l *List) LoadFile(path string, options *ParserOption) ([]Rule, error) { | |
// The list may be optimized internally for lookups, therefore the algorithm | ||
// will decide the best position for the new rule. | ||
func (l *List) AddRule(r *Rule) error { | ||
l.rules = append(l.rules, *r) | ||
l.rules[r.Value] = r | ||
return nil | ||
} | ||
|
||
|
@@ -195,43 +195,23 @@ Scanning: | |
|
||
// Find and returns the most appropriate rule for the domain name. | ||
func (l *List) Find(name string, options *FindOptions) *Rule { | ||
var bestRule *Rule | ||
|
||
if options == nil { | ||
options = DefaultFindOptions | ||
} | ||
|
||
for _, r := range l.selectRules(name, options) { | ||
if r.Type == ExceptionType { | ||
return &r | ||
} | ||
if bestRule == nil || bestRule.Length < r.Length { | ||
bestRule = &r | ||
} | ||
} | ||
|
||
if bestRule != nil { | ||
return bestRule | ||
} | ||
|
||
return options.DefaultRule | ||
} | ||
|
||
func (l *List) selectRules(name string, options *FindOptions) []Rule { | ||
var found []Rule | ||
|
||
// In this phase the search is a simple sequential scan | ||
for _, rule := range l.rules { | ||
if !rule.Match(name) { | ||
continue | ||
for { | ||
rule, ok := l.rules[name] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is a subtle but important implication in this change, and I want to add a note here mostly as a reminder. Please see the main ticket for further information. |
||
if ok && (!options.IgnorePrivate || !rule.Private) { | ||
return rule | ||
} | ||
if options.IgnorePrivate && rule.Private { | ||
continue | ||
i := strings.IndexRune(name, '.') | ||
if i < 0 { | ||
return options.DefaultRule | ||
} | ||
found = append(found, rule) | ||
name = name[i+1:] | ||
} | ||
|
||
return found | ||
return nil | ||
} | ||
|
||
// NewRule parses the rule content, creates and returns a Rule. | ||
|
@@ -309,36 +289,46 @@ func (r *Rule) Match(name string) bool { | |
|
||
// Decompose takes a name as input and decomposes it into a tuple of <TRD+SLD, TLD>, | ||
// according to the rule definition and type. | ||
func (r *Rule) Decompose(name string) [2]string { | ||
var parts []string | ||
|
||
func (r *Rule) Decompose(name string) (result [2]string) { | ||
if r == DefaultRule { | ||
i := strings.LastIndex(name, ".") | ||
if i < 0 { | ||
return | ||
} | ||
result[0], result[1] = name[:i], name[i+1:] | ||
return | ||
} | ||
switch r.Type { | ||
case NormalType: | ||
name = strings.TrimSuffix(name, r.Value) | ||
if len(name) == 0 { | ||
return | ||
} | ||
result[0], result[1] = name[:len(name)-1], r.Value | ||
case WildcardType: | ||
parts = append([]string{`.*?`}, r.parts()...) | ||
default: | ||
parts = r.parts() | ||
} | ||
|
||
suffix := strings.Join(parts, `\.`) | ||
re := regexp.MustCompile(fmt.Sprintf(`^(.+)\.(%s)$`, suffix)) | ||
|
||
matches := re.FindStringSubmatch(name) | ||
if len(matches) < 3 { | ||
return [2]string{"", ""} | ||
} | ||
|
||
return [2]string{matches[1], matches[2]} | ||
} | ||
|
||
func (r *Rule) parts() []string { | ||
labels := Labels(r.Value) | ||
if r.Type == ExceptionType { | ||
return labels[1:] | ||
} | ||
if r.Type == WildcardType && r.Value == "" { | ||
return []string{} | ||
name := strings.TrimSuffix(name, r.Value) | ||
if len(name) == 0 { | ||
return | ||
} | ||
name = name[:len(name)-1] | ||
i := strings.LastIndex(name, ".") | ||
if i < 0 { | ||
return | ||
} | ||
result[0], result[1] = name[:i], name[i+1:]+"."+r.Value | ||
case ExceptionType: | ||
i := strings.IndexRune(r.Value, '.') | ||
if i < 0 { | ||
return | ||
} | ||
suffix := r.Value[i+1:] | ||
name = strings.TrimSuffix(name, suffix) | ||
if len(name) == 0 { | ||
return | ||
} | ||
result[0], result[1] = name[:len(name)-1], suffix | ||
} | ||
return labels | ||
return | ||
} | ||
|
||
// Labels decomposes given domain name into labels, | ||
|
@@ -432,7 +422,6 @@ func DomainFromListWithOptions(l *List, name string, options *FindOptions) (stri | |
if err != nil { | ||
return "", err | ||
} | ||
|
||
return dn.SLD + "." + dn.TLD, nil | ||
} | ||
|
||
|
@@ -458,44 +447,38 @@ func ParseFromListWithOptions(l *List, name string, options *FindOptions) (*Doma | |
} | ||
|
||
r := l.Find(n, options) | ||
if tld := r.Decompose(n)[1]; tld == "" { | ||
parts := r.Decompose(n) | ||
left, tld := parts[0], parts[1] | ||
if tld == "" { | ||
return nil, fmt.Errorf("%s is a suffix", n) | ||
} | ||
|
||
dn := &DomainName{Rule: r} | ||
dn.TLD, dn.SLD, dn.TRD = decompose(r, n) | ||
dn := &DomainName{ | ||
Rule: r, | ||
TLD: tld, | ||
} | ||
if i := strings.LastIndex(left, "."); i < 0 { | ||
dn.SLD = left | ||
} else { | ||
dn.TRD = left[:i] | ||
dn.SLD = left[i+1:] | ||
} | ||
return dn, nil | ||
} | ||
|
||
func normalize(name string) (string, error) { | ||
ret := strings.ToLower(name) | ||
|
||
if ret == "" { | ||
return "", fmt.Errorf("Name is blank") | ||
return "", fmt.Errorf("name is blank") | ||
} | ||
if ret[0] == '.' { | ||
return "", fmt.Errorf("Name %s starts with a dot", ret) | ||
return "", fmt.Errorf("name %s starts with a dot", ret) | ||
} | ||
|
||
return ret, nil | ||
} | ||
|
||
func decompose(r *Rule, name string) (tld, sld, trd string) { | ||
parts := r.Decompose(name) | ||
left, tld := parts[0], parts[1] | ||
|
||
dot := strings.LastIndex(left, ".") | ||
if dot == -1 { | ||
sld = left | ||
trd = "" | ||
} else { | ||
sld = left[dot+1:] | ||
trd = left[0:dot] | ||
} | ||
|
||
return | ||
} | ||
|
||
// ToASCII is a wrapper for idna.ToASCII. | ||
// | ||
// This wrapper exists because idna.ToASCII backward-compatibility was broken twice in few months | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8626,5 +8626,7 @@ func init() { | |
{1, "now.sh", 2, true}, | ||
{1, "zone.id", 2, true}, | ||
} | ||
DefaultList.rules = r[:] | ||
for i := range r { | ||
DefaultList.AddRule(&r[i]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the reason of this change? Is it because of #141? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, nevermind. I actually figured it out, we need to pass via My bad, sorry for the stupid question. I need to determine the cost of this operation. It could be possible that it's more efficient to compile directly the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't speak with absolute certainty, but my gut feeling is that this implementation eases the load on the garbage collector because nothing in this list should ever be garbage collected which means it's better to have all of them in a single giant array rather than allocating memory for each individually on the heap. I might be wrong and it might be worth benchmarking. |
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the reason of this change? Is it because of #141?