-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unic.go
100 lines (81 loc) · 1.99 KB
/
unic.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package unic
import (
"bufio"
"io"
"strings"
multierror "github.com/hashicorp/go-multierror"
cuckoo "github.com/seiflotfy/cuckoofilter"
)
// Filter holds the settings used by Exec to split a line stream into lines
// seen for the first time and lines seen again. Membership is tracked with
// cuckoo filters created per Exec call.
type Filter struct {
	// CaseI, when true, makes Exec compare lines by their lowercased form.
	CaseI bool
	// FilterCapacity is the capacity handed to cuckoo.NewFilter for each of
	// the cuckoo filters Exec creates.
	FilterCapacity uint
}
// FilterOption is a function that configures the Filter passed to it and may
// report a failure through its error result. NewFilter calls every option it
// is given on the Filter it is building.
type FilterOption func(*Filter) error
// FilterCaseInsensitive is a FilterOption that switches the given Filter to
// case-insensitive comparison by setting its CaseI field.
// It always returns nil.
func FilterCaseInsensitive(f *Filter) error {
	f.CaseI = true
	return nil
}
// FilterCapacity returns a FilterOption that stores capacity in the Filter's
// FilterCapacity field, which is the size used for the Filter's internal
// cuckoo filters. The returned option always reports a nil error.
func FilterCapacity(capacity uint) FilterOption {
	var setCapacity FilterOption = func(f *Filter) error {
		f.FilterCapacity = capacity
		return nil
	}
	return setCapacity
}
// NewFilter builds a Filter with a default FilterCapacity of 1000000 and then
// applies each of the given options to it, in order.
//
// Every option is run even if an earlier one failed; the errors they return
// are accumulated with multierror and reported together. The Filter is
// returned alongside the error in either case.
func NewFilter(options ...FilterOption) (*Filter, error) {
	const defaultCapacity = 1000000

	f := &Filter{FilterCapacity: defaultCapacity}

	var errs *multierror.Error
	for i := range options {
		errs = multierror.Append(errs, options[i](f))
	}

	return f, errs.ErrorOrNil()
}
// Exec reads input line by line and routes each line by how often it has been
// seen so far during this call:
//
//   - a line not yet seen is written to unique;
//   - a line seen before is written to repeated the first time it recurs;
//   - any further recurrence is written to neither stream.
//
// Lines are written including their trailing newline. A final line that is
// not newline-terminated has a newline appended before it is compared and
// written. When CaseI is set, lines are compared by their lowercased form but
// written as originally read.
//
// Whether a line counts as seen is decided by cuckoo filter lookups, using
// two filters of FilterCapacity each that are created fresh for every call.
//
// Exec returns nil once input is exhausted, or the first read or write error
// it encounters.
func (u *Filter) Exec(input io.Reader, unique, repeated io.Writer) error {
	seen := cuckoo.NewFilter(u.FilterCapacity)
	reported := cuckoo.NewFilter(u.FilterCapacity)
	lines := bufio.NewReader(input)

	for {
		line, readErr := lines.ReadBytes('\n')
		atEOF := readErr == io.EOF

		switch {
		case atEOF && len(line) == 0:
			// Nothing left after the last newline.
			return nil
		case atEOF:
			// Unterminated final line: normalise it so it matches
			// earlier newline-terminated occurrences.
			line = append(line, '\n')
		case readErr != nil:
			return readErr
		}

		// key is what gets compared; line is what gets written.
		key := line
		if u.CaseI {
			key = []byte(strings.ToLower(string(line)))
		}

		if seen.Lookup(key) {
			// Only the first recurrence is reported as repeated.
			if !reported.Lookup(key) {
				if _, err := repeated.Write(line); err != nil {
					return err
				}
			}
			reported.InsertUnique(key)
		} else if _, err := unique.Write(line); err != nil {
			return err
		}
		seen.InsertUnique(key)

		if atEOF {
			return nil
		}
	}
}