Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace plain decoder with utf8 #922

Merged
merged 1 commit into from Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/decoders/decoders.go
Expand Up @@ -6,7 +6,7 @@ import (

// DefaultDecoders returns the decoders listed below, in the order written.
//
// NOTE(review): this listing looks like a diff view whose +/- markers were
// stripped; &Plain{} is presumably the removed line and &UTF8{} its
// replacement — confirm against the real file before relying on both
// entries being present.
func DefaultDecoders() []Decoder {
return []Decoder{
&Plain{},
&UTF8{},
&Base64{},
}
}
Expand Down
14 changes: 0 additions & 14 deletions pkg/decoders/plain.go

This file was deleted.

44 changes: 44 additions & 0 deletions pkg/decoders/utf8.go
@@ -0,0 +1,44 @@
package decoders

import (
"bytes"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// UTF8 is a stateless decoder: chunks holding valid UTF-8 pass through
// untouched, anything else is reduced by extractSubstrings.
type UTF8 struct{}

// FromChunk returns nil when chunk is nil or carries no data. Otherwise it
// returns the very same chunk pointer; if the data is not valid UTF-8,
// chunk.Data is first overwritten in place with the result of
// extractSubstrings.
func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
	if chunk == nil {
		return nil
	}
	if len(chunk.Data) == 0 {
		return nil
	}

	// Well-formed input needs no rewriting.
	if utf8.Valid(chunk.Data) {
		return chunk
	}

	chunk.Data = extractSubstrings(chunk.Data)
	return chunk
}

// extractSubstrings behaves much like the strings binutil: it pulls the
// contiguous runs of printable ASCII out of b, discards the runs that are
// too short to be interesting, and returns the rest joined by newlines.
func extractSubstrings(b []byte) []byte {
	// https://www.rapidtables.com/code/text/ascii-table.html
	// Every rune outside ASCII space (32) through tilde (126) is a separator.
	isSeparator := func(r rune) bool {
		return r <= 31 || r >= 127
	}

	var kept [][]byte
	for _, run := range bytes.FieldsFunc(b, isSeparator) {
		trimmed := bytes.TrimSpace(run)
		// Runs shorter than 6 bytes after trimming are dropped.
		if len(trimmed) < 6 {
			continue
		}
		kept = append(kept, trimmed)
	}

	return bytes.Join(kept, []byte("\n"))
}
87 changes: 87 additions & 0 deletions pkg/decoders/utf8_test.go
@@ -0,0 +1,87 @@
package decoders

import (
"testing"

"github.com/kylelemons/godebug/pretty"
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
)

// TestUTF8_FromChunk exercises UTF8.FromChunk with a valid UTF-8 chunk, a
// chunk containing invalid bytes, and a nil chunk.
func TestUTF8_FromChunk(t *testing.T) {
	type args struct {
		chunk *sources.Chunk
	}
	tests := []struct {
		name string
		d    *UTF8
		args args
		want *sources.Chunk
	}{
		{
			name: "successful UTF8 decode",
			d:    &UTF8{},
			args: args{
				chunk: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
			},
			want: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")},
		},
		{
			name: "successful binary decode",
			d:    &UTF8{},
			args: args{
				chunk: &sources.Chunk{Data: []byte("\xf0\x28\x8c\x28 not-entirely utf8 chunk that should decode successfully")},
			},
			want: &sources.Chunk{Data: []byte("( not-entirely utf8 chunk that should decode successfully")},
		},
		{
			name: "unsuccessful decode",
			d:    &UTF8{},
			args: args{
				chunk: nil,
			},
			want: nil,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Use the decoder declared in the table rather than a fresh one.
			got := tt.d.FromChunk(tt.args.chunk)
			if got != nil && tt.want != nil {
				// Compare the payloads as strings so the diff is readable.
				if diff := pretty.Compare(string(got.Data), string(tt.want.Data)); diff != "" {
					t.Errorf("%s: UTF8.FromChunk() diff: (-got +want)\n%s", tt.name, diff)
				}
				return
			}
			// At least one side is nil: compare the chunk pointers themselves.
			if diff := pretty.Compare(got, tt.want); diff != "" {
				t.Errorf("%s: UTF8.FromChunk() diff: (-got +want)\n%s", tt.name, diff)
			}
		})
	}
}

// testBytes is the input fed to Benchmark_extractSubstrings: multi-line
// text mixing blank lines, longer lines, and very short words.
var testBytes = []byte(`some words with random spaces and

newlines with
arbitrary length
of

hey

the lines themselves.

and
short
words
that
go
away.`)

// Benchmark_extractSubstrings times extractSubstrings against the fixed
// testBytes input, calling it b.N times.
func Benchmark_extractSubstrings(b *testing.B) {
	for n := 0; n < b.N; n++ {
		_ = extractSubstrings(testBytes)
	}
}
2 changes: 1 addition & 1 deletion pkg/engine/engine.go
Expand Up @@ -185,7 +185,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
for _, decoder := range e.decoders {
var decoderType detectorspb.DecoderType
switch decoder.(type) {
case *decoders.Plain:
case *decoders.UTF8:
decoderType = detectorspb.DecoderType_PLAIN
case *decoders.Base64:
decoderType = detectorspb.DecoderType_BASE64
Expand Down