diff --git a/pkg/decoders/decoders.go b/pkg/decoders/decoders.go index 3a20e1d9b36f..d069ee5511f1 100644 --- a/pkg/decoders/decoders.go +++ b/pkg/decoders/decoders.go @@ -6,7 +6,7 @@ import ( func DefaultDecoders() []Decoder { return []Decoder{ - &Plain{}, + &UTF8{}, &Base64{}, } } diff --git a/pkg/decoders/plain.go b/pkg/decoders/plain.go deleted file mode 100644 index 9310194cbfa6..000000000000 --- a/pkg/decoders/plain.go +++ /dev/null @@ -1,14 +0,0 @@ -package decoders - -import ( - "github.com/trufflesecurity/trufflehog/v3/pkg/sources" -) - -// Ensure the Decoder satisfies the interface at compile time -var _ Decoder = (*Plain)(nil) - -type Plain struct{} - -func (d *Plain) FromChunk(chunk *sources.Chunk) *sources.Chunk { - return chunk -} diff --git a/pkg/decoders/utf8.go b/pkg/decoders/utf8.go new file mode 100644 index 000000000000..6a494a201c54 --- /dev/null +++ b/pkg/decoders/utf8.go @@ -0,0 +1,44 @@ +package decoders + +import ( + "bytes" + "unicode/utf8" + + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +type UTF8 struct{} + +func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk { + if chunk == nil || len(chunk.Data) == 0 { + return nil + } + + if !utf8.Valid(chunk.Data) { + chunk.Data = extractSubstrings(chunk.Data) + return chunk + } + + return chunk +} + +// extractSubstrings performs similarly to the strings binutil, +// extracting contiguous portions of printable characters that we care +// about from some bytes +func extractSubstrings(b []byte) []byte { + fields := bytes.FieldsFunc(b, func(r rune) bool { + // https://www.rapidtables.com/code/text/ascii-table.html + // split on anything that is not ascii space through tilde + return !(r > 31 && r < 127) + }) + + keep := [][]byte{} + for _, field := range fields { + // Remove fields shorter than 6 characters. 
+ if bts := bytes.TrimSpace(field); len(bts) > 5 { + keep = append(keep, bts) + } + } + + return bytes.Join(keep, []byte("\n")) +} diff --git a/pkg/decoders/utf8_test.go b/pkg/decoders/utf8_test.go new file mode 100644 index 000000000000..283195e66860 --- /dev/null +++ b/pkg/decoders/utf8_test.go @@ -0,0 +1,87 @@ +package decoders + +import ( + "testing" + + "github.com/kylelemons/godebug/pretty" + "github.com/trufflesecurity/trufflehog/v3/pkg/sources" +) + +func TestUTF8_FromChunk(t *testing.T) { + type args struct { + chunk *sources.Chunk + } + tests := []struct { + name string + d *UTF8 + args args + want *sources.Chunk + wantErr bool + }{ + { + name: "successful UTF8 decode", + d: &UTF8{}, + args: args{ + chunk: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")}, + }, + want: &sources.Chunk{Data: []byte("plain 'ol chunk that should decode successfully")}, + wantErr: false, + }, + { + name: "successful binary decode", + d: &UTF8{}, + args: args{ + chunk: &sources.Chunk{Data: []byte("\xf0\x28\x8c\x28 not-entirely utf8 chunk that should decode successfully")}, + }, + want: &sources.Chunk{Data: []byte("( not-entirely utf8 chunk that should decode successfully")}, + wantErr: false, + }, + { + name: "unsuccessful decode", + d: &UTF8{}, + args: args{ + chunk: nil, + }, + want: nil, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + d := &UTF8{} + got := d.FromChunk(tt.args.chunk) + if got != nil && tt.want != nil { + if diff := pretty.Compare(string(got.Data), string(tt.want.Data)); diff != "" { + t.Errorf("%s: UTF8.FromChunk() diff: (-got +want)\n%s", tt.name, diff) + } + } else { + if diff := pretty.Compare(got, tt.want); diff != "" { + t.Errorf("%s: UTF8.FromChunk() diff: (-got +want)\n%s", tt.name, diff) + } + } + }) + } +} + +var testBytes = []byte(`some words with random spaces and + +newlines with +arbitrary length +of + + hey + +the lines themselves. 
+ +and +short +words +that +go +away.`) + +func Benchmark_extractSubstrings(b *testing.B) { + for i := 0; i < b.N; i++ { + extractSubstrings(testBytes) + } +} diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go index 095a5a33492e..651007e70fc3 100644 --- a/pkg/engine/engine.go +++ b/pkg/engine/engine.go @@ -185,7 +185,7 @@ func (e *Engine) detectorWorker(ctx context.Context) { for _, decoder := range e.decoders { var decoderType detectorspb.DecoderType switch decoder.(type) { - case *decoders.Plain: + case *decoders.UTF8: decoderType = detectorspb.DecoderType_PLAIN case *decoders.Base64: decoderType = detectorspb.DecoderType_BASE64