From 7c8bf2c093ae25f47aca4061c7acb043d24bedd2 Mon Sep 17 00:00:00 2001 From: Joe Tsai Date: Thu, 29 Apr 2021 17:40:59 -0700 Subject: [PATCH] Print as text if mostly text The previous heuristic of treating strings as binary data if it contains any invalid UTF-8 was too strict. Loosen the heuristic to check if most of the characters are printable text. Fixes #257 --- cmp/compare_test.go | 5 +++++ cmp/report_slices.go | 35 ++++++++++++++++++----------------- cmp/testdata/diffs | 19 +++++++++++++++++++ 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/cmp/compare_test.go b/cmp/compare_test.go index baae591..649c917 100644 --- a/cmp/compare_test.go +++ b/cmp/compare_test.go @@ -1307,6 +1307,11 @@ using the AllowUnexported option.`, "\n"), x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,#=_value _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,#=_value _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,#=_value _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,#=_value _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c,#=_value _value=4 41\t", y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\t", reason: "leading/trailing equal spans should not appear in diff lines", + }, { + label: label + "/MostlyTextString", + x: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n", + y: "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n", + reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text", }, { label: label + "/AllLinesDiffer", x: "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n", diff --git a/cmp/report_slices.go b/cmp/report_slices.go index bd8ca15..2ad3bc8 100644 --- a/cmp/report_slices.go +++ b/cmp/report_slices.go @@ -7,6 +7,7 @@ package cmp import ( "bytes" "fmt" + "math" "reflect" "strconv" "strings" @@ -96,16 +97,16 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { } // Auto-detect the type of the data. - var isLinedText, isText, isBinary bool var sx, sy string var ssx, ssy []string + var isString, isMostlyText, isPureLinedText, isBinary bool switch { case t.Kind() == reflect.String: sx, sy = vx.String(), vy.String() - isText = true // Initial estimate, verify later + isString = true case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)): sx, sy = string(vx.Bytes()), string(vy.Bytes()) - isBinary = true // Initial estimate, verify later + isString = true case t.Kind() == reflect.Array: // Arrays need to be addressable for slice operations to work. vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem() @@ -113,13 +114,12 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { vy2.Set(vy) vx, vy = vx2, vy2 } - if isText || isBinary { - var numLines, lastLineIdx, maxLineLen int - isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy) + if isString { + var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int for i, r := range sx + sy { - if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError { - isBinary = true - break + numTotalRunes++ + if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError { + numValidRunes++ } if r == '\n' { if maxLineLen < i-lastLineIdx { @@ -129,12 +129,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { numLines++ } } - isText = !isBinary - isLinedText = isText && numLines >= 4 && maxLineLen <= 1024 + isPureText := numValidRunes == numTotalRunes + isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes)) + isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024 + isBinary = !isMostlyText // Avoid diffing by lines if it produces a significantly more complex // edit script than diffing by bytes. - if isLinedText { + if isPureLinedText { ssx = strings.Split(sx, "\n") ssy = strings.Split(sy, "\n") esLines := diff.Difference(len(ssx), len(ssy), func(ix, iy int) diff.Result { @@ -145,7 +147,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { }) efficiencyLines := float64(esLines.Dist()) / float64(len(esLines)) efficiencyBytes := float64(esBytes.Dist()) / float64(len(esBytes)) - isLinedText = efficiencyLines < 4*efficiencyBytes + isPureLinedText = efficiencyLines < 4*efficiencyBytes } } @@ -155,7 +157,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { switch { // If the text appears to be multi-lined text, // then perform differencing across individual lines. - case isLinedText: + case isPureLinedText: list = opts.formatDiffSlice( reflect.ValueOf(ssx), reflect.ValueOf(ssy), 1, "line", func(v reflect.Value, d diffMode) textRecord { @@ -244,7 +246,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { // If the text appears to be single-lined text, // then perform differencing in approximately fixed-sized chunks. // The output is printed as quoted strings. - case isText: + case isMostlyText: list = opts.formatDiffSlice( reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte", func(v reflect.Value, d diffMode) textRecord { @@ -252,7 +254,6 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { return textRecord{Diff: d, Value: textLine(s)} }, ) - delim = "" // If the text appears to be binary data, // then perform differencing in approximately fixed-sized chunks. @@ -314,7 +315,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode { // Wrap the output with appropriate type information. var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"} - if !isText { + if !isMostlyText { // The "{...}" byte-sequence literal is not valid Go syntax for strings. // Emit the type for extra clarity (e.g. "string{...}"). if t.Kind() == reflect.String { diff --git a/cmp/testdata/diffs b/cmp/testdata/diffs index c02df29..a3d5909 100644 --- a/cmp/testdata/diffs +++ b/cmp/testdata/diffs @@ -1065,6 +1065,25 @@ ` _value=4 41 `, }, "") >>> TestDiff/Reporter/SurroundingEqualElements +<<< TestDiff/Reporter/MostlyTextString + strings.Join({ + "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa", +- ",\xff=_value", + " _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1", + "=a,tag2=bb", +- ",\xff=_value", + " _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1", + "=b,tag2=cc", +- ",\xff=_value", + " _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1", + "=a,tag2=dd", +- ",\xff=_value", + " _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1", + "=c", +- ",\xff=_value", + " _value=4 41\n", + }, "") +>>> TestDiff/Reporter/MostlyTextString <<< TestDiff/Reporter/AllLinesDiffer strings.Join({ + "X",