From 7c8bf2c093ae25f47aca4061c7acb043d24bedd2 Mon Sep 17 00:00:00 2001
From: Joe Tsai <joetsai@digital-static.net>
Date: Thu, 29 Apr 2021 17:40:59 -0700
Subject: [PATCH] Print as text if mostly text

The previous heuristic of treating strings as binary data
if it contains any invalid UTF-8 was too strict.
Loosen the heuristic to check if most of the characters
are printable text.

Fixes #257
---
 cmp/compare_test.go  |  5 +++++
 cmp/report_slices.go | 35 ++++++++++++++++++-----------------
 cmp/testdata/diffs   | 19 +++++++++++++++++++
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/cmp/compare_test.go b/cmp/compare_test.go
index baae591..649c917 100644
--- a/cmp/compare_test.go
+++ b/cmp/compare_test.go
@@ -1307,6 +1307,11 @@ using the AllowUnexported option.`, "\n"),
 		x:      "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,#=_value _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,#=_value _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,#=_value _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,#=_value _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c,#=_value _value=4 41\t",
 		y:      "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\torg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\torg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\t",
 		reason: "leading/trailing equal spans should not appear in diff lines",
+	}, {
+		label:  label + "/MostlyTextString",
+		x:      "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa,\xff=_value _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb,\xff=_value _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc,\xff=_value _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd,\xff=_value _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c,\xff=_value _value=4 41\n",
+		y:      "org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=bb _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=b,tag2=cc _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=dd _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1=c _value=4 41\n",
+		reason: "the presence of a few invalid UTF-8 characters should not prevent printing this as text",
 	}, {
 		label:  label + "/AllLinesDiffer",
 		x:      "d5c14bdf6bac81c27afc5429500ed750\n25483503b557c606dad4f144d27ae10b\n90bdbcdbb6ea7156068e3dcfb7459244\n978f480a6e3cced51e297fbff9a506b7\n",
diff --git a/cmp/report_slices.go b/cmp/report_slices.go
index bd8ca15..2ad3bc8 100644
--- a/cmp/report_slices.go
+++ b/cmp/report_slices.go
@@ -7,6 +7,7 @@ package cmp
 import (
 	"bytes"
 	"fmt"
+	"math"
 	"reflect"
 	"strconv"
 	"strings"
@@ -96,16 +97,16 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 	}
 
 	// Auto-detect the type of the data.
-	var isLinedText, isText, isBinary bool
 	var sx, sy string
 	var ssx, ssy []string
+	var isString, isMostlyText, isPureLinedText, isBinary bool
 	switch {
 	case t.Kind() == reflect.String:
 		sx, sy = vx.String(), vy.String()
-		isText = true // Initial estimate, verify later
+		isString = true
 	case t.Kind() == reflect.Slice && t.Elem() == reflect.TypeOf(byte(0)):
 		sx, sy = string(vx.Bytes()), string(vy.Bytes())
-		isBinary = true // Initial estimate, verify later
+		isString = true
 	case t.Kind() == reflect.Array:
 		// Arrays need to be addressable for slice operations to work.
 		vx2, vy2 := reflect.New(t).Elem(), reflect.New(t).Elem()
@@ -113,13 +114,12 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 		vy2.Set(vy)
 		vx, vy = vx2, vy2
 	}
-	if isText || isBinary {
-		var numLines, lastLineIdx, maxLineLen int
-		isBinary = !utf8.ValidString(sx) || !utf8.ValidString(sy)
+	if isString {
+		var numTotalRunes, numValidRunes, numLines, lastLineIdx, maxLineLen int
 		for i, r := range sx + sy {
-			if !(unicode.IsPrint(r) || unicode.IsSpace(r)) || r == utf8.RuneError {
-				isBinary = true
-				break
+			numTotalRunes++
+			if (unicode.IsPrint(r) || unicode.IsSpace(r)) && r != utf8.RuneError {
+				numValidRunes++
 			}
 			if r == '\n' {
 				if maxLineLen < i-lastLineIdx {
@@ -129,12 +129,14 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 				numLines++
 			}
 		}
-		isText = !isBinary
-		isLinedText = isText && numLines >= 4 && maxLineLen <= 1024
+		isPureText := numValidRunes == numTotalRunes
+		isMostlyText = float64(numValidRunes) > math.Floor(0.90*float64(numTotalRunes))
+		isPureLinedText = isPureText && numLines >= 4 && maxLineLen <= 1024
+		isBinary = !isMostlyText
 
 		// Avoid diffing by lines if it produces a significantly more complex
 		// edit script than diffing by bytes.
-		if isLinedText {
+		if isPureLinedText {
 			ssx = strings.Split(sx, "\n")
 			ssy = strings.Split(sy, "\n")
 			esLines := diff.Difference(len(ssx), len(ssy), func(ix, iy int) diff.Result {
@@ -145,7 +147,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 			})
 			efficiencyLines := float64(esLines.Dist()) / float64(len(esLines))
 			efficiencyBytes := float64(esBytes.Dist()) / float64(len(esBytes))
-			isLinedText = efficiencyLines < 4*efficiencyBytes
+			isPureLinedText = efficiencyLines < 4*efficiencyBytes
 		}
 	}
 
@@ -155,7 +157,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 	switch {
 	// If the text appears to be multi-lined text,
 	// then perform differencing across individual lines.
-	case isLinedText:
+	case isPureLinedText:
 		list = opts.formatDiffSlice(
 			reflect.ValueOf(ssx), reflect.ValueOf(ssy), 1, "line",
 			func(v reflect.Value, d diffMode) textRecord {
@@ -244,7 +246,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 	// If the text appears to be single-lined text,
 	// then perform differencing in approximately fixed-sized chunks.
 	// The output is printed as quoted strings.
-	case isText:
+	case isMostlyText:
 		list = opts.formatDiffSlice(
 			reflect.ValueOf(sx), reflect.ValueOf(sy), 64, "byte",
 			func(v reflect.Value, d diffMode) textRecord {
@@ -252,7 +254,6 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 				return textRecord{Diff: d, Value: textLine(s)}
 			},
 		)
-		delim = ""
 
 	// If the text appears to be binary data,
 	// then perform differencing in approximately fixed-sized chunks.
@@ -314,7 +315,7 @@ func (opts formatOptions) FormatDiffSlice(v *valueNode) textNode {
 
 	// Wrap the output with appropriate type information.
 	var out textNode = &textWrap{Prefix: "{", Value: list, Suffix: "}"}
-	if !isText {
+	if !isMostlyText {
 		// The "{...}" byte-sequence literal is not valid Go syntax for strings.
 		// Emit the type for extra clarity (e.g. "string{...}").
 		if t.Kind() == reflect.String {
diff --git a/cmp/testdata/diffs b/cmp/testdata/diffs
index c02df29..a3d5909 100644
--- a/cmp/testdata/diffs
+++ b/cmp/testdata/diffs
@@ -1065,6 +1065,25 @@
   	` _value=4 41	`,
   }, "")
 >>> TestDiff/Reporter/SurroundingEqualElements
+<<< TestDiff/Reporter/MostlyTextString
+  strings.Join({
+  	"org-4747474747474747,bucket-4242424242424242:m,tag1=a,tag2=aa",
+- 	",\xff=_value",
+  	" _value=2 11\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+  	"=a,tag2=bb",
+- 	",\xff=_value",
+  	" _value=2 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+  	"=b,tag2=cc",
+- 	",\xff=_value",
+  	" _value=1 21\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+  	"=a,tag2=dd",
+- 	",\xff=_value",
+  	" _value=3 31\norg-4747474747474747,bucket-4242424242424242:m,tag1",
+  	"=c",
+- 	",\xff=_value",
+  	" _value=4 41\n",
+  }, "")
+>>> TestDiff/Reporter/MostlyTextString
 <<< TestDiff/Reporter/AllLinesDiffer
   strings.Join({
 + 	"X",