hashicorp · bflad · Nov 22, 2022 · Nov 18, 2022 · Nov 22, 2022
@@ -0,0 +1,7 @@
+```release-note:bug
+tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
+```
+
+```release-note:bug
+tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
+```
@@ -1,15 +1,17 @@
 package toproto
 
 import (
+	"unicode/utf8"
+
 	"github.com/hashicorp/terraform-plugin-go/tfprotov5"
 	"github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5"
 )
 
 func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) {
 	diag := &tfplugin5.Diagnostic{
 		Severity: Diagnostic_Severity(in.Severity),
-		Summary:  in.Summary,
-		Detail:   in.Detail,
+		Summary:  forceValidUTF8(in.Summary),
+		Detail:   forceValidUTF8(in.Detail),
 	}
 	if in.Attribute != nil {
 		attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) {
 	return diagnostics, nil
 }
 
+// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
+// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
+// the Unicode Replacement Character (\uFFFD).
+//
+// The protobuf serialization library will reject invalid UTF-8 with an
+// unhelpful error message:
+//
+//	string field contains invalid UTF-8
+//
+// Passing a string result through this function makes invalid UTF-8 instead
+// emerge as placeholder characters on the other side of the wire protocol,
+// giving a better chance of still returning a partially-legible message
+// instead of a generic character encoding error.
+//
+// This is intended for user-facing messages such as diagnostic summary and
+// detail messages, where Terraform will just treat the value as opaque and
+// it's ultimately up to the user and their terminal or web browser to
+// interpret the result. Don't use this for strings that have machine-readable
+// meaning.
+func forceValidUTF8(s string) string {
+	// Most strings that pass through here will already be valid UTF-8 and
+	// utf8.ValidString has a fast path which will beat our rune-by-rune
+	// analysis below, so it's worth the cost of walking the string twice
+	// in the rarer invalid case.
+	if utf8.ValidString(s) {
+		return s
+	}
+
+	// If we get down here then we know there's at least one invalid UTF-8
+	// sequence in the string, so in this slow path we'll reconstruct the
+	// string one rune at a time, guaranteeing that we'll only write valid
+	// UTF-8 sequences into the resulting buffer.
+	//
+	// Any invalid string will grow at least a little larger as a result of
+	// this operation because we'll be replacing each invalid byte with
+	// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
+	// the replacement character \uFFFD. 9 is a magic number giving room for
+	// three such expansions without any further allocation.
+	ret := make([]byte, 0, len(s)+9)
+	for {
+		// If the first byte in s is not the start of a valid UTF-8 sequence
+		// then the following will return utf8.RuneError, 1, where
+		// utf8.RuneError is the unicode replacement character.
+		r, advance := utf8.DecodeRuneInString(s)
+		if advance == 0 {
+			break
+		}
+		s = s[advance:]
+		ret = utf8.AppendRune(ret, r)
+	}
+	return string(ret)
+}
+
 // we have to say this next thing to get golint to stop yelling at us about the
 // underscores in the function names. We want the function names to match
 // actually-generated code, so it feels like fair play. It's just a shame we

@@ -0,0 +1,58 @@
+package toproto
+
+import (
+	"testing"
+)
+
+func TestForceValidUTF8(t *testing.T) {
+	tests := []struct {
+		Input string
+		Want  string
+	}{
+		{
+			"hello",
+			"hello",
+		},
+		{
+			"こんにちは",
+			"こんにちは",
+		},
+		{
+			"baﬄe", // NOTE: "ﬄ" is a single-character ligature
+			"baﬄe", // ligature is preserved exactly
+		},
+		{
+			"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
+			"wé́́é́́é́́!", // diacritics are preserved exactly
+		},
+		{
+			"😸😾", // Astral-plane characters
+			"😸😾", // preserved exactly
+		},
+		{
+			"\xff\xff",     // neither byte is valid UTF-8
+			"\ufffd\ufffd", // both are replaced by replacement character
+		},
+		{
+			"\xff\xff\xff\xff\xff",           // more than three invalid bytes
+			"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
+		},
+		{
+			"t\xffe\xffst",     // invalid bytes interleaved with other content
+			"t\ufffde\ufffdst", // the valid content is preserved
+		},
+		{
+			"\xffこんにちは\xffこんにちは",     // invalid bytes interacting with multibyte sequences
+			"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.Input, func(t *testing.T) {
+			got := forceValidUTF8(test.Input)
+			if got != test.Want {
+				t.Errorf("wrong result\ngot:  %q\nwant: %q", got, test.Want)
+			}
+		})
+	}
+}
@@ -1,15 +1,17 @@
 package toproto
 
 import (
+	"unicode/utf8"
+
 	"github.com/hashicorp/terraform-plugin-go/tfprotov6"
 	"github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6"
 )
 
 func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) {
 	diag := &tfplugin6.Diagnostic{
 		Severity: Diagnostic_Severity(in.Severity),
-		Summary:  in.Summary,
-		Detail:   in.Detail,
+		Summary:  forceValidUTF8(in.Summary),
+		Detail:   forceValidUTF8(in.Detail),
 	}
 	if in.Attribute != nil {
 		attr, err := AttributePath(in.Attribute)
@@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) {
 	return diagnostics, nil
 }
 
+// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
+// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
+// the Unicode Replacement Character (\uFFFD).
+//
+// The protobuf serialization library will reject invalid UTF-8 with an
+// unhelpful error message:
+//
+//	string field contains invalid UTF-8
+//
+// Passing a string result through this function makes invalid UTF-8 instead
+// emerge as placeholder characters on the other side of the wire protocol,
+// giving a better chance of still returning a partially-legible message
+// instead of a generic character encoding error.
+//
+// This is intended for user-facing messages such as diagnostic summary and
+// detail messages, where Terraform will just treat the value as opaque and
+// it's ultimately up to the user and their terminal or web browser to
+// interpret the result. Don't use this for strings that have machine-readable
+// meaning.
+func forceValidUTF8(s string) string {
+	// Most strings that pass through here will already be valid UTF-8 and
+	// utf8.ValidString has a fast path which will beat our rune-by-rune
+	// analysis below, so it's worth the cost of walking the string twice
+	// in the rarer invalid case.
+	if utf8.ValidString(s) {
+		return s
+	}
+
+	// If we get down here then we know there's at least one invalid UTF-8
+	// sequence in the string, so in this slow path we'll reconstruct the
+	// string one rune at a time, guaranteeing that we'll only write valid
+	// UTF-8 sequences into the resulting buffer.
+	//
+	// Any invalid string will grow at least a little larger as a result of
+	// this operation because we'll be replacing each invalid byte with
+	// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
+	// the replacement character \uFFFD. 9 is a magic number giving room for
+	// three such expansions without any further allocation.
+	ret := make([]byte, 0, len(s)+9)
+	for {
+		// If the first byte in s is not the start of a valid UTF-8 sequence
+		// then the following will return utf8.RuneError, 1, where
+		// utf8.RuneError is the unicode replacement character.
+		r, advance := utf8.DecodeRuneInString(s)
+		if advance == 0 {
+			break
+		}
+		s = s[advance:]
+		ret = utf8.AppendRune(ret, r)
+	}
+	return string(ret)
+}
+
 // we have to say this next thing to get golint to stop yelling at us about the
 // underscores in the function names. We want the function names to match
 // actually-generated code, so it feels like fair play. It's just a shame we

@@ -0,0 +1,58 @@
+package toproto
+
+import (
+	"testing"
+)
+
+func TestForceValidUTF8(t *testing.T) {
+	tests := []struct {
+		Input string
+		Want  string
+	}{
+		{
+			"hello",
+			"hello",
+		},
+		{
+			"こんにちは",
+			"こんにちは",
+		},
+		{
+			"baﬄe", // NOTE: "ﬄ" is a single-character ligature
+			"baﬄe", // ligature is preserved exactly
+		},
+		{
+			"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
+			"wé́́é́́é́́!", // diacritics are preserved exactly
+		},
+		{
+			"😸😾", // Astral-plane characters
+			"😸😾", // preserved exactly
+		},
+		{
+			"\xff\xff",     // neither byte is valid UTF-8
+			"\ufffd\ufffd", // both are replaced by replacement character
+		},
+		{
+			"\xff\xff\xff\xff\xff",           // more than three invalid bytes
+			"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
+		},
+		{
+			"t\xffe\xffst",     // invalid bytes interleaved with other content
+			"t\ufffde\ufffdst", // the valid content is preserved
+		},
+		{
+			"\xffこんにちは\xffこんにちは",     // invalid bytes interacting with multibyte sequences
+			"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.Input, func(t *testing.T) {
+			got := forceValidUTF8(test.Input)
+			if got != test.Want {
+				t.Errorf("wrong result\ngot:  %q\nwant: %q", got, test.Want)
+			}
+		})
+	}
+}