diff --git a/.changelog/237.txt b/.changelog/237.txt new file mode 100644 index 00000000..c81e94dd --- /dev/null +++ b/.changelog/237.txt @@ -0,0 +1,7 @@ +```release-note:bug +tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case. +``` + +```release-note:bug +tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case. +``` diff --git a/tfprotov5/internal/toproto/diagnostic.go b/tfprotov5/internal/toproto/diagnostic.go index 979896f7..81d692ce 100644 --- a/tfprotov5/internal/toproto/diagnostic.go +++ b/tfprotov5/internal/toproto/diagnostic.go @@ -1,6 +1,8 @@ package toproto import ( + "unicode/utf8" + "github.com/hashicorp/terraform-plugin-go/tfprotov5" "github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5" ) @@ -8,8 +10,8 @@ import ( func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) { diag := &tfplugin5.Diagnostic{ Severity: Diagnostic_Severity(in.Severity), - Summary: in.Summary, - Detail: in.Detail, + Summary: forceValidUTF8(in.Summary), + Detail: forceValidUTF8(in.Detail), } if in.Attribute != nil { attr, err := AttributePath(in.Attribute) @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) { return diagnostics, nil } +// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the +// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of +// the Unicode Replacement Character (\uFFFD). +// +// The protobuf serialization library will reject invalid UTF-8 with an +// unhelpful error message: +// +// string field contains invalid UTF-8 +// +// Passing a string result through this function makes invalid UTF-8 instead +// emerge as placeholder characters on the other side of the wire protocol, +// giving a better chance of still returning a partially-legible message +// instead of a generic character encoding error. +// +// This is intended for user-facing messages such as diagnostic summary and +// detail messages, where Terraform will just treat the value as opaque and +// it's ultimately up to the user and their terminal or web browser to +// interpret the result. Don't use this for strings that have machine-readable +// meaning. +func forceValidUTF8(s string) string { + // Most strings that pass through here will already be valid UTF-8 and + // utf8.ValidString has a fast path which will beat our rune-by-rune + // analysis below, so it's worth the cost of walking the string twice + // in the rarer invalid case. + if utf8.ValidString(s) { + return s + } + + // If we get down here then we know there's at least one invalid UTF-8 + // sequence in the string, so in this slow path we'll reconstruct the + // string one rune at a time, guaranteeing that we'll only write valid + // UTF-8 sequences into the resulting buffer. + // + // Any invalid string will grow at least a little larger as a result of + // this operation because we'll be replacing each invalid byte with + // the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of + // the replacement character \uFFFD. 9 is a magic number giving room for + // three such expansions without any further allocation. + ret := make([]byte, 0, len(s)+9) + for { + // If the first byte in s is not the start of a valid UTF-8 sequence + // then the following will return utf8.RuneError, 1, where + // utf8.RuneError is the unicode replacement character. + r, advance := utf8.DecodeRuneInString(s) + if advance == 0 { + break + } + s = s[advance:] + ret = utf8.AppendRune(ret, r) + } + return string(ret) +} + // we have to say this next thing to get golint to stop yelling at us about the // underscores in the function names. We want the function names to match // actually-generated code, so it feels like fair play. It's just a shame we diff --git a/tfprotov5/internal/toproto/diagnostic_test.go b/tfprotov5/internal/toproto/diagnostic_test.go new file mode 100644 index 00000000..79e5eeb7 --- /dev/null +++ b/tfprotov5/internal/toproto/diagnostic_test.go @@ -0,0 +1,58 @@ +package toproto + +import ( + "testing" +) + +func TestForceValidUTF8(t *testing.T) { + tests := []struct { + Input string + Want string + }{ + { + "hello", + "hello", + }, + { + "こんにちは", + "こんにちは", + }, + { + "baffle", // NOTE: "ffl" is a single-character ligature + "baffle", // ligature is preserved exactly + }, + { + "wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics + "wé́́é́́é́́!", // diacritics are preserved exactly + }, + { + "😸😾", // Astral-plane characters + "😸😾", // preserved exactly + }, + { + "\xff\xff", // neither byte is valid UTF-8 + "\ufffd\ufffd", // both are replaced by replacement character + }, + { + "\xff\xff\xff\xff\xff", // more than three invalid bytes + "\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation + }, + { + "t\xffe\xffst", // invalid bytes interleaved with other content + "t\ufffde\ufffdst", // the valid content is preserved + }, + { + "\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences + "\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved + }, + } + + for _, test := range tests { + t.Run(test.Input, func(t *testing.T) { + got := forceValidUTF8(test.Input) + if got != test.Want { + t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want) + } + }) + } +} diff --git a/tfprotov6/internal/toproto/diagnostic.go b/tfprotov6/internal/toproto/diagnostic.go index 26ce3e9d..4144222c 100644 --- a/tfprotov6/internal/toproto/diagnostic.go +++ b/tfprotov6/internal/toproto/diagnostic.go @@ -1,6 +1,8 @@ package toproto import ( + "unicode/utf8" + "github.com/hashicorp/terraform-plugin-go/tfprotov6" "github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6" ) @@ -8,8 +10,8 @@ import ( func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) { diag := &tfplugin6.Diagnostic{ Severity: Diagnostic_Severity(in.Severity), - Summary: in.Summary, - Detail: in.Detail, + Summary: forceValidUTF8(in.Summary), + Detail: forceValidUTF8(in.Detail), } if in.Attribute != nil { attr, err := AttributePath(in.Attribute) @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) { return diagnostics, nil } +// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the +// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of +// the Unicode Replacement Character (\uFFFD). +// +// The protobuf serialization library will reject invalid UTF-8 with an +// unhelpful error message: +// +// string field contains invalid UTF-8 +// +// Passing a string result through this function makes invalid UTF-8 instead +// emerge as placeholder characters on the other side of the wire protocol, +// giving a better chance of still returning a partially-legible message +// instead of a generic character encoding error. +// +// This is intended for user-facing messages such as diagnostic summary and +// detail messages, where Terraform will just treat the value as opaque and +// it's ultimately up to the user and their terminal or web browser to +// interpret the result. Don't use this for strings that have machine-readable +// meaning. +func forceValidUTF8(s string) string { + // Most strings that pass through here will already be valid UTF-8 and + // utf8.ValidString has a fast path which will beat our rune-by-rune + // analysis below, so it's worth the cost of walking the string twice + // in the rarer invalid case. + if utf8.ValidString(s) { + return s + } + + // If we get down here then we know there's at least one invalid UTF-8 + // sequence in the string, so in this slow path we'll reconstruct the + // string one rune at a time, guaranteeing that we'll only write valid + // UTF-8 sequences into the resulting buffer. + // + // Any invalid string will grow at least a little larger as a result of + // this operation because we'll be replacing each invalid byte with + // the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of + // the replacement character \uFFFD. 9 is a magic number giving room for + // three such expansions without any further allocation. + ret := make([]byte, 0, len(s)+9) + for { + // If the first byte in s is not the start of a valid UTF-8 sequence + // then the following will return utf8.RuneError, 1, where + // utf8.RuneError is the unicode replacement character. + r, advance := utf8.DecodeRuneInString(s) + if advance == 0 { + break + } + s = s[advance:] + ret = utf8.AppendRune(ret, r) + } + return string(ret) +} + // we have to say this next thing to get golint to stop yelling at us about the // underscores in the function names. We want the function names to match // actually-generated code, so it feels like fair play. It's just a shame we diff --git a/tfprotov6/internal/toproto/diagnostic_test.go b/tfprotov6/internal/toproto/diagnostic_test.go new file mode 100644 index 00000000..79e5eeb7 --- /dev/null +++ b/tfprotov6/internal/toproto/diagnostic_test.go @@ -0,0 +1,58 @@ +package toproto + +import ( + "testing" +) + +func TestForceValidUTF8(t *testing.T) { + tests := []struct { + Input string + Want string + }{ + { + "hello", + "hello", + }, + { + "こんにちは", + "こんにちは", + }, + { + "baffle", // NOTE: "ffl" is a single-character ligature + "baffle", // ligature is preserved exactly + }, + { + "wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics + "wé́́é́́é́́!", // diacritics are preserved exactly + }, + { + "😸😾", // Astral-plane characters + "😸😾", // preserved exactly + }, + { + "\xff\xff", // neither byte is valid UTF-8 + "\ufffd\ufffd", // both are replaced by replacement character + }, + { + "\xff\xff\xff\xff\xff", // more than three invalid bytes + "\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation + }, + { + "t\xffe\xffst", // invalid bytes interleaved with other content + "t\ufffde\ufffdst", // the valid content is preserved + }, + { + "\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences + "\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved + }, + } + + for _, test := range tests { + t.Run(test.Input, func(t *testing.T) { + got := forceValidUTF8(test.Input) + if got != test.Want { + t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want) + } + }) + } +}