Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweak diagnostics with invalid UTF-8 so they can pass over the wire #237

Merged
merged 2 commits into from Nov 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changelog/237.txt
@@ -0,0 +1,7 @@
```release-note:bug
tfprotov5: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
```

```release-note:bug
tfprotov6: Allow diagnostic messages with incorrect UTF-8 encoding to pass through with the invalid sequences replaced with the Unicode Replacement Character. This avoids returning the unhelpful message "string field contains invalid UTF-8" in that case.
```
59 changes: 57 additions & 2 deletions tfprotov5/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
package toproto

import (
"unicode/utf8"

"github.com/hashicorp/terraform-plugin-go/tfprotov5"
"github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5"
)

func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) {
diag := &tfplugin5.Diagnostic{
Severity: Diagnostic_Severity(in.Severity),
Summary: in.Summary,
Detail: in.Detail,
Summary: forceValidUTF8(in.Summary),
Detail: forceValidUTF8(in.Detail),
}
if in.Attribute != nil {
attr, err := AttributePath(in.Attribute)
Expand Down Expand Up @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) {
return diagnostics, nil
}

// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
// the Unicode Replacement Character (\uFFFD).
//
// The protobuf serialization library will reject invalid UTF-8 with an
// unhelpful error message:
//
// string field contains invalid UTF-8
//
// Passing a string result through this function makes invalid UTF-8 instead
// emerge as placeholder characters on the other side of the wire protocol,
// giving a better chance of still returning a partially-legible message
// instead of a generic character encoding error.
//
// This is intended for user-facing messages such as diagnostic summary and
// detail messages, where Terraform will just treat the value as opaque and
// it's ultimately up to the user and their terminal or web browser to
// interpret the result. Don't use this for strings that have machine-readable
// meaning.
func forceValidUTF8(s string) string {
// Most strings that pass through here will already be valid UTF-8 and
// utf8.ValidString has a fast path which will beat our rune-by-rune
// analysis below, so it's worth the cost of walking the string twice
// in the rarer invalid case.
if utf8.ValidString(s) {
return s
}

// If we get down here then we know there's at least one invalid UTF-8
// sequence in the string, so in this slow path we'll reconstruct the
// string one rune at a time, guaranteeing that we'll only write valid
// UTF-8 sequences into the resulting buffer.
//
// Any invalid string will grow at least a little larger as a result of
// this operation because we'll be replacing each invalid byte with
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
// the replacement character \uFFFD. 9 is a magic number giving room for
// three such expansions without any further allocation.
ret := make([]byte, 0, len(s)+9)
for {
// If the first byte in s is not the start of a valid UTF-8 sequence
// then the following will return utf8.RuneError, 1, where
// utf8.RuneError is the unicode replacement character.
r, advance := utf8.DecodeRuneInString(s)
if advance == 0 {
break
}
s = s[advance:]
ret = utf8.AppendRune(ret, r)
}
return string(ret)
}

// we have to say this next thing to get golint to stop yelling at us about the
// underscores in the function names. We want the function names to match
// actually-generated code, so it feels like fair play. It's just a shame we
Expand Down
58 changes: 58 additions & 0 deletions tfprotov5/internal/toproto/diagnostic_test.go
@@ -0,0 +1,58 @@
package toproto

import (
"testing"
)

func TestForceValidUTF8(t *testing.T) {
tests := []struct {
Input string
Want string
}{
{
"hello",
"hello",
},
{
"こんにちは",
"こんにちは",
},
{
"baffle", // NOTE: "ffl" is a single-character ligature
"baffle", // ligature is preserved exactly
},
{
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
"wé́́é́́é́́!", // diacritics are preserved exactly
},
{
"😸😾", // Astral-plane characters
"😸😾", // preserved exactly
},
{
"\xff\xff", // neither byte is valid UTF-8
"\ufffd\ufffd", // both are replaced by replacement character
apparentlymart marked this conversation as resolved.
Show resolved Hide resolved
},
apparentlymart marked this conversation as resolved.
Show resolved Hide resolved
{
"\xff\xff\xff\xff\xff", // more than three invalid bytes
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
},
{
"t\xffe\xffst", // invalid bytes interleaved with other content
"t\ufffde\ufffdst", // the valid content is preserved
},
{
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
},
}

for _, test := range tests {
t.Run(test.Input, func(t *testing.T) {
got := forceValidUTF8(test.Input)
if got != test.Want {
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
}
})
}
}
59 changes: 57 additions & 2 deletions tfprotov6/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
package toproto

import (
"unicode/utf8"

"github.com/hashicorp/terraform-plugin-go/tfprotov6"
"github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6"
)

func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) {
diag := &tfplugin6.Diagnostic{
Severity: Diagnostic_Severity(in.Severity),
Summary: in.Summary,
Detail: in.Detail,
Summary: forceValidUTF8(in.Summary),
Detail: forceValidUTF8(in.Detail),
}
if in.Attribute != nil {
attr, err := AttributePath(in.Attribute)
Expand Down Expand Up @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) {
return diagnostics, nil
}

// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
// the Unicode Replacement Character (\uFFFD).
//
// The protobuf serialization library will reject invalid UTF-8 with an
// unhelpful error message:
//
// string field contains invalid UTF-8
//
// Passing a string result through this function makes invalid UTF-8 instead
// emerge as placeholder characters on the other side of the wire protocol,
// giving a better chance of still returning a partially-legible message
// instead of a generic character encoding error.
//
// This is intended for user-facing messages such as diagnostic summary and
// detail messages, where Terraform will just treat the value as opaque and
// it's ultimately up to the user and their terminal or web browser to
// interpret the result. Don't use this for strings that have machine-readable
// meaning.
func forceValidUTF8(s string) string {
// Most strings that pass through here will already be valid UTF-8 and
// utf8.ValidString has a fast path which will beat our rune-by-rune
// analysis below, so it's worth the cost of walking the string twice
// in the rarer invalid case.
if utf8.ValidString(s) {
return s
}

// If we get down here then we know there's at least one invalid UTF-8
// sequence in the string, so in this slow path we'll reconstruct the
// string one rune at a time, guaranteeing that we'll only write valid
// UTF-8 sequences into the resulting buffer.
//
// Any invalid string will grow at least a little larger as a result of
// this operation because we'll be replacing each invalid byte with
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
// the replacement character \uFFFD. 9 is a magic number giving room for
// three such expansions without any further allocation.
ret := make([]byte, 0, len(s)+9)
for {
// If the first byte in s is not the start of a valid UTF-8 sequence
// then the following will return utf8.RuneError, 1, where
// utf8.RuneError is the unicode replacement character.
r, advance := utf8.DecodeRuneInString(s)
if advance == 0 {
break
}
s = s[advance:]
ret = utf8.AppendRune(ret, r)
}
return string(ret)
}

// we have to say this next thing to get golint to stop yelling at us about the
// underscores in the function names. We want the function names to match
// actually-generated code, so it feels like fair play. It's just a shame we
Expand Down
58 changes: 58 additions & 0 deletions tfprotov6/internal/toproto/diagnostic_test.go
@@ -0,0 +1,58 @@
package toproto

import (
"testing"
)

func TestForceValidUTF8(t *testing.T) {
tests := []struct {
Input string
Want string
}{
{
"hello",
"hello",
},
{
"こんにちは",
"こんにちは",
},
{
"baffle", // NOTE: "ffl" is a single-character ligature
"baffle", // ligature is preserved exactly
},
{
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
"wé́́é́́é́́!", // diacritics are preserved exactly
},
{
"😸😾", // Astral-plane characters
"😸😾", // preserved exactly
},
{
"\xff\xff", // neither byte is valid UTF-8
"\ufffd\ufffd", // both are replaced by replacement character
},
{
"\xff\xff\xff\xff\xff", // more than three invalid bytes
"\ufffd\ufffd\ufffd\ufffd\ufffd", // still expanded even though it exceeds our initial slice capacity in the implementation
},
{
"t\xffe\xffst", // invalid bytes interleaved with other content
"t\ufffde\ufffdst", // the valid content is preserved
},
{
"\xffこんにちは\xffこんにちは", // invalid bytes interacting with multibyte sequences
"\ufffdこんにちは\ufffdこんにちは", // the valid content is preserved
},
}

for _, test := range tests {
t.Run(test.Input, func(t *testing.T) {
got := forceValidUTF8(test.Input)
if got != test.Want {
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
}
})
}
}