Skip to content

Commit

Permalink
Tweak diagnostics with invalid UTF-8 so they can pass over the wire
Browse files Browse the repository at this point in the history
A correct provider should only ever return valid UTF-8 strings as the
diagnostic Summary or Detail, but since diagnostics tend to be describing
unexpected situations and are often derived from errors in downstream
libraries it's possible that a provider might incorrectly return incorrect
garbage as part of a diagnostic message.

The protobuf serializer rejects non-UTF8 strings with a generic message
that is unhelpful to end-users:
    string field contains invalid UTF-8

Here we make the compromise that it's better to make a best effort to
return a diagnostic that is probably only partially invalid so that the
end user has a chance of still getting some clue about what problem
occurred. The new helper functions here achieve that by replacing any
invalid bytes with a correctly-encoded version of the Unicode Replacement
Character, which will then allow the string to pass over the wire protocol
successfully and hopefully end up as an obviously-invalid character in
the CLI output or web UI that's rendering the diagnostics.

This does introduce some slight additional overhead when returning
responses, but it should be immaterial for any response that doesn't
include any diagnostics, relatively minor for responses that include
valid diagnostics, and only markedly expensive for a diagnostic string
with invalid bytes that will therefore need to be re-encoded on a
rune-by-rune basis.
  • Loading branch information
apparentlymart committed Nov 18, 2022
1 parent a476543 commit c55353b
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 4 deletions.
59 changes: 57 additions & 2 deletions tfprotov5/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
package toproto

import (
"unicode/utf8"

"github.com/hashicorp/terraform-plugin-go/tfprotov5"
"github.com/hashicorp/terraform-plugin-go/tfprotov5/internal/tfplugin5"
)

func Diagnostic(in *tfprotov5.Diagnostic) (*tfplugin5.Diagnostic, error) {
diag := &tfplugin5.Diagnostic{
Severity: Diagnostic_Severity(in.Severity),
Summary: in.Summary,
Detail: in.Detail,
Summary: forceValidUTF8(in.Summary),
Detail: forceValidUTF8(in.Detail),
}
if in.Attribute != nil {
attr, err := AttributePath(in.Attribute)
Expand Down Expand Up @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov5.Diagnostic) ([]*tfplugin5.Diagnostic, error) {
return diagnostics, nil
}

// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
// the Unicode Replacement Character (\uFFFD).
//
// The protobuf serialization library will reject invalid UTF-8 with an
// unhelpful error message:
//
// string field contains invalid UTF-8
//
// Passing a string result through this function makes invalid UTF-8 instead
// emerge as placeholder characters on the other side of the wire protocol,
// giving a better chance of still returning a partially-legible message
// instead of a generic character encoding error.
//
// This is intended for user-facing messages such as diagnostic summary and
// detail messages, where Terraform will just treat the value as opaque and
// it's ultimately up to the user and their terminal or web browser to
// interpret the result. Don't use this for strings that have machine-readable
// meaning.
func forceValidUTF8(s string) string {
// Most strings that pass through here will already be valid UTF-8 and
// utf8.ValidString has a fast path which will beat our rune-by-rune
// analysis below, so it's worth the cost of walking the string twice
// in the rarer invalid case.
if utf8.ValidString(s) {
return s
}

// If we get down here then we know there's at least one invalid UTF-8
// sequence in the string, so in this slow path we'll reconstruct the
// string one rune at a time, guaranteeing that we'll only write valid
// UTF-8 sequences into the resulting buffer.
//
// Any invalid string will grow at least a little larger as a result of
// this operation because we'll be replacing each invalid byte with
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
// the replacement character \uFFFD. 9 is a magic number giving room for
// three such expansions without any further allocation.
ret := make([]byte, 0, len(s)+9)
for {
// If the first byte in s is not the start of a valid UTF-8 sequence
// then the following will return utf8.RuneError, 1, where
// utf8.RuneError is the unicode replacement character.
r, advance := utf8.DecodeRuneInString(s)
if advance == 0 {
break
}
s = s[advance:]
ret = utf8.AppendRune(ret, r)
}
return string(ret)
}

// we have to say this next thing to get golint to stop yelling at us about the
// underscores in the function names. We want the function names to match
// actually-generated code, so it feels like fair play. It's just a shame we
Expand Down
46 changes: 46 additions & 0 deletions tfprotov5/internal/toproto/diagnostic_test.go
@@ -0,0 +1,46 @@
package toproto

import (
"testing"
)

func TestForceValidUTF8(t *testing.T) {
tests := []struct {
Input string
Want string
}{
{
"hello",
"hello",
},
{
"こんにちは",
"こんにちは",
},
{
"baffle", // NOTE: "ffl" is a single-character ligature
"baffle", // ligature is preserved exactly
},
{
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
"wé́́é́́é́́!", // diacritics are preserved exactly
},
{
"😸😾", // Astral-plane characters
"😸😾", // preserved exactly
},
{
"\xff\xff", // neither byte is valid UTF-8
"\ufffd\ufffd", // both are replaced by replacement character
},
}

for _, test := range tests {
t.Run(test.Input, func(t *testing.T) {
got := forceValidUTF8(test.Input)
if got != test.Want {
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
}
})
}
}
59 changes: 57 additions & 2 deletions tfprotov6/internal/toproto/diagnostic.go
@@ -1,15 +1,17 @@
package toproto

import (
"unicode/utf8"

"github.com/hashicorp/terraform-plugin-go/tfprotov6"
"github.com/hashicorp/terraform-plugin-go/tfprotov6/internal/tfplugin6"
)

func Diagnostic(in *tfprotov6.Diagnostic) (*tfplugin6.Diagnostic, error) {
diag := &tfplugin6.Diagnostic{
Severity: Diagnostic_Severity(in.Severity),
Summary: in.Summary,
Detail: in.Detail,
Summary: forceValidUTF8(in.Summary),
Detail: forceValidUTF8(in.Detail),
}
if in.Attribute != nil {
attr, err := AttributePath(in.Attribute)
Expand Down Expand Up @@ -41,6 +43,59 @@ func Diagnostics(in []*tfprotov6.Diagnostic) ([]*tfplugin6.Diagnostic, error) {
return diagnostics, nil
}

// forceValidUTF8 returns a string guaranteed to be valid UTF-8 even if the
// input isn't, by replacing any invalid bytes with a valid UTF-8 encoding of
// the Unicode Replacement Character (\uFFFD).
//
// The protobuf serialization library will reject invalid UTF-8 with an
// unhelpful error message:
//
// string field contains invalid UTF-8
//
// Passing a string result through this function makes invalid UTF-8 instead
// emerge as placeholder characters on the other side of the wire protocol,
// giving a better chance of still returning a partially-legible message
// instead of a generic character encoding error.
//
// This is intended for user-facing messages such as diagnostic summary and
// detail messages, where Terraform will just treat the value as opaque and
// it's ultimately up to the user and their terminal or web browser to
// interpret the result. Don't use this for strings that have machine-readable
// meaning.
func forceValidUTF8(s string) string {
// Most strings that pass through here will already be valid UTF-8 and
// utf8.ValidString has a fast path which will beat our rune-by-rune
// analysis below, so it's worth the cost of walking the string twice
// in the rarer invalid case.
if utf8.ValidString(s) {
return s
}

// If we get down here then we know there's at least one invalid UTF-8
// sequence in the string, so in this slow path we'll reconstruct the
// string one rune at a time, guaranteeing that we'll only write valid
// UTF-8 sequences into the resulting buffer.
//
// Any invalid string will grow at least a little larger as a result of
// this operation because we'll be replacing each invalid byte with
// the three-byte sequence \xEF\xBF\xBD, which is the UTF-8 encoding of
// the replacement character \uFFFD. 9 is a magic number giving room for
// three such expansions without any further allocation.
ret := make([]byte, 0, len(s)+9)
for {
// If the first byte in s is not the start of a valid UTF-8 sequence
// then the following will return utf8.RuneError, 1, where
// utf8.RuneError is the unicode replacement character.
r, advance := utf8.DecodeRuneInString(s)
if advance == 0 {
break
}
s = s[advance:]
ret = utf8.AppendRune(ret, r)
}
return string(ret)
}

// we have to say this next thing to get golint to stop yelling at us about the
// underscores in the function names. We want the function names to match
// actually-generated code, so it feels like fair play. It's just a shame we
Expand Down
46 changes: 46 additions & 0 deletions tfprotov6/internal/toproto/diagnostic_test.go
@@ -0,0 +1,46 @@
package toproto

import (
"testing"
)

func TestForceValidUTF8(t *testing.T) {
tests := []struct {
Input string
Want string
}{
{
"hello",
"hello",
},
{
"こんにちは",
"こんにちは",
},
{
"baffle", // NOTE: "ffl" is a single-character ligature
"baffle", // ligature is preserved exactly
},
{
"wé́́é́́é́́!", // NOTE: These "e" have multiple combining diacritics
"wé́́é́́é́́!", // diacritics are preserved exactly
},
{
"😸😾", // Astral-plane characters
"😸😾", // preserved exactly
},
{
"\xff\xff", // neither byte is valid UTF-8
"\ufffd\ufffd", // both are replaced by replacement character
},
}

for _, test := range tests {
t.Run(test.Input, func(t *testing.T) {
got := forceValidUTF8(test.Input)
if got != test.Want {
t.Errorf("wrong result\ngot: %q\nwant: %q", got, test.Want)
}
})
}
}

0 comments on commit c55353b

Please sign in to comment.