Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(bigquery/storage/managedwriter): enable field name indirection #6247

Merged
merged 30 commits into from Oct 4, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2b8c0fd
feat(bigquery/storage/managedwriter): expose preview annotation
shollyman Jun 23, 2022
5d62198
Merge branch 'main' into annotation-preview
shollyman Jul 14, 2022
5f771fb
update schema normalization, add unit test
shollyman Jul 14, 2022
5bd5e87
Merge branch 'main' into annotation-preview
shollyman Jul 15, 2022
eb7d1e4
add validation test for column_name annotation
shollyman Jul 15, 2022
229f408
Merge branch 'main' into annotation-preview
shollyman Jul 15, 2022
2da5fac
Merge branch 'main' into annotation-preview
shollyman Jul 27, 2022
3c9737c
Merge branch 'main' into annotation-preview
shollyman Jul 28, 2022
3638ed4
Merge branch 'main' into annotation-preview
shollyman Aug 23, 2022
70390f8
update expectations - no emojis, sadly
shollyman Aug 23, 2022
9db8548
Merge branch 'main' into annotation-preview
shollyman Aug 24, 2022
ffbae4e
additional error checking
shollyman Aug 31, 2022
4fe1e57
Merge branch 'main' into annotation-preview
shollyman Sep 7, 2022
65a7f2a
use quoted literals in validation builder, cleanup extraneous logging
shollyman Sep 9, 2022
9f1a948
Merge branch 'main' into annotation-preview
shollyman Sep 9, 2022
6a5d804
switch emojis to valid characters in conversion tests
shollyman Sep 9, 2022
74d998a
Merge branch 'main' into annotation-preview
shollyman Sep 14, 2022
a651315
switch from preview to stable annotation
shollyman Sep 14, 2022
458bb2c
Merge branch 'main' into annotation-preview
shollyman Sep 15, 2022
2659053
switch field name encoding for unicode conversion
shollyman Sep 15, 2022
125bfea
Merge branch 'main' into annotation-preview
shollyman Sep 16, 2022
199d01a
address reviewer feedback re: allowed characters
shollyman Sep 16, 2022
08288f0
Merge branch 'main' into annotation-preview
shollyman Sep 20, 2022
b8a8044
remove stale test
shollyman Sep 21, 2022
cf81868
Merge branch 'main' into annotation-preview
shollyman Sep 21, 2022
d6428bc
rely on name validation directly from protobuf
shollyman Sep 21, 2022
4a12d9c
more test cases
shollyman Sep 21, 2022
6558136
disable indirection validation test until all test projects have access
shollyman Sep 21, 2022
83c0e25
Merge branch 'main' into annotation-preview
shollyman Oct 4, 2022
a059390
enable test
shollyman Oct 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
83 changes: 55 additions & 28 deletions bigquery/storage/managedwriter/adapt/protoconversion.go
Expand Up @@ -18,7 +18,9 @@ import (
"encoding/base64"
"fmt"
"strings"
"unicode"

"cloud.google.com/go/bigquery/storage/managedwriter/internal/annotations"
storagepb "google.golang.org/genproto/googleapis/cloud/bigquery/storage/v1"
"google.golang.org/protobuf/proto"
"google.golang.org/protobuf/reflect/protodesc"
Expand Down Expand Up @@ -286,46 +288,71 @@ func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope st
//
// Messages are always nullable, and repeated fields are as well.
func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, idx int32, scope string, useProto3 bool) (*descriptorpb.FieldDescriptorProto, error) {

name := strings.ToLower(field.GetName())
var fdp *descriptorpb.FieldDescriptorProto

if field.GetType() == storagepb.TableFieldSchema_STRUCT {
return &descriptorpb.FieldDescriptorProto{
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
TypeName: proto.String(scope),
Label: convertModeToLabel(field.GetMode(), useProto3),
}, nil
}

// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp := &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}
// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
} else {
// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}

// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
}
break
}
break
}
}
} else {
// For NULLABLE proto3 fields, use a wrapper type.
fdp = &descriptorpb.FieldDescriptorProto{
shollyman marked this conversation as resolved.
Show resolved Hide resolved
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}
}
return fdp, nil
}
// For NULLABLE proto3 fields, use a wrapper type.
return &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}, nil
if nameRequiresAnnotation(name) {
// TODO: need agreement across implementations on how to normalize?
// Possibly use golang.org/x/text/unicode/norm for ascii-fying?
fdp.Name = proto.String(fmt.Sprintf("unicode_field_%d", fdp.GetNumber()))
opts := fdp.GetOptions()
if opts == nil {
fdp.Options = &descriptorpb.FieldOptions{}
}
proto.SetExtension(fdp.Options, annotations.E_ColumnName, name)
}
return fdp, nil
}

// nameRequiresAnnotation reports whether a field name contains any byte
// outside the ASCII range, in which case the caller substitutes a synthetic
// field name and records the original via the column_name annotation.
//
// An empty name, or a name made up solely of ASCII bytes, returns false.
func nameRequiresAnnotation(in string) bool {
	// A bytewise scan is sufficient: in UTF-8 every byte of a multi-byte
	// sequence has its high bit set, so any non-ASCII rune (and any invalid
	// byte above 0x7F) is detected without decoding.
	for i := 0; i < len(in); i++ {
		if in[i] > unicode.MaxASCII {
			return true
		}
	}
	return false
}

// NormalizeDescriptor builds a self-contained DescriptorProto suitable for communicating schema
Expand Down
38 changes: 38 additions & 0 deletions bigquery/storage/managedwriter/adapt/protoconversion_test.go
Expand Up @@ -19,6 +19,7 @@ import (
"reflect"
"testing"

"cloud.google.com/go/bigquery/storage/managedwriter/internal/annotations"
"cloud.google.com/go/bigquery/storage/managedwriter/testdata"
"github.com/google/go-cmp/cmp"
storagepb "google.golang.org/genproto/googleapis/cloud/bigquery/storage/v1"
Expand Down Expand Up @@ -413,6 +414,43 @@ func TestSchemaToProtoConversion(t *testing.T) {
},
},
},
{
description: "unicode",
bq: &storagepb.TableSchema{
Fields: []*storagepb.TableFieldSchema{
{Name: "foo", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_NULLABLE},
{Name: "💩", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_REQUIRED},
{Name: "☕_addict", Type: storagepb.TableFieldSchema_BYTES, Mode: storagepb.TableFieldSchema_REPEATED},
}},
wantProto2: func() *descriptorpb.DescriptorProto {
dp := &descriptorpb.DescriptorProto{
Name: proto.String("root"),
Field: []*descriptorpb.FieldDescriptorProto{
{
Name: proto.String("foo"),
Number: proto.Int32(1),
Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
{
Name: proto.String("unicode_field_2"),
Number: proto.Int32(2),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REQUIRED.Enum()},
{
Name: proto.String("unicode_field_3"),
Number: proto.Int32(3),
Type: descriptorpb.FieldDescriptorProto_TYPE_BYTES.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum(),
},
},
}
proto.SetExtension(dp.Field[1].Options, annotations.E_ColumnName, "💩")
proto.SetExtension(dp.Field[2].Options, annotations.E_ColumnName, "☕_addict")
return dp
}(),
},
}
for _, tc := range testCases {
// Proto2
Expand Down
128 changes: 128 additions & 0 deletions bigquery/storage/managedwriter/internal/annotations/annotations.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

@@ -0,0 +1,40 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";
shollyman marked this conversation as resolved.
Show resolved Hide resolved

package google.cloud.bigquery.storage.v1;

import "google/protobuf/descriptor.proto";

option go_package = "cloud.google.com/go/bigquery/storage/managedwriter/internal/annotations;annotations";
option java_package = "com.google.cloud.bigquery.storage.v1";
option java_multiple_files = true;
option java_outer_classname = "AnnotationsProto";

extend google.protobuf.FieldOptions {
// Setting the column_name extension allows users to reference a
// BigQuery column independently of the field name in the protocol buffer
// message.
//
// The intended use of this annotation is to reference a destination column
// named using characters unavailable for protobuf field names (e.g. Unicode
// characters).
//
// More details about BigQuery naming limitations can be found here:
// https://cloud.google.com/bigquery/docs/schemas#column_names
//
// This extension is currently experimental.
optional string column_name = 454943157;
}
15 changes: 15 additions & 0 deletions bigquery/storage/managedwriter/testdata/schemas.go
Expand Up @@ -257,4 +257,19 @@ var (
Repeated: true,
},
}

// ValidationColumnAnnotations is a schema of three string columns: two with
// plain ASCII names ("first", "second") and one whose name is written in
// non-ASCII characters.
// NOTE(review): judging by the name, this presumably backs the validation
// test for the column_name annotation; confirm against the test that uses it.
ValidationColumnAnnotations bigquery.Schema = bigquery.Schema{
{
Name: "first",
Type: bigquery.StringFieldType,
},
{
Name: "second",
Type: bigquery.StringFieldType,
},
{
// Column name that cannot be expressed as a plain ASCII identifier.
Name: "特別コラム",
Type: bigquery.StringFieldType,
},
}
)