Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(bigquery/storage/managedwriter): enable field name indirection #6247

Merged
merged 30 commits into from Oct 4, 2022
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2b8c0fd
feat(bigquery/storage/managedwriter): expose preview annotation
shollyman Jun 23, 2022
5d62198
Merge branch 'main' into annotation-preview
shollyman Jul 14, 2022
5f771fb
update schema normalization, add unit test
shollyman Jul 14, 2022
5bd5e87
Merge branch 'main' into annotation-preview
shollyman Jul 15, 2022
eb7d1e4
add validation test for column_name annotation
shollyman Jul 15, 2022
229f408
Merge branch 'main' into annotation-preview
shollyman Jul 15, 2022
2da5fac
Merge branch 'main' into annotation-preview
shollyman Jul 27, 2022
3c9737c
Merge branch 'main' into annotation-preview
shollyman Jul 28, 2022
3638ed4
Merge branch 'main' into annotation-preview
shollyman Aug 23, 2022
70390f8
update expectations - no emojis, sadly
shollyman Aug 23, 2022
9db8548
Merge branch 'main' into annotation-preview
shollyman Aug 24, 2022
ffbae4e
additional error checking
shollyman Aug 31, 2022
4fe1e57
Merge branch 'main' into annotation-preview
shollyman Sep 7, 2022
65a7f2a
use quoted literals in validation builder, cleanup extraneous logging
shollyman Sep 9, 2022
9f1a948
Merge branch 'main' into annotation-preview
shollyman Sep 9, 2022
6a5d804
switch emojis to valid characters in conversion tests
shollyman Sep 9, 2022
74d998a
Merge branch 'main' into annotation-preview
shollyman Sep 14, 2022
a651315
switch from preview to stable annotation
shollyman Sep 14, 2022
458bb2c
Merge branch 'main' into annotation-preview
shollyman Sep 15, 2022
2659053
switch field name encoding for unicode conversion
shollyman Sep 15, 2022
125bfea
Merge branch 'main' into annotation-preview
shollyman Sep 16, 2022
199d01a
address reviewer feedback re: allowed characters
shollyman Sep 16, 2022
08288f0
Merge branch 'main' into annotation-preview
shollyman Sep 20, 2022
b8a8044
remove stale test
shollyman Sep 21, 2022
cf81868
Merge branch 'main' into annotation-preview
shollyman Sep 21, 2022
d6428bc
rely on name validation directly from protobuf
shollyman Sep 21, 2022
4a12d9c
more test cases
shollyman Sep 21, 2022
6558136
disable indirection validation test until all test projects have access
shollyman Sep 21, 2022
83c0e25
Merge branch 'main' into annotation-preview
shollyman Oct 4, 2022
a059390
enable test
shollyman Oct 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
78 changes: 50 additions & 28 deletions bigquery/storage/managedwriter/adapt/protoconversion.go
Expand Up @@ -286,46 +286,68 @@ func storageSchemaToDescriptorInternal(inSchema *storagepb.TableSchema, scope st
//
// Messages are always nullable, and repeated fields are as well.
func tableFieldSchemaToFieldDescriptorProto(field *storagepb.TableFieldSchema, idx int32, scope string, useProto3 bool) (*descriptorpb.FieldDescriptorProto, error) {

name := strings.ToLower(field.GetName())
var fdp *descriptorpb.FieldDescriptorProto

if field.GetType() == storagepb.TableFieldSchema_STRUCT {
return &descriptorpb.FieldDescriptorProto{
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
TypeName: proto.String(scope),
Label: convertModeToLabel(field.GetMode(), useProto3),
}, nil
}

// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp := &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}
// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
} else {
// For (REQUIRED||REPEATED) fields for proto3, or all cases for proto2, we can use the expected scalar types.
if field.GetMode() != storagepb.TableFieldSchema_NULLABLE || !useProto3 {
outType := bqTypeToFieldTypeMap[field.GetType()]
fdp = &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: outType.Enum(),
Label: convertModeToLabel(field.GetMode(), useProto3),
}

// Special case: proto2 repeated fields may benefit from using packed annotation.
if field.GetMode() == storagepb.TableFieldSchema_REPEATED && !useProto3 {
for _, v := range packedTypes {
if outType == v {
fdp.Options = &descriptorpb.FieldOptions{
Packed: proto.Bool(true),
}
break
}
break
}
}
} else {
// For NULLABLE proto3 fields, use a wrapper type.
fdp = &descriptorpb.FieldDescriptorProto{
shollyman marked this conversation as resolved.
Show resolved Hide resolved
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}
}
}
if nameRequiresAnnotation(name) {
// Use a prefix + base64 encoded name when annotations bear the actual name.
// Base 64 standard encoding may also contain certain characters (+,/,=) which
// we remove from the generated name.
encoded := strings.Trim(base64.StdEncoding.EncodeToString([]byte(name)), "+/=")
fdp.Name = proto.String(fmt.Sprintf("col_%s", encoded))
opts := fdp.GetOptions()
if opts == nil {
fdp.Options = &descriptorpb.FieldOptions{}
}
return fdp, nil
proto.SetExtension(fdp.Options, storagepb.E_ColumnName, name)
}
// For NULLABLE proto3 fields, use a wrapper type.
return &descriptorpb.FieldDescriptorProto{
Name: proto.String(name),
Number: proto.Int32(idx),
Type: descriptorpb.FieldDescriptorProto_TYPE_MESSAGE.Enum(),
TypeName: proto.String(bqTypeToWrapperMap[field.GetType()]),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum(),
}, nil
return fdp, nil
}

// nameRequiresAnnotation determines whether a field name requires unicode-annotation.
func nameRequiresAnnotation(in string) bool {
return !protoreflect.Name(in).IsValid()
shollyman marked this conversation as resolved.
Show resolved Hide resolved
}

// NormalizeDescriptor builds a self-contained DescriptorProto suitable for communicating schema
Expand Down
53 changes: 53 additions & 0 deletions bigquery/storage/managedwriter/adapt/protoconversion_test.go
Expand Up @@ -413,6 +413,59 @@ func TestSchemaToProtoConversion(t *testing.T) {
},
},
},
{
description: "indirect names",
bq: &storagepb.TableSchema{
Fields: []*storagepb.TableFieldSchema{
{Name: "foo", Type: storagepb.TableFieldSchema_STRING, Mode: storagepb.TableFieldSchema_NULLABLE},
{Name: "火", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_REQUIRED},
{Name: "水_addict", Type: storagepb.TableFieldSchema_BYTES, Mode: storagepb.TableFieldSchema_REPEATED},
{Name: "0col", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_NULLABLE},
{Name: "funny-name", Type: storagepb.TableFieldSchema_INT64, Mode: storagepb.TableFieldSchema_NULLABLE},
}},
wantProto2: func() *descriptorpb.DescriptorProto {
dp := &descriptorpb.DescriptorProto{
Name: proto.String("root"),
Field: []*descriptorpb.FieldDescriptorProto{
{
Name: proto.String("foo"),
Number: proto.Int32(1),
Type: descriptorpb.FieldDescriptorProto_TYPE_STRING.Enum(),
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
{
Name: proto.String("col_54Gr"),
Number: proto.Int32(2),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REQUIRED.Enum()},
{
Name: proto.String("col_5rC0X2FkZGljdA"),
Number: proto.Int32(3),
Type: descriptorpb.FieldDescriptorProto_TYPE_BYTES.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_REPEATED.Enum(),
},
{
Name: proto.String("col_MGNvbA"),
Number: proto.Int32(4),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
{
Name: proto.String("col_ZnVubnktbmFtZQ"),
Number: proto.Int32(5),
Type: descriptorpb.FieldDescriptorProto_TYPE_INT64.Enum(),
Options: &descriptorpb.FieldOptions{},
Label: descriptorpb.FieldDescriptorProto_LABEL_OPTIONAL.Enum()},
},
}
proto.SetExtension(dp.Field[1].Options, storagepb.E_ColumnName, "火")
proto.SetExtension(dp.Field[2].Options, storagepb.E_ColumnName, "水_addict")
proto.SetExtension(dp.Field[3].Options, storagepb.E_ColumnName, "0col")
proto.SetExtension(dp.Field[4].Options, storagepb.E_ColumnName, "funny-name")
return dp
}(),
},
}
for _, tc := range testCases {
// Proto2
Expand Down
15 changes: 15 additions & 0 deletions bigquery/storage/managedwriter/testdata/schemas.go
Expand Up @@ -257,4 +257,19 @@ var (
Repeated: true,
},
}

ValidationColumnAnnotations bigquery.Schema = bigquery.Schema{
{
Name: "first",
Type: bigquery.StringFieldType,
},
{
Name: "second",
Type: bigquery.StringFieldType,
},
{
Name: "特別コラム",
Type: bigquery.StringFieldType,
},
}
)