Skip to content

Commit

Permalink
Use TextDecoder API for decoding UTF-8 from binary data, see #184
Browse files Browse the repository at this point in the history
  • Loading branch information
timostamm committed Jan 3, 2022
1 parent 5e07925 commit e1cd360
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 40 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Expand Up @@ -6,6 +6,14 @@ New features:
- The new plugin option `add_pb_suffix` adds the suffix `_pb` to all file names, see #186.


Bug fixes:

- Use TextDecoder API for decoding UTF-8 from binary data, see #184.
We have been using protobuf.js' algorithm to decode UTF-8, but it has had [bugs](https://github.com/protobufjs/protobuf.js/pull/1486)
in the past. For best possible compatibility, we have switched to the TextDecoder API.
See [MANUAL](./MANUAL.md#utf-8-decoding) for details.


### v2.1.0

New features:
Expand Down
32 changes: 32 additions & 0 deletions MANUAL.md
Expand Up @@ -1120,6 +1120,38 @@ The `toBinary` method takes an optional second argument of type
Allows using a custom implementation to encode binary data.


#### UTF-8 decoding

JavaScript uses UTF-16 for strings, but protobuf uses UTF-8. In order
to serialize to and from binary data, protobuf-ts converts between the
encodings with the [TextEncoder / TextDecoder API](https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API).

Note that the protobuf [language guide](https://developers.google.com/protocol-buffers/docs/proto3#scalar) states:

> A string must always contain UTF-8 encoded or 7-bit ASCII text [...]

If the binary data contains a string that is not valid UTF-8, protobuf-ts
raises an error on decoding, because it creates its TextDecoder with the
option `fatal`. If you do not want that behaviour, use the `readerFactory`
option to pass your own TextDecoder instance.

As of January 2022, performance of TextDecoder on Node.js falls behind
Node.js' `Buffer`. In order to use `Buffer` to decode UTF-8, use the
`readerFactory` option:

```ts
const nodeBinaryReadOptions = {
    // Create a BinaryReader that decodes string fields with Node.js' Buffer
    // instead of the default TextDecoder.
    readerFactory: (bytes: Uint8Array) => new BinaryReader(bytes, {
        decode(input?: Uint8Array): string {
            if (!input) {
                return "";
            }
            // A type assertion does not turn a plain Uint8Array into a Buffer:
            // Uint8Array.prototype.toString() ignores the encoding argument and
            // returns comma-separated byte values. Wrap non-Buffer input in a
            // Buffer view over the same memory (no copy) before decoding.
            const buffer = Buffer.isBuffer(input)
                ? input
                : Buffer.from(input.buffer, input.byteOffset, input.byteLength);
            return buffer.toString("utf8");
        }
    })
};
MyMessage.fromBinary(bytes, nodeBinaryReadOptions);
```



#### Conformance

`protobuf-ts` strictly conforms to the protobuf spec. It passes all
Expand Down
86 changes: 51 additions & 35 deletions packages/benchmarks/perf.ts
Expand Up @@ -4,9 +4,14 @@ import {FileDescriptorSet as tsProtoType} from "./testees/ts-proto.default/.plug
import {FileDescriptorSet as googleProtobufType} from "google-protobuf/google/protobuf/descriptor_pb";
import {FileDescriptorSet as sizeType} from "./testees/protobuf-ts.size/.plugin-out/google/protobuf/descriptor";
import {FileDescriptorSet as speedType} from "./testees/protobuf-ts.speed/.plugin-out/google/protobuf/descriptor";
import {FileDescriptorSet as sizeBigintType} from "./testees/protobuf-ts.size-bigint/.plugin-out/google/protobuf/descriptor";
import {FileDescriptorSet as speedBigintType} from "./testees/protobuf-ts.speed-bigint/.plugin-out/google/protobuf/descriptor";
import {
FileDescriptorSet as sizeBigintType
} from "./testees/protobuf-ts.size-bigint/.plugin-out/google/protobuf/descriptor";
import {
FileDescriptorSet as speedBigintType
} from "./testees/protobuf-ts.speed-bigint/.plugin-out/google/protobuf/descriptor";
import * as protobufjsNamespace from "./testees/protobufjs/.plugin-out/descriptor"
import {BinaryReader} from "@protobuf-ts/runtime";

function bench(name: string, fn: () => void, durationSeconds = 5) {
let startTs = performance.now();
Expand Down Expand Up @@ -44,50 +49,61 @@ let speedJson = speedType.toJson(speedMessage);
let speedBigintMessage = speedBigintType.fromBinary(bytes);
let protobufjsMessage = protobufjsType.decode(new Uint8Array(bytes));
let protobufjsJson = protobufjsType.toObject(protobufjsMessage);
/**
 * Binary read options that make protobuf-ts decode string fields with
 * Node.js' `Buffer` instead of the default TextDecoder, used for the
 * "node/Buffer" benchmark variants.
 */
const nodeBinaryReadOptions = {
    readerFactory: (bytes: Uint8Array) => new BinaryReader(bytes, {
        decode(input?: Uint8Array): string {
            if (!input) {
                return "";
            }
            // Casting to Buffer is only a type assertion. If the input is a
            // plain Uint8Array, toString("utf8") would ignore the encoding and
            // return comma-separated byte values. Buffers are used as-is (fast
            // path); anything else is wrapped in a Buffer view over the same
            // memory, which does not copy the bytes.
            const buffer = Buffer.isBuffer(input)
                ? input
                : Buffer.from(input.buffer, input.byteOffset, input.byteLength);
            return buffer.toString("utf8");
        }
    })
};

console.log('### read binary');
bench('google-protobuf ', () => googleProtobufType.deserializeBinary(bytes));
bench('ts-proto ', () => tsProtoType.decode(bytes));
bench('protobuf-ts (speed) ', () => speedType.fromBinary(bytes));
bench('protobuf-ts (speed, bigint) ', () => speedBigintType.fromBinary(bytes));
bench('protobuf-ts (size) ', () => sizeType.fromBinary(bytes));
bench('protobuf-ts (size, bigint) ', () => sizeBigintType.fromBinary(bytes));
bench('protobufjs ', () => protobufjsType.decode(new Uint8Array(bytes)));
bench('google-protobuf ', () => googleProtobufType.deserializeBinary(bytes));
bench('ts-proto ', () => tsProtoType.decode(bytes));
bench('protobuf-ts (speed) ', () => speedType.fromBinary(bytes));
bench('protobuf-ts (speed, bigint) ', () => speedBigintType.fromBinary(bytes));
bench('protobuf-ts (size) ', () => sizeType.fromBinary(bytes));
bench('protobuf-ts (size, bigint) ', () => sizeBigintType.fromBinary(bytes));
bench('protobuf-ts (speed, node/Buffer) ', () => speedType.fromBinary(bytes, nodeBinaryReadOptions));
bench('protobuf-ts (speed, bigint, node/Buffer) ', () => speedBigintType.fromBinary(bytes, nodeBinaryReadOptions));
bench('protobuf-ts (size, node/Buffer) ', () => sizeType.fromBinary(bytes, nodeBinaryReadOptions));
bench('protobuf-ts (size, bigint, node/Buffer) ', () => sizeBigintType.fromBinary(bytes, nodeBinaryReadOptions));
bench('protobufjs ', () => protobufjsType.decode(new Uint8Array(bytes)));

console.log('### write binary');
bench('google-protobuf ', () => googleProtobufMessage.serializeBinary());
bench('ts-proto ', () => tsProtoType.encode(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.toBinary(speedMessage));
bench('protobuf-ts (speed, bigint) ', () => speedBigintType.toBinary(speedBigintMessage));
bench('protobuf-ts (size) ', () => sizeType.toBinary(sizeMessage));
bench('protobuf-ts (size, bigint) ', () => sizeBigintType.toBinary(sizeBigintMessage));
bench('protobufjs ', () => protobufjsType.encode(protobufjsMessage).finish());
bench('google-protobuf ', () => googleProtobufMessage.serializeBinary());
bench('ts-proto ', () => tsProtoType.encode(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.toBinary(speedMessage));
bench('protobuf-ts (speed, bigint) ', () => speedBigintType.toBinary(speedBigintMessage));
bench('protobuf-ts (size) ', () => sizeType.toBinary(sizeMessage));
bench('protobuf-ts (size, bigint) ', () => sizeBigintType.toBinary(sizeBigintMessage));
bench('protobufjs ', () => protobufjsType.encode(protobufjsMessage).finish());

console.log('### from partial');
bench('ts-proto ', () => tsProtoType.fromPartial(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.create(sizeMessage));
bench('protobuf-ts (size) ', () => sizeType.create(speedMessage));
bench('ts-proto ', () => tsProtoType.fromPartial(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.create(sizeMessage));
bench('protobuf-ts (size) ', () => sizeType.create(speedMessage));

console.log('### read json string');
bench('ts-proto ', () => tsProtoType.fromJSON(JSON.parse(tsProtoJsonString)));
bench('protobuf-ts (speed) ', () => speedType.fromJsonString(tsProtoJsonString));
bench('protobuf-ts (size) ', () => sizeType.fromJsonString(tsProtoJsonString));
bench('protobufjs ', () => protobufjsType.fromObject(JSON.parse(tsProtoJsonString)));
bench('ts-proto ', () => tsProtoType.fromJSON(JSON.parse(tsProtoJsonString)));
bench('protobuf-ts (speed) ', () => speedType.fromJsonString(tsProtoJsonString));
bench('protobuf-ts (size) ', () => sizeType.fromJsonString(tsProtoJsonString));
bench('protobufjs ', () => protobufjsType.fromObject(JSON.parse(tsProtoJsonString)));

console.log('### write json string');
bench('ts-proto ', () => JSON.stringify(tsProtoType.toJSON(tsProtoMessage)));
bench('protobuf-ts (speed) ', () => speedType.toJsonString(speedMessage));
bench('protobuf-ts (size) ', () => sizeType.toJsonString(sizeMessage));
bench('protobufjs ', () => JSON.stringify(protobufjsType.toObject(protobufjsMessage)));
bench('ts-proto ', () => JSON.stringify(tsProtoType.toJSON(tsProtoMessage)));
bench('protobuf-ts (speed) ', () => speedType.toJsonString(speedMessage));
bench('protobuf-ts (size) ', () => sizeType.toJsonString(sizeMessage));
bench('protobufjs ', () => JSON.stringify(protobufjsType.toObject(protobufjsMessage)));

console.log('### read json object');
bench('ts-proto ', () => tsProtoType.fromJSON(tsProtoJson));
bench('protobuf-ts (speed) ', () => speedType.fromJson(speedJson));
bench('protobuf-ts (size) ', () => sizeType.fromJson(sizeJson));
bench('protobufjs ', () => protobufjsType.fromObject(protobufjsJson));
bench('ts-proto ', () => tsProtoType.fromJSON(tsProtoJson));
bench('protobuf-ts (speed) ', () => speedType.fromJson(speedJson));
bench('protobuf-ts (size) ', () => sizeType.fromJson(sizeJson));
bench('protobufjs ', () => protobufjsType.fromObject(protobufjsJson));

console.log('### write json object');
bench('ts-proto ', () => tsProtoType.toJSON(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.toJson(speedMessage));
bench('protobuf-ts (size) ', () => sizeType.toJson(sizeMessage));
bench('protobufjs ', () => protobufjsType.toObject(protobufjsMessage));
bench('ts-proto ', () => tsProtoType.toJSON(tsProtoMessage));
bench('protobuf-ts (speed) ', () => speedType.toJson(speedMessage));
bench('protobuf-ts (size) ', () => sizeType.toJson(sizeMessage));
bench('protobufjs ', () => protobufjsType.toObject(protobufjsMessage));
15 changes: 12 additions & 3 deletions packages/runtime/src/binary-reader.ts
@@ -1,9 +1,14 @@
import type {IBinaryReader} from "./binary-format-contract";
import {WireType} from "./binary-format-contract";
import {PbLong, PbULong} from "./pb-long";
import {utf8read} from "./protobufjs-utf8";
import {varint32read, varint64read} from "./goog-varint";

/**
* TextDecoderLike is the subset of the TextDecoder API required by protobuf-ts.
*
* Any object with a compatible `decode` method can be passed to the
* BinaryReader constructor in place of a real TextDecoder.
*/
interface TextDecoderLike {
/**
* Converts the given bytes to a string. BinaryReader calls this with the
* raw bytes of a `string` field.
*/
decode(input?: Uint8Array): string;
}

export class BinaryReader implements IBinaryReader {

Expand All @@ -19,13 +24,17 @@ export class BinaryReader implements IBinaryReader {

private readonly buf: Uint8Array;
private readonly view: DataView;
private readonly textDecoder: TextDecoderLike;


constructor(buf: Uint8Array) {
constructor(buf: Uint8Array, textDecoder?: TextDecoderLike) {
this.buf = buf;
this.len = buf.length;
this.pos = 0;
this.view = new DataView(buf.buffer, buf.byteOffset, buf.byteLength);
this.textDecoder = textDecoder ?? new TextDecoder("utf-8", {
fatal: true
});
}


Expand Down Expand Up @@ -218,7 +227,7 @@ export class BinaryReader implements IBinaryReader {
* Read a `string` field, length-delimited data converted to UTF-8 text.
*/
string(): string {
return utf8read(this.bytes());
return this.textDecoder.decode(this.bytes());
}

}
Expand Down
11 changes: 9 additions & 2 deletions packages/runtime/src/binary-writer.ts
Expand Up @@ -5,6 +5,13 @@ import {varint32write, varint64write} from "./goog-varint";
import {assertFloat32, assertInt32, assertUInt32} from "./assert";


/**
* TextEncoderLike is the subset of the TextEncoder API required by protobuf-ts.
*
* Any object with a compatible `encode` method can be passed to the
* BinaryWriter constructor in place of a real TextEncoder.
*/
interface TextEncoderLike {
/**
* Converts the given string to bytes.
*/
encode(input?: string): Uint8Array;
}

export class BinaryWriter implements IBinaryWriter {


Expand Down Expand Up @@ -35,10 +42,10 @@ export class BinaryWriter implements IBinaryWriter {
/**
* Text encoder instance to convert UTF-8 to bytes.
*/
private readonly textEncoder: TextEncoder;
private readonly textEncoder: TextEncoderLike;


constructor(textEncoder?: TextEncoder) {
constructor(textEncoder?: TextEncoderLike) {
this.textEncoder = textEncoder ?? new TextEncoder();
this.chunks = [];
this.buf = [];
Expand Down
5 changes: 5 additions & 0 deletions packages/runtime/src/protobufjs-utf8.ts
Expand Up @@ -29,6 +29,11 @@
const fromCharCodes = (chunk: number[]) => String.fromCharCode.apply(String, chunk)

/**
* @deprecated This function will no longer be exported with the next major
* release, since protobuf-ts has switched to the TextDecoder API. If you need this
* function, please migrate to @protobufjs/utf8. For context, see
* https://github.com/timostamm/protobuf-ts/issues/184
*
* Reads UTF8 bytes as a string.
*
* See [protobufjs / utf8](https://github.com/protobufjs/protobuf.js/blob/9893e35b854621cce64af4bf6be2cff4fb892796/lib/utf8/index.js#L40)
Expand Down

0 comments on commit e1cd360

Please sign in to comment.