Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve text encoding API #564

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
194 changes: 194 additions & 0 deletions std/assembly/encoding.ts
@@ -0,0 +1,194 @@
import { ALLOCATE, REGISTER, REALLOCATE, MAX_BYTELENGTH } from "./runtime";
import { E_INVALIDLENGTH } from "./util/error";

/** Conversions between strings and raw UTF-16 byte buffers. */
export namespace UTF16 {

  /** Returns the number of bytes `str` occupies as UTF-16: two bytes per code unit. */
  export function length(str: string): i32 {
    var codeUnits = str.length;
    return codeUnits << 1;
  }

  /** Copies the code units of `str` into a freshly allocated buffer and returns it. */
  export function encode(str: string): ArrayBuffer {
    var byteLength = <usize>str.length << 1;
    var out = ALLOCATE(byteLength);
    memory.copy(out, changetype<usize>(str), byteLength);
    return REGISTER<ArrayBuffer>(out);
  }

  /** Builds a string from every byte of `buf` by delegating to `decodeRaw`. */
  export function decode(buf: ArrayBuffer): string {
    var ptr = changetype<usize>(buf);
    return decodeRaw(ptr, buf.byteLength);
  }

  /**
   * Builds a string by copying `len` bytes starting at the raw address `buf`.
   * Unsafe: `buf` is not validated in any way.
   * Throws a RangeError if `len`, reinterpreted as unsigned, exceeds MAX_BYTELENGTH
   * (this also rejects negative lengths).
   */
  // @ts-ignore: decorator
  @unsafe
  export function decodeRaw(buf: usize, len: i32): string {
    var byteLength = <usize>len;
    if (byteLength > <usize>MAX_BYTELENGTH) throw new RangeError(E_INVALIDLENGTH);
    var out = ALLOCATE(byteLength);
    memory.copy(out, buf, byteLength);
    return REGISTER<string>(out);
  }
}

/** UTF8 encoding. */
export namespace UTF8 {

/**
 * Computes how many bytes `str` needs once encoded as UTF-8.
 * When `delimited` is true, one extra byte is counted for the terminator.
 */
export function length(str: string, delimited: bool = false): i32 {
  var ptr = changetype<usize>(str);
  var end = ptr + (<usize>str.length << 1);
  var total = delimited ? 1 : 0;
  while (ptr < end) {
    let unit = <u32>load<u16>(ptr);
    ptr += 2; // ptr now addresses the following code unit
    if (unit < 128) {
      total += 1;
    } else if (unit < 2048) {
      total += 2;
    } else if (
      (unit & 0xFC00) == 0xD800 && ptr < end &&
      (<u32>load<u16>(ptr) & 0xFC00) == 0xDC00
    ) {
      // high surrogate followed by a low surrogate: one 4-byte sequence
      ptr += 2;
      total += 4;
    } else {
      // any other unit, including an unpaired surrogate, takes 3 bytes
      total += 3;
    }
  }
  return total;
}

/**
 * Encodes `str` into a newly allocated UTF-8 buffer sized by `length(str, delimited)`.
 * When `delimited` is true, a single zero byte is written after the encoded data.
 */
export function encode(str: string, delimited: bool = false): ArrayBuffer {
  var src = changetype<usize>(str);
  var srcEnd = src + (<usize>str.length << 1);
  var out = ALLOCATE(<usize>length(str, delimited));
  var dst = changetype<usize>(out);
  while (src < srcEnd) {
    let cp = <u32>load<u16>(src);
    src += 2; // src now addresses the following code unit
    if (cp < 128) {
      store<u8>(dst, cp);
      dst += 1;
      continue;
    }
    if (cp < 2048) {
      store<u8>(dst, cp >> 6 | 192);
      store<u8>(dst, cp & 63 | 128, 1);
      dst += 2;
      continue;
    }
    if ((cp & 0xFC00) == 0xD800 && src < srcEnd) {
      let low = <u32>load<u16>(src);
      if ((low & 0xFC00) == 0xDC00) {
        // combine the surrogate pair into one code point and emit 4 bytes
        cp = 0x10000 + ((cp & 0x03FF) << 10) + (low & 0x03FF);
        store<u8>(dst, cp >> 18 | 240);
        store<u8>(dst, cp >> 12 & 63 | 128, 1);
        store<u8>(dst, cp >> 6 & 63 | 128, 2);
        store<u8>(dst, cp & 63 | 128, 3);
        src += 2;
        dst += 4;
        continue;
      }
    }
    // 3-byte form; also used for a surrogate that has no matching partner
    store<u8>(dst, cp >> 12 | 224);
    store<u8>(dst, cp >> 6 & 63 | 128, 1);
    store<u8>(dst, cp & 63 | 128, 2);
    dst += 3;
  }
  assert(src == srcEnd);
  if (delimited) store<u8>(dst, 0);
  return REGISTER<ArrayBuffer>(out);
}

/**
 * Decodes the bytes of `buf` as UTF-8. With `delimited` set, decoding goes through
 * `decodeRawDelimited` (stops at the first zero byte); otherwise through `decodeRaw`.
 */
export function decode(buf: ArrayBuffer, delimited: bool = false): string {
  var ptr = changetype<usize>(buf);
  var len = buf.byteLength;
  if (delimited) return decodeRawDelimited(ptr, len);
  return decodeRaw(ptr, len);
}

/**
 * Decodes `len` bytes of UTF-8 starting at the raw address `buf` into a new string.
 * Unsafe: `buf` is not validated, so the caller must guarantee that `len` bytes are readable.
 * The output is allocated for the worst case (`len << 1` bytes) and shrunk to the bytes
 * actually written before being returned.
 * A multi-byte sequence cut off by the end of the input stops decoding; everything decoded
 * up to that point is returned.
 */
// @ts-ignore: decorator
@unsafe
export function decodeRaw(buf: usize, len: i32): string {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this could be internal because decode already provide more general interface. WDYT?

Copy link
Member Author

@dcodeIO dcodeIO Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea is that someone who gets just a str and a str_len from C has a way to make a string from it, as the higher level decode function wants an ArrayBuffer which must have a runtime header with payloadLength.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So may be makes sense add optional len to decode as well?

Copy link
Member Author

@dcodeIO dcodeIO Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That'd look like UTF8.decode(changetype<ArrayBuffer>(str), false, str_len) then. Not sure, feels somewhat wrong and would do different things depending of whether str_len is given or not. For instance, the function does not know whether it is called with an actual ArrayBuffer or not, so it doesn't know when it can length-check and when it can't.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UTF8.decode(changetype<ArrayBuffer>(str), 0, true) ?

Copy link
Member Author

@dcodeIO dcodeIO Mar 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually had something similar in mind initially, but gave up on it when I wasn't able to min(len, buf.byteLength) because buf could be a c pointer. Essentially, if len is given, it must not check byteLength because buf might be something unsafe, in turn making the entire function unsafe when specifying buf as an ArrayBuffer plus len. Hence I made a dedicated unsafe function.

  var bufOff = buf;
  var bufEnd = buf + <usize>len;
  var str = ALLOCATE(<usize>len << 1); // max is one u16 char per u8 byte
  var strOff = str;
  while (bufOff < bufEnd) {
    let cp = <u32>load<u8>(bufOff++);
    if (cp < 128) {
      store<u16>(strOff, cp);
      strOff += 2;
    } else if (cp > 191 && cp < 224) {
      // 2-byte sequence: stop instead of reading the continuation byte past bufEnd
      if (bufOff >= bufEnd) break;
      store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
      strOff += 2;
    } else if (cp > 239 && cp < 365) {
      // NOTE: cp comes from a u8 load, so the `< 365` bound never excludes anything
      // 4-byte sequence: needs 3 continuation bytes. Without this guard a lone lead byte
      // would also write 4 bytes of output for 1 byte of input and overrun `str`.
      if (bufOff + 3 > bufEnd) break;
      cp = (
        (cp & 7) << 18 |
        (load<u8>(bufOff) & 63) << 12 |
        (load<u8>(bufOff, 1) & 63) << 6 |
        load<u8>(bufOff, 2) & 63
      ) - 0x10000;
      bufOff += 3;
      store<u16>(strOff, 0xD800 + (cp >> 10));
      store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
      strOff += 4;
    } else {
      // 3-byte sequence: needs 2 continuation bytes
      if (bufOff + 2 > bufEnd) break;
      store<u16>(strOff,
        (cp & 15) << 12 |
        (load<u8>(bufOff) & 63) << 6 |
        load<u8>(bufOff, 1) & 63
      );
      bufOff += 2; strOff += 2;
    }
  }
  return REGISTER<string>(REALLOCATE(str, strOff - str));
}

/**
 * Decodes zero-terminated UTF-8 starting at the raw address `buf` into a new string.
 * Unsafe: `buf` is not validated by this function.
 * Decoding stops at the first zero byte, once `maxLen` bytes have been consumed, or when a
 * multi-byte sequence would extend past that limit, whichever comes first.
 * A `maxLen` of 0 yields an empty string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeRawDelimited(buf: usize, maxLen: i32 = MAX_BYTELENGTH): string {
  var bufOff = buf;
  var bufLim = buf + <usize>maxLen;
  // guard wraparound; equality is allowed because an empty input (maxLen == 0) is valid
  assert(bufLim >= bufOff);
  var str = ALLOCATE(16); // optimize for small strings
  var strLen = <usize>0;
  while (bufOff < bufLim) {
    let cp = <u32>load<u8>(bufOff++);
    if (cp < 128) {
      if (!cp) break; // terminator reached
      str = REALLOCATE(str, strLen + 2);
      store<u16>(str + strLen, cp);
      strLen += 2;
    } else if (cp > 191 && cp < 224) {
      if (bufOff >= bufLim) break; // truncated 2-byte sequence
      str = REALLOCATE(str, strLen + 2);
      store<u16>(str + strLen, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
      strLen += 2;
    } else if (cp > 239 && cp < 365) {
      if (bufOff + 3 > bufLim) break; // truncated 4-byte sequence
      cp = (
        (cp & 7) << 18 |
        (load<u8>(bufOff) & 63) << 12 |
        (load<u8>(bufOff, 1) & 63) << 6 |
        load<u8>(bufOff, 2) & 63
      ) - 0x10000;
      bufOff += 3;
      str = REALLOCATE(str, strLen + 4);
      let strOff = str + strLen;
      // written as a surrogate pair
      store<u16>(strOff, 0xD800 + (cp >> 10));
      store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
      strLen += 4;
    } else {
      if (bufOff + 2 > bufLim) break; // truncated 3-byte sequence
      str = REALLOCATE(str, strLen + 2);
      store<u16>(str + strLen,
        (cp & 15) << 12 |
        (load<u8>(bufOff) & 63) << 6 |
        load<u8>(bufOff, 1) & 63
      );
      bufOff += 2; strLen += 2;
    }
  }
  return REGISTER<string>(REALLOCATE(str, strLen));
}
}
18 changes: 15 additions & 3 deletions std/assembly/index.d.ts
Expand Up @@ -1230,7 +1230,6 @@ declare class String {
static fromCodePoints(arr: i32[]): string;

readonly length: i32;
readonly lengthUTF8: i32;

charAt(index: u32): string;
charCodeAt(index: u32): u16;
Expand All @@ -1253,8 +1252,21 @@ declare class String {
slice(beginIndex: i32, endIndex?: i32): string;
split(separator?: string, limit?: i32): string[];
toString(): string;
static fromUTF8(ptr: usize, len: usize): string;
toUTF8(): usize;
}

/** UTF16 encoding. */
declare namespace UTF16 {
  /** Calculates the byte length of a string when encoded as an UTF16 buffer. */
  export function length(str: string): i32;
  /** Encodes a string as an UTF16 buffer. */
  export function encode(str: string): ArrayBuffer;
  /** Decodes an UTF16 buffer to a string. The implementation takes the buffer only; there is no `delimited` mode. */
  export function decode(buf: ArrayBuffer): string;
  /** Decodes `len` bytes of UTF16 data located at the raw address `buf` to a string. */
  export function decodeRaw(buf: usize, len: i32): string; // unsafe
}

/** UTF8 encoding. */
declare namespace UTF8 {
/** Calculates the byte length of a string when encoded as an UTF8 buffer; `delimited` counts one extra byte. */
export function length(str: string, delimited?: bool): i32;
/** Encodes a string as an UTF8 buffer; `delimited` appends a zero byte. */
export function encode(str: string, delimited?: bool): ArrayBuffer;
/** Decodes an UTF8 buffer to a string; with `delimited`, decoding stops at the first zero byte. */
export function decode(buf: ArrayBuffer, delimited?: bool): string;
/** Decodes `len` bytes of UTF8 data located at the raw address `buf` to a string. */
export function decodeRaw(buf: usize, len: i32): string; // unsafe
/** Decodes zero-terminated UTF8 data located at the raw address `buf`, reading at most `maxLen` bytes. */
export function decodeRawDelimited(buf: usize, maxLen?: i32): string; // unsafe
}

/** Class for representing a runtime error. Base class of all errors. */
Expand Down
107 changes: 1 addition & 106 deletions std/assembly/string.ts
Expand Up @@ -4,6 +4,7 @@ import { ALLOCATE, REGISTER, HEADER, HEADER_SIZE, MAKEARRAY, ArrayBufferView } f
import { MAX_SIZE_32 } from "./util/allocator";
import { compareImpl, parse, CharCode, isWhiteSpaceOrLineTerminator } from "./util/string";
import { E_INVALIDLENGTH } from "./util/error";
import { UTF8 } from "./encoding";

@sealed export abstract class String {

Expand Down Expand Up @@ -408,112 +409,6 @@ import { E_INVALIDLENGTH } from "./util/error";
/** Returns this string instance itself. */
toString(): String {
return this;
}

/** Number of bytes this string needs as UTF-8, including one byte for a trailing zero terminator. */
get lengthUTF8(): i32 {
  var cur = changetype<usize>(this);
  var stop = cur + (<usize>this.length << 1);
  var bytes = 1; // null terminated
  while (cur < stop) {
    let unit = <u32>load<u16>(cur);
    cur += 2; // cur now addresses the following code unit
    if (unit < 128) {
      bytes += 1;
    } else if (unit < 2048) {
      bytes += 2;
    } else if (
      (unit & 0xFC00) == 0xD800 && cur < stop &&
      (<u32>load<u16>(cur) & 0xFC00) == 0xDC00
    ) {
      // surrogate pair: both units become a single 4-byte sequence
      bytes += 4;
      cur += 2;
    } else {
      bytes += 3;
    }
  }
  return bytes;
}

/**
 * Creates a string from `len` bytes of UTF-8 located at the raw address `ptr`.
 * Returns the empty string when `len` is zero. Asserts that no multi-byte sequence
 * extends past `len` and that exactly `len` bytes were consumed.
 */
static fromUTF8(ptr: usize, len: usize): string {
  if (len < 1) return changetype<string>("");
  // scratch space for the worst case of one u16 per input byte
  var scratch = memory.allocate(<usize>len << 1);
  var inPos = <usize>0;
  var outPos = <usize>0;
  while (inPos < len) {
    let cp = <u32>load<u8>(ptr + inPos);
    inPos += 1;
    let dst = scratch + outPos;
    if (cp < 128) {
      store<u16>(dst, cp);
      outPos += 2;
    } else if (cp > 191 && cp < 224) {
      assert(inPos + 1 <= len);
      let b1 = <u32>load<u8>(ptr + inPos);
      inPos += 1;
      store<u16>(dst, (cp & 31) << 6 | b1 & 63);
      outPos += 2;
    } else if (cp > 239 && cp < 365) {
      assert(inPos + 3 <= len);
      let src = ptr + inPos;
      let b1 = <u32>load<u8>(src);
      let b2 = <u32>load<u8>(src, 1);
      let b3 = <u32>load<u8>(src, 2);
      inPos += 3;
      cp = ((cp & 7) << 18 | (b1 & 63) << 12 | (b2 & 63) << 6 | b3 & 63) - 0x10000;
      // emitted as a surrogate pair
      store<u16>(dst, 0xD800 + (cp >> 10));
      store<u16>(dst, 0xDC00 + (cp & 1023), 2);
      outPos += 4;
    } else {
      assert(inPos + 2 <= len);
      let src = ptr + inPos;
      let b1 = <u32>load<u8>(src);
      let b2 = <u32>load<u8>(src, 1);
      inPos += 2;
      store<u16>(dst, (cp & 15) << 12 | (b1 & 63) << 6 | b2 & 63);
      outPos += 2;
    }
  }
  assert(inPos == len);
  // copy the decoded units into a right-sized allocation, then release the scratch space
  var result = ALLOCATE(outPos);
  memory.copy(changetype<usize>(result), scratch, outPos);
  memory.free(scratch);
  return REGISTER<string>(result);
}

/**
 * Encodes this string as zero-terminated UTF-8 into memory obtained from `memory.allocate`
 * (sized by `lengthUTF8`) and returns the address of the first byte.
 */
toUTF8(): usize {
  var out = memory.allocate(<usize>this.lengthUTF8);
  var src = changetype<usize>(this);
  var srcEnd = src + (<usize>this.length << 1);
  var dst = out;
  while (src < srcEnd) {
    let cp = <u32>load<u16>(src);
    src += 2; // src now addresses the following code unit
    if (cp < 128) {
      store<u8>(dst, cp);
      dst += 1;
      continue;
    }
    if (cp < 2048) {
      store<u8>(dst, cp >> 6 | 192);
      store<u8>(dst, cp & 63 | 128, 1);
      dst += 2;
      continue;
    }
    if ((cp & 0xFC00) == 0xD800 && src < srcEnd) {
      let low = <u32>load<u16>(src);
      if ((low & 0xFC00) == 0xDC00) {
        // combine the surrogate pair into one code point and emit 4 bytes
        cp = 0x10000 + ((cp & 0x03FF) << 10) + (low & 0x03FF);
        store<u8>(dst, cp >> 18 | 240);
        store<u8>(dst, cp >> 12 & 63 | 128, 1);
        store<u8>(dst, cp >> 6 & 63 | 128, 2);
        store<u8>(dst, cp & 63 | 128, 3);
        src += 2;
        dst += 4;
        continue;
      }
    }
    // 3-byte form; also used for a surrogate that has no matching partner
    store<u8>(dst, cp >> 12 | 224);
    store<u8>(dst, cp >> 6 & 63 | 128, 1);
    store<u8>(dst, cp & 63 | 128, 2);
    dst += 3;
  }
  store<u8>(dst, 0);
  return out;
}
}

// @ts-ignore: nolib
Expand Down
2 changes: 1 addition & 1 deletion tests/compiler/number.optimized.wat
Expand Up @@ -2358,7 +2358,7 @@
if
i32.const 0
i32.const 1648
i32.const 189
i32.const 190
i32.const 4
call $~lib/env/abort
unreachable
Expand Down
2 changes: 1 addition & 1 deletion tests/compiler/number.untouched.wat
Expand Up @@ -3424,7 +3424,7 @@
if
i32.const 0
i32.const 1648
i32.const 189
i32.const 190
i32.const 4
call $~lib/env/abort
unreachable
Expand Down
2 changes: 1 addition & 1 deletion tests/compiler/std/array-access.optimized.wat
Expand Up @@ -142,7 +142,7 @@
if
i32.const 0
i32.const 64
i32.const 164
i32.const 165
i32.const 4
call $~lib/env/abort
unreachable
Expand Down
2 changes: 1 addition & 1 deletion tests/compiler/std/array-access.untouched.wat
Expand Up @@ -219,7 +219,7 @@
if
i32.const 0
i32.const 64
i32.const 164
i32.const 165
i32.const 4
call $~lib/env/abort
unreachable
Expand Down
2 changes: 1 addition & 1 deletion tests/compiler/std/array.optimized.wat
Expand Up @@ -6229,7 +6229,7 @@
if
i32.const 0
i32.const 4376
i32.const 189
i32.const 190
i32.const 4
call $~lib/env/abort
unreachable
Expand Down