Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement text encoding helpers on top of the new runtime #679

Merged
merged 6 commits into from Jun 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Expand Up @@ -2,3 +2,4 @@ bin/* text eol=lf
dist/* binary
scripts/*.sh eol=lf
lib/binaryen.js binary
tests/compiler/std/string-encoding.ts eol=lf
2 changes: 1 addition & 1 deletion src/tokenizer.ts
Expand Up @@ -1101,7 +1101,7 @@ export class Tokenizer extends DiagnosticEmitter {
start = this.pos;
continue;
}
if (isLineBreak(c)) {
if (isLineBreak(c) && quote != CharCode.BACKTICK) {
result += text.substring(start, this.pos);
this.error(
DiagnosticCode.Unterminated_string_literal,
Expand Down
30 changes: 24 additions & 6 deletions std/assembly/index.d.ts
Expand Up @@ -1221,15 +1221,11 @@ declare class FixedArray<T> {

/** Class representing a sequence of characters. */
declare class String {

static fromCharCode(ls: i32, hs?: i32): string;
static fromCharCodes(arr: u16[]): string;
static fromCodePoint(code: i32): string;
static fromCodePoints(arr: i32[]): string;

readonly length: i32;
readonly lengthUTF8: i32;

charAt(index: i32): string;
charCodeAt(index: i32): i32;
codePointAt(index: i32): i32;
Expand All @@ -1254,8 +1250,30 @@ declare class String {
slice(beginIndex: i32, endIndex?: i32): string;
split(separator?: string, limit?: i32): string[];
toString(): string;
static fromUTF8(ptr: usize, len: usize): string;
toUTF8(): usize;
}
/** Encoding helper namespaces declared alongside the `String` class. */
declare namespace String {
/** Encoding helpers for UTF-8. */
export namespace UTF8 {
/**
 * Calculates the byte length of the specified string when encoded as UTF-8.
 * @param str String to measure.
 * @param nullTerminated If set, the length includes one terminating zero byte and
 *   counting stops at the first NUL character found in `str`. Optional.
 * @returns The number of bytes the UTF-8 encoding occupies.
 */
export function byteLength(str: string, nullTerminated?: bool): i32;
/**
 * Encodes the specified string to UTF-8 bytes.
 * @param str String to encode.
 * @param nullTerminated If set, encoding stops at the first NUL character in `str`
 *   and a terminating zero byte is appended. Optional.
 * @returns A new buffer holding the encoded bytes.
 */
export function encode(str: string, nullTerminated?: bool): ArrayBuffer;
/**
 * Decodes the specified buffer from UTF-8 bytes to a string.
 * @param buf Buffer holding the UTF-8 bytes; its full `byteLength` is considered.
 * @param nullTerminated If set, decoding stops at the first zero byte. Optional.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer, nullTerminated?: bool): string;
/**
 * Decodes raw UTF-8 bytes to a string.
 * @param buf Address of the first byte to decode.
 * @param len Number of bytes available at `buf`.
 * @param nullTerminated If set, decoding stops at the first zero byte. Optional.
 * @returns The decoded string.
 */
export function decodeUnsafe(buf: usize, len: usize, nullTerminated?: bool): string;
}
/** Encoding helpers for UTF-16. */
export namespace UTF16 {
/**
 * Calculates the byte length of the specified string when encoded as UTF-16.
 * @param str String to measure.
 * @returns The number of bytes the UTF-16 encoding occupies.
 */
export function byteLength(str: string): i32;
/**
 * Encodes the specified string to UTF-16 bytes.
 * @param str String to encode.
 * @returns A new buffer holding a copy of the string's UTF-16 data.
 */
export function encode(str: string): ArrayBuffer;
/**
 * Decodes the specified buffer from UTF-16 bytes to a string.
 * @param buf Buffer holding the UTF-16 bytes; its full `byteLength` is considered.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer): string;
/**
 * Decodes raw UTF-16 bytes to a string.
 * @param buf Address of the first byte to decode.
 * @param len Number of bytes available at `buf`; an odd value is rounded down to even.
 * @returns The decoded string.
 */
export function decodeUnsafe(buf: usize, len: usize): string;
}
}

/** Class for representing a runtime error. Base class of all errors. */
Expand Down
244 changes: 143 additions & 101 deletions std/assembly/string.ts
Expand Up @@ -512,121 +512,163 @@ import { idof } from "./builtins";
/** Returns this string instance itself; no copy is made. */
toString(): String {
return this;
}
}

get lengthUTF8(): i32 {
var len = 1; // null terminated
var pos: usize = 0;
var end = <usize>this.length;
while (pos < end) {
let c = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
if (c < 128) {
len += 1; ++pos;
} else if (c < 2048) {
len += 2; ++pos;
} else {
if (
(c & 0xFC00) == 0xD800 && pos + 1 < end &&
(<u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1)) & 0xFC00) == 0xDC00
) {
len += 4; pos += 2;
// @ts-ignore: nolib
export type string = String;

/**
 * Converts `str` to an `f64` by delegating to `strtol<f64>`.
 *
 * @param str Text to convert.
 * @param radix Radix forwarded to `strtol`; defaults to 0.
 *   NOTE(review): how `strtol` interprets a radix of 0 is not visible here.
 * @returns The value produced by `strtol<f64>`.
 */
export function parseInt(str: string, radix: i32 = 0): f64 {
  var parsed = strtol<f64>(str, radix);
  return parsed;
}

/**
 * Converts `str` to an `f64` by delegating to `strtod`.
 *
 * @param str Text to convert.
 * @returns The value produced by `strtod`.
 */
export function parseFloat(str: string): f64 {
  var parsed = strtod(str);
  return parsed;
}

// Encoding helpers
export namespace String {

export namespace UTF8 {

/**
 * Calculates the number of bytes `str` occupies when encoded as UTF-8.
 *
 * The string's UTF-16 code units are walked from its start up to the byte size
 * stored in the `rtSize` field of the runtime block header preceding it.
 *
 * @param str String to measure.
 * @param nullTerminated If true, one extra byte is counted for the terminator and
 *   counting stops at the first NUL code unit.
 * @returns The UTF-8 byte length.
 */
export function byteLength(str: string, nullTerminated: bool = false): i32 {
  var strOff = changetype<usize>(str);
  var strEnd = strOff + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
  var bufLen = nullTerminated ? 1 : 0; // reserve room for the terminator
  while (strOff < strEnd) {
    let c1 = <u32>load<u16>(strOff);
    if (c1 < 128) {
      // one byte; an embedded NUL ends a null terminated encoding
      if (nullTerminated && !c1) break;
      bufLen += 1; strOff += 2;
    } else if (c1 < 2048) {
      // two bytes
      bufLen += 2; strOff += 2;
    } else {
      // a high surrogate followed by a low surrogate forms one four-byte sequence
      if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
        if ((<u32>load<u16>(strOff, 2) & 0xFC00) == 0xDC00) {
          strOff += 4; bufLen += 4;
          continue;
        }
      }
      // everything else, including an unpaired surrogate, takes three bytes
      strOff += 2; bufLen += 3;
    }
  }
  return bufLen;
}
return len;
}

static fromUTF8(ptr: usize, len: usize): String {
if (len < 1) return changetype<String>("");
var ptrPos = <usize>0;
var buf = __alloc(<usize>len << 1, 0);
var bufPos = <usize>0;
while (ptrPos < len) {
let cp = <u32>load<u8>(ptr + ptrPos++);
if (cp < 128) {
store<u16>(buf + bufPos, cp);
bufPos += 2;
} else if (cp > 191 && cp < 224) {
assert(ptrPos + 1 <= len);
store<u16>(buf + bufPos, (cp & 31) << 6 | load<u8>(ptr + ptrPos++) & 63);
bufPos += 2;
} else if (cp > 239 && cp < 365) {
assert(ptrPos + 3 <= len);
cp = (
(cp & 7) << 18 |
(load<u8>(ptr + ptrPos++) & 63) << 12 |
(load<u8>(ptr + ptrPos++) & 63) << 6 |
load<u8>(ptr + ptrPos++) & 63
) - 0x10000;
store<u16>(buf + bufPos, 0xD800 + (cp >> 10));
bufPos += 2;
store<u16>(buf + bufPos, 0xDC00 + (cp & 1023));
bufPos += 2;

/**
 * Encodes `str` to a newly allocated buffer of UTF-8 bytes.
 *
 * The buffer is sized up front with `UTF8.byteLength(str, nullTerminated)`.
 * A high surrogate followed by a low surrogate is combined into one four-byte
 * sequence; any other code unit of 2048 or above, including an unpaired
 * surrogate, is written as three bytes.
 *
 * @param str String to encode.
 * @param nullTerminated If true, encoding stops at the first NUL code unit and a
 *   terminating zero byte is appended.
 * @returns The buffer holding the encoded bytes.
 */
export function encode(str: string, nullTerminated: bool = false): ArrayBuffer {
  var strOff = changetype<usize>(str);
  var strEnd = changetype<usize>(str) + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
  var buf = __alloc(UTF8.byteLength(str, nullTerminated), idof<ArrayBuffer>());
  var bufOff = buf;
  while (strOff < strEnd) {
    let c1 = <u32>load<u16>(strOff);
    if (c1 < 128) {
      // one byte; an embedded NUL ends a null terminated encoding
      if (nullTerminated && !c1) break;
      store<u8>(bufOff, c1);
      bufOff += 1; strOff += 2;
    } else if (c1 < 2048) {
      // two bytes
      store<u8>(bufOff, c1 >> 6 | 192);
      store<u8>(bufOff, c1 & 63 | 128, 1);
      bufOff += 2; strOff += 2;
    } else {
      if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
        let c2 = <u32>load<u16>(strOff, 2);
        if ((c2 & 0xFC00) == 0xDC00) {
          // surrogate pair: rebuild the code point and emit four bytes
          c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
          store<u8>(bufOff, c1 >> 18 | 240);
          store<u8>(bufOff, c1 >> 12 & 63 | 128, 1);
          store<u8>(bufOff, c1 >> 6 & 63 | 128, 2);
          store<u8>(bufOff, c1 & 63 | 128, 3);
          strOff += 4; bufOff += 4;
          continue;
        }
      }
      // three bytes
      store<u8>(bufOff, c1 >> 12 | 224);
      store<u8>(bufOff, c1 >> 6 & 63 | 128, 1);
      store<u8>(bufOff, c1 & 63 | 128, 2);
      strOff += 2; bufOff += 3;
    }
  }
  if (nullTerminated) {
    assert(strOff <= strEnd);
    // The reallocation may hand back a different address, so the terminator is
    // written relative to the returned base rather than through the old cursor.
    let written = bufOff - buf;
    buf = __realloc(buf, written + 1);
    store<u8>(buf + written, 0);
  } else {
    assert(strOff == strEnd);
  }
  return changetype<ArrayBuffer>(buf); // retains
}
assert(ptrPos == len);
var out = __alloc(bufPos, idof<String>());
memory.copy(out, buf, bufPos);
__free(buf);
return changetype<String>(out); // retains
}

toUTF8(): usize {
var buf = __alloc(<usize>this.lengthUTF8, 0);
var pos: usize = 0;
var end = <usize>this.length;
var off: usize = 0;
while (pos < end) {
let c1 = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
if (c1 < 128) {
store<u8>(buf + off, c1);
++off; ++pos;
} else if (c1 < 2048) {
let ptr = buf + off;
store<u8>(ptr, c1 >> 6 | 192);
store<u8>(ptr, c1 & 63 | 128, 1);
off += 2; ++pos;
} else {
let ptr = buf + off;
if ((c1 & 0xFC00) == 0xD800 && pos + 1 < end) {
let c2 = <u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1));
if ((c2 & 0xFC00) == 0xDC00) {
c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
store<u8>(ptr, c1 >> 18 | 240);
store<u8>(ptr, c1 >> 12 & 63 | 128, 1);
store<u8>(ptr, c1 >> 6 & 63 | 128, 2);
store<u8>(ptr, c1 & 63 | 128, 3);
off += 4; pos += 2;
continue;
}
/**
 * Decodes the UTF-8 bytes held by `buf` to a string.
 *
 * Forwards the buffer's address and its `byteLength` to `decodeUnsafe`.
 *
 * @param buf Buffer holding the UTF-8 bytes.
 * @param nullTerminated Forwarded to `decodeUnsafe` unchanged.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer, nullTerminated: bool = false): string {
  var ptr = changetype<usize>(buf);
  var size = buf.byteLength;
  return decodeUnsafe(ptr, size, nullTerminated);
}

/**
 * Decodes `len` raw UTF-8 bytes starting at address `buf` to a string.
 *
 * The output is allocated for the worst case of one UTF-16 code unit per input
 * byte and shrunk to the decoded size at the end. Input is not validated as
 * well-formed UTF-8: continuation bytes are only masked, and a lead byte that
 * matches no other branch is handled as the start of a three-byte sequence.
 * A multi-byte sequence cut off by the end of the input stops decoding;
 * everything decoded before it is kept.
 *
 * @param buf Address of the first byte.
 * @param len Number of bytes to consider.
 * @param nullTerminated If true, decoding stops at the first zero byte.
 * @returns The decoded string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeUnsafe(buf: usize, len: usize, nullTerminated: bool = false): string {
  var bufOff = buf;
  var bufEnd = buf + len;
  assert(bufEnd >= bufOff); // guard wraparound
  var str = __alloc(len << 1, idof<string>()); // max is one u16 char per u8 byte
  var strOff = str;
  while (bufOff < bufEnd) {
    let cp = <u32>load<u8>(bufOff++);
    if (cp < 128) {
      // single byte
      if (nullTerminated && !cp) break;
      store<u16>(strOff, cp);
      strOff += 2;
    } else if (cp > 191 && cp < 224) {
      // two-byte sequence
      if (bufEnd - bufOff < 1) break;
      store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
      strOff += 2;
    } else if (cp > 239) {
      // Four-byte sequence, emitted as a surrogate pair. The former upper bound
      // `cp < 365` could never fail for a byte value, so lead bytes 240..255 are
      // all taken here, exactly as before.
      if (bufEnd - bufOff < 3) break;
      cp = (
        (cp & 7) << 18 |
        (load<u8>(bufOff) & 63) << 12 |
        (load<u8>(bufOff, 1) & 63) << 6 |
        load<u8>(bufOff, 2) & 63
      ) - 0x10000;
      bufOff += 3;
      store<u16>(strOff, 0xD800 + (cp >> 10));
      store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
      strOff += 4;
    } else {
      // three-byte sequence (also reached by bytes 128..191 and 224..239)
      if (bufEnd - bufOff < 2) break;
      store<u16>(strOff,
        (cp & 15) << 12 |
        (load<u8>(bufOff) & 63) << 6 |
        load<u8>(bufOff, 1) & 63
      );
      bufOff += 2; strOff += 2;
    }
  }
  return changetype<string>(__realloc(str, strOff - str)); // retains
}
store<u8>(buf + off, 0);
return buf;
}
}

// @ts-ignore: nolib
export type string = String;
export namespace UTF16 {

export function parseInt(str: string, radix: i32 = 0): f64 {
return strtol<f64>(str, radix);
}
/**
 * Returns the byte length of `str` when encoded as UTF-16.
 *
 * The value is read from the `rtSize` field of the runtime block header that
 * precedes the string.
 *
 * @param str String to measure.
 * @returns The UTF-16 byte length.
 */
export function byteLength(str: string): i32 {
  var header = changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD);
  return header.rtSize;
}

export function parseFloat(str: string): f64 {
return strtod(str);
/**
 * Copies the UTF-16 data of `str` into a newly allocated buffer.
 *
 * @param str String to encode.
 * @returns A buffer of `rtSize` bytes holding a copy of the string data.
 */
export function encode(str: string): ArrayBuffer {
  var src = changetype<usize>(str);
  var byteSize = changetype<BLOCK>(src - BLOCK_OVERHEAD).rtSize;
  var dst = __alloc(byteSize, idof<ArrayBuffer>());
  memory.copy(dst, src, <usize>byteSize);
  return changetype<ArrayBuffer>(dst); // retains
}

/**
 * Decodes the UTF-16 bytes held by `buf` to a string.
 *
 * Forwards the buffer's address and its `byteLength` to `decodeUnsafe`.
 *
 * @param buf Buffer holding the UTF-16 bytes.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer): string {
  var ptr = changetype<usize>(buf);
  var size = buf.byteLength;
  return decodeUnsafe(ptr, size);
}

/**
 * Copies raw UTF-16 bytes starting at address `buf` into a new string.
 *
 * @param buf Address of the first byte.
 * @param len Number of bytes available; an odd value is rounded down to even so
 *   that only whole code units are copied.
 * @returns The new string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeUnsafe(buf: usize, len: usize): string {
  len &= ~1; // drop a trailing odd byte
  var out = __alloc(len, idof<string>());
  memory.copy(out, buf, len);
  return changetype<string>(out); // retains
}
}
}
6 changes: 6 additions & 0 deletions tests/compiler/std/string-encoding.json
@@ -0,0 +1,6 @@
{
"asc_flags": [
"--runtime half",
"--use ASC_RTRACE=1"
]
}
3,711 changes: 3,711 additions & 0 deletions tests/compiler/std/string-encoding.optimized.wat

Large diffs are not rendered by default.