Skip to content

Commit

Permalink
Implement text encoding helpers on top of the new runtime (#679)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcodeIO committed Jun 21, 2019
1 parent 2d31692 commit b6feaab
Show file tree
Hide file tree
Showing 12 changed files with 9,766 additions and 3,730 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Expand Up @@ -2,3 +2,4 @@ bin/* text eol=lf
dist/* binary
scripts/*.sh eol=lf
lib/binaryen.js binary
tests/compiler/std/string-encoding.ts eol=lf
2 changes: 1 addition & 1 deletion src/tokenizer.ts
Expand Up @@ -1101,7 +1101,7 @@ export class Tokenizer extends DiagnosticEmitter {
start = this.pos;
continue;
}
if (isLineBreak(c)) {
if (isLineBreak(c) && quote != CharCode.BACKTICK) {
result += text.substring(start, this.pos);
this.error(
DiagnosticCode.Unterminated_string_literal,
Expand Down
30 changes: 24 additions & 6 deletions std/assembly/index.d.ts
Expand Up @@ -1221,15 +1221,11 @@ declare class FixedArray<T> {

/** Class representing a sequence of characters. */
declare class String {

static fromCharCode(ls: i32, hs?: i32): string;
static fromCharCodes(arr: u16[]): string;
static fromCodePoint(code: i32): string;
static fromCodePoints(arr: i32[]): string;

readonly length: i32;
readonly lengthUTF8: i32;

charAt(index: i32): string;
charCodeAt(index: i32): i32;
codePointAt(index: i32): i32;
Expand All @@ -1254,8 +1250,30 @@ declare class String {
slice(beginIndex: i32, endIndex?: i32): string;
split(separator?: string, limit?: i32): string[];
toString(): string;
static fromUTF8(ptr: usize, len: usize): string;
toUTF8(): usize;
}
/** Static encoding helpers exposed under `String.UTF8` and `String.UTF16`. */
declare namespace String {
/** Encoding helpers for UTF-8. */
export namespace UTF8 {
/**
 * Calculates the byte length of the specified string when encoded as UTF-8, optionally null terminated.
 * @param str String to measure.
 * @param nullTerminated If `true`, one extra byte is counted for the terminator and counting stops at the
 *   first NUL character of the string. Defaults to `false` in the implementation.
 * @returns Number of bytes the encoded form occupies.
 */
export function byteLength(str: string, nullTerminated?: bool): i32;
/**
 * Encodes the specified string to UTF-8 bytes, optionally null terminated.
 * @param str String to encode.
 * @param nullTerminated If `true`, encoding stops at the first NUL character and a zero byte is appended.
 * @returns A newly allocated buffer holding the encoded bytes.
 */
export function encode(str: string, nullTerminated?: bool): ArrayBuffer;
/**
 * Decodes the specified buffer from UTF-8 bytes to a string, optionally null terminated.
 * The buffer's full `byteLength` is offered to the decoder.
 * @param buf Buffer holding the UTF-8 bytes.
 * @param nullTerminated If `true`, decoding stops at the first zero byte.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer, nullTerminated?: bool): string;
/**
 * Decodes raw UTF-8 bytes to a string, optionally null terminated.
 * Marked `@unsafe` in the implementation because it reads directly from the given memory address.
 * @param buf Address of the first byte.
 * @param len Number of bytes available at `buf`.
 * @param nullTerminated If `true`, decoding stops at the first zero byte.
 * @returns The decoded string.
 */
export function decodeUnsafe(buf: usize, len: usize, nullTerminated?: bool): string;
}
/** Encoding helpers for UTF-16. */
export namespace UTF16 {
/**
 * Calculates the byte length of the specified string when encoded as UTF-16.
 * @param str String to measure.
 * @returns Number of bytes the encoded form occupies.
 */
export function byteLength(str: string): i32;
/**
 * Encodes the specified string to UTF-16 bytes.
 * @param str String to encode.
 * @returns A newly allocated buffer holding a copy of the string's UTF-16 data.
 */
export function encode(str: string): ArrayBuffer;
/**
 * Decodes the specified buffer from UTF-16 bytes to a string.
 * The buffer's full `byteLength` is offered to the decoder.
 * @param buf Buffer holding the UTF-16 bytes.
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer): string;
/**
 * Decodes raw UTF-16 bytes to a string.
 * Marked `@unsafe` in the implementation because it reads directly from the given memory address.
 * @param buf Address of the first byte.
 * @param len Number of bytes available at `buf`; an odd value is rounded down to a whole code unit.
 * @returns The decoded string.
 */
export function decodeUnsafe(buf: usize, len: usize): string;
}
}

/** Class for representing a runtime error. Base class of all errors. */
Expand Down
244 changes: 143 additions & 101 deletions std/assembly/string.ts
Expand Up @@ -512,121 +512,163 @@ import { idof } from "./builtins";
toString(): String {
return this;
}
}

get lengthUTF8(): i32 {
var len = 1; // null terminated
var pos: usize = 0;
var end = <usize>this.length;
while (pos < end) {
let c = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
if (c < 128) {
len += 1; ++pos;
} else if (c < 2048) {
len += 2; ++pos;
} else {
if (
(c & 0xFC00) == 0xD800 && pos + 1 < end &&
(<u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1)) & 0xFC00) == 0xDC00
) {
len += 4; pos += 2;
// @ts-ignore: nolib
export type string = String;

/**
 * Parses `str` as an integer and yields the result as an `f64`.
 * The actual work is delegated to `strtol<f64>`; `radix` is forwarded unchanged (default `0`).
 */
export function parseInt(str: string, radix: i32 = 0): f64 {
  var parsed: f64 = strtol<f64>(str, radix);
  return parsed;
}

/**
 * Parses `str` as a floating point number and yields the result as an `f64`.
 * The actual work is delegated to `strtod`.
 */
export function parseFloat(str: string): f64 {
  var parsed: f64 = strtod(str);
  return parsed;
}

// Encoding helpers
export namespace String {

export namespace UTF8 {

/**
 * Calculates the number of bytes `str` occupies when encoded as UTF-8.
 *
 * Per UTF-16 code unit: values below 128 count one byte, values below 2048 count two bytes,
 * a high surrogate (0xD800-0xDBFF) directly followed by a low surrogate (0xDC00-0xDFFF) counts
 * four bytes for the pair, and every other code unit (including an unpaired surrogate) counts
 * three bytes.
 *
 * @param str String to measure.
 * @param nullTerminated If `true`, one extra byte is counted for the terminator and counting
 *   stops at the first NUL code unit.
 * @returns The encoded length in bytes.
 */
export function byteLength(str: string, nullTerminated: bool = false): i32 {
  var strOff = changetype<usize>(str);
  // The block header in front of the string holds its payload size (`rtSize`), used here as a byte count.
  var strEnd = strOff + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
  var bufLen = nullTerminated ? 1 : 0; // reserve room for the terminator
  while (strOff < strEnd) {
    let c1 = <u32>load<u16>(strOff);
    if (c1 < 128) {
      if (nullTerminated && !c1) break; // an embedded NUL ends a null terminated string
      bufLen += 1; strOff += 2;
    } else if (c1 < 2048) {
      bufLen += 2; strOff += 2;
    } else {
      // Only look at the next code unit if it still lies inside the string.
      if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
        if ((<u32>load<u16>(strOff, 2) & 0xFC00) == 0xDC00) {
          strOff += 4; bufLen += 4; // surrogate pair consumes two code units
          continue;
        }
      }
      strOff += 2; bufLen += 3;
    }
  }
  return bufLen;
}
return len;
}

static fromUTF8(ptr: usize, len: usize): String {
if (len < 1) return changetype<String>("");
var ptrPos = <usize>0;
var buf = __alloc(<usize>len << 1, 0);
var bufPos = <usize>0;
while (ptrPos < len) {
let cp = <u32>load<u8>(ptr + ptrPos++);
if (cp < 128) {
store<u16>(buf + bufPos, cp);
bufPos += 2;
} else if (cp > 191 && cp < 224) {
assert(ptrPos + 1 <= len);
store<u16>(buf + bufPos, (cp & 31) << 6 | load<u8>(ptr + ptrPos++) & 63);
bufPos += 2;
} else if (cp > 239 && cp < 365) {
assert(ptrPos + 3 <= len);
cp = (
(cp & 7) << 18 |
(load<u8>(ptr + ptrPos++) & 63) << 12 |
(load<u8>(ptr + ptrPos++) & 63) << 6 |
load<u8>(ptr + ptrPos++) & 63
) - 0x10000;
store<u16>(buf + bufPos, 0xD800 + (cp >> 10));
bufPos += 2;
store<u16>(buf + bufPos, 0xDC00 + (cp & 1023));
bufPos += 2;

/**
 * Encodes `str` to UTF-8 and returns the bytes in a newly allocated `ArrayBuffer`.
 *
 * The output size is computed up front with `UTF8.byteLength`. Per UTF-16 code unit: values
 * below 128 produce one byte, values below 2048 two bytes, a high surrogate directly followed by
 * a low surrogate is combined into one code point and produces four bytes, and every other code
 * unit (including an unpaired surrogate) produces three bytes.
 *
 * @param str String to encode.
 * @param nullTerminated If `true`, encoding stops at the first NUL code unit and a zero byte is
 *   appended to the output.
 * @returns The buffer holding the encoded bytes.
 */
export function encode(str: string, nullTerminated: bool = false): ArrayBuffer {
  var strOff = changetype<usize>(str);
  // The block header in front of the string holds its payload size (`rtSize`), used here as a byte count.
  var strEnd = changetype<usize>(str) + <usize>changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
  var buf = __alloc(UTF8.byteLength(str, nullTerminated), idof<ArrayBuffer>());
  var bufOff = buf;
  while (strOff < strEnd) {
    let c1 = <u32>load<u16>(strOff);
    if (c1 < 128) {
      if (nullTerminated && !c1) break; // an embedded NUL ends a null terminated string
      store<u8>(bufOff, c1);
      bufOff += 1; strOff += 2;
    } else if (c1 < 2048) {
      store<u8>(bufOff, c1 >> 6 | 192);
      store<u8>(bufOff, c1 & 63 | 128, 1);
      bufOff += 2; strOff += 2;
    } else {
      // Only look at the next code unit if it still lies inside the string.
      if ((c1 & 0xFC00) == 0xD800 && strOff + 2 < strEnd) {
        let c2 = <u32>load<u16>(strOff, 2);
        if ((c2 & 0xFC00) == 0xDC00) {
          // Combine the surrogate pair into a single supplementary code point.
          c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
          store<u8>(bufOff, c1 >> 18 | 240);
          store<u8>(bufOff, c1 >> 12 & 63 | 128, 1);
          store<u8>(bufOff, c1 >> 6 & 63 | 128, 2);
          store<u8>(bufOff, c1 & 63 | 128, 3);
          strOff += 4; bufOff += 4;
          continue;
        }
      }
      store<u8>(bufOff, c1 >> 12 | 224);
      store<u8>(bufOff, c1 >> 6 & 63 | 128, 1);
      store<u8>(bufOff, c1 & 63 | 128, 2);
      strOff += 2; bufOff += 3;
    }
  }
  if (nullTerminated) {
    assert(strOff <= strEnd);
    // Remember the write position as an offset: `buf` is reassigned by `__realloc`, so the
    // absolute `bufOff` pointer must not be used afterwards.
    let used = bufOff - buf;
    buf = __realloc(buf, used + 1);
    store<u8>(buf + used, 0);
  } else {
    assert(strOff == strEnd);
  }
  return changetype<ArrayBuffer>(buf); // retains
}
assert(ptrPos == len);
var out = __alloc(bufPos, idof<String>());
memory.copy(out, buf, bufPos);
__free(buf);
return changetype<String>(out); // retains
}

toUTF8(): usize {
var buf = __alloc(<usize>this.lengthUTF8, 0);
var pos: usize = 0;
var end = <usize>this.length;
var off: usize = 0;
while (pos < end) {
let c1 = <u32>load<u16>(changetype<usize>(this) + (pos << 1));
if (c1 < 128) {
store<u8>(buf + off, c1);
++off; ++pos;
} else if (c1 < 2048) {
let ptr = buf + off;
store<u8>(ptr, c1 >> 6 | 192);
store<u8>(ptr, c1 & 63 | 128, 1);
off += 2; ++pos;
} else {
let ptr = buf + off;
if ((c1 & 0xFC00) == 0xD800 && pos + 1 < end) {
let c2 = <u32>load<u16>(changetype<usize>(this) + ((pos + 1) << 1));
if ((c2 & 0xFC00) == 0xDC00) {
c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
store<u8>(ptr, c1 >> 18 | 240);
store<u8>(ptr, c1 >> 12 & 63 | 128, 1);
store<u8>(ptr, c1 >> 6 & 63 | 128, 2);
store<u8>(ptr, c1 & 63 | 128, 3);
off += 4; pos += 2;
continue;
}
/**
 * Decodes the UTF-8 bytes held by `buf` to a string.
 * Thin wrapper that hands the buffer's address and full `byteLength` to `decodeUnsafe`.
 *
 * @param buf Buffer holding the UTF-8 bytes.
 * @param nullTerminated Forwarded unchanged to `decodeUnsafe` (default `false`).
 * @returns The decoded string.
 */
export function decode(buf: ArrayBuffer, nullTerminated: bool = false): string {
  var ptr = changetype<usize>(buf);
  var size = buf.byteLength;
  return decodeUnsafe(ptr, size, nullTerminated);
}

/**
 * Decodes `len` raw UTF-8 bytes starting at address `buf` to a string.
 *
 * Bytes below 128 are copied as-is, lead bytes 192-223 start a two-byte sequence, lead bytes
 * of 240 and above start a four-byte sequence that is emitted as a surrogate pair, and every
 * other byte is handled as the lead of a three-byte sequence. Continuation bytes are not
 * validated; only their low six bits are used. A sequence that would read past the end of
 * the input ends decoding early, and whatever was decoded so far is returned.
 *
 * @param buf Address of the first byte.
 * @param len Number of bytes available at `buf`.
 * @param nullTerminated If `true`, decoding stops at the first zero byte.
 * @returns The decoded string.
 */
// @ts-ignore: decorator
@unsafe
export function decodeUnsafe(buf: usize, len: usize, nullTerminated: bool = false): string {
  var bufOff = buf;
  var bufEnd = buf + len;
  assert(bufEnd >= bufOff); // guard wraparound
  var str = __alloc(len << 1, idof<string>()); // max is one u16 char per u8 byte
  var strOff = str;
  while (bufOff < bufEnd) {
    let cp = <u32>load<u8>(bufOff++);
    if (cp < 128) {
      if (nullTerminated && !cp) break;
      store<u16>(strOff, cp);
      strOff += 2;
    } else if (cp > 191 && cp < 224) {
      if (bufEnd - bufOff < 1) break; // truncated two-byte sequence
      store<u16>(strOff, (cp & 31) << 6 | load<u8>(bufOff++) & 63);
      strOff += 2;
    // NOTE(review): `cp` comes from a u8 load, so `cp < 365` is always true and bytes 248-255
    // also take this branch. The bound looks like a typo - confirm the intended value.
    } else if (cp > 239 && cp < 365) {
      if (bufEnd - bufOff < 3) break; // truncated four-byte sequence
      cp = (
        (cp & 7) << 18 |
        (load<u8>(bufOff) & 63) << 12 |
        (load<u8>(bufOff, 1) & 63) << 6 |
        load<u8>(bufOff, 2) & 63
      ) - 0x10000;
      bufOff += 3;
      // Split the supplementary code point into a high and a low surrogate.
      store<u16>(strOff, 0xD800 + (cp >> 10));
      store<u16>(strOff, 0xDC00 + (cp & 1023), 2);
      strOff += 4;
    } else {
      if (bufEnd - bufOff < 2) break; // truncated three-byte sequence
      store<u16>(strOff,
        (cp & 15) << 12 |
        (load<u8>(bufOff) & 63) << 6 |
        load<u8>(bufOff, 1) & 63
      );
      bufOff += 2; strOff += 2;
    }
  }
  // Shrink the worst-case allocation to the number of bytes actually written.
  return changetype<string>(__realloc(str, strOff - str)); // retains
}
store<u8>(buf + off, 0);
return buf;
}
}

// @ts-ignore: nolib
export type string = String;
/** Encoding helpers for UTF-16. The helpers copy the string data as-is, without transcoding. */
export namespace UTF16 {

  /**
   * Calculates the byte length of `str` when encoded as UTF-16.
   * Reads the payload size (`rtSize`) from the block header in front of the string.
   *
   * @param str String to measure.
   * @returns The encoded length in bytes.
   */
  export function byteLength(str: string): i32 {
    return changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
  }

  /**
   * Encodes `str` to UTF-16 bytes by copying its data into a newly allocated `ArrayBuffer`.
   *
   * @param str String to encode.
   * @returns The buffer holding the copied bytes.
   */
  export function encode(str: string): ArrayBuffer {
    var size = changetype<BLOCK>(changetype<usize>(str) - BLOCK_OVERHEAD).rtSize;
    var buf = __alloc(size, idof<ArrayBuffer>());
    memory.copy(buf, changetype<usize>(str), <usize>size);
    return changetype<ArrayBuffer>(buf); // retains
  }

  /**
   * Decodes the UTF-16 bytes held by `buf` to a string.
   * Thin wrapper that hands the buffer's address and full `byteLength` to `decodeUnsafe`.
   *
   * @param buf Buffer holding the UTF-16 bytes.
   * @returns The decoded string.
   */
  export function decode(buf: ArrayBuffer): string {
    return decodeUnsafe(changetype<usize>(buf), buf.byteLength);
  }

  /**
   * Decodes `len` raw UTF-16 bytes starting at address `buf` to a string.
   *
   * @param buf Address of the first byte.
   * @param len Number of bytes available at `buf`; the lowest bit is cleared, so an odd value
   *   is rounded down to a whole number of code units.
   * @returns A newly allocated string holding a copy of the bytes.
   */
  // @ts-ignore: decorator
  @unsafe
  export function decodeUnsafe(buf: usize, len: usize): string {
    var str = __alloc(len &= ~1, idof<string>());
    memory.copy(str, buf, len);
    return changetype<string>(str); // retains
  }
}
}
6 changes: 6 additions & 0 deletions tests/compiler/std/string-encoding.json
@@ -0,0 +1,6 @@
{
"asc_flags": [
"--runtime half",
"--use ASC_RTRACE=1"
]
}
3,711 changes: 3,711 additions & 0 deletions tests/compiler/std/string-encoding.optimized.wat

Large diffs are not rendered by default.

0 comments on commit b6feaab

Please sign in to comment.