Faster tokenizer lookahead (#13341)
* refactor: simplify token context structure

* add benchmark

* perf: return a sub-state on tokenizer lookahead

* Update packages/babel-parser/src/tokenizer/index.js

Co-authored-by: Brian Ng <bng412@gmail.com>

* Update packages/babel-parser/src/tokenizer/index.js

Co-authored-by: Brian Ng <bng412@gmail.com>

* remove irrelevant comment

* fix: guard curPosition with isLookahead

* add test cases

Co-authored-by: Brian Ng <bng412@gmail.com>
JLHwung and existentialism committed May 26, 2021
1 parent b1f57e5 commit acf2a10
Showing 10 changed files with 684 additions and 29 deletions.
@@ -0,0 +1,22 @@
import Benchmark from "benchmark";
import baseline from "@babel-baseline/parser";
import current from "../../lib/index.js";
import { report } from "../util.mjs";

const suite = new Benchmark.Suite();
function createInput(length) {
return "type A = " + "| (x) => void".repeat(length);
}
function benchCases(name, implementation, options) {
for (const length of [256, 512, 1024, 2048]) {
const input = createInput(length);
suite.add(`${name} ${length} arrow function types`, () => {
implementation.parse(input, options);
});
}
}

benchCases("baseline", baseline, { plugins: ["flow"] });
benchCases("current", current, { plugins: ["flow"] });

suite.on("cycle", report).run();
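
For context, a standalone sketch (not part of the diff) of the input this benchmark generates; each `(x)` is a point where the Flow plugin presumably has to look ahead to tell a parenthesized type from an arrow-function parameter list, which is what makes this a lookahead stress test:

// Mirrors createInput above; runnable on its own.
function createInput(length) {
  return "type A = " + "| (x) => void".repeat(length);
}
console.log(createInput(2));
// -> type A = | (x) => void| (x) => void
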
3 changes: 1 addition & 2 deletions packages/babel-parser/src/plugins/flow/index.js
@@ -9,7 +9,6 @@ import type Parser from "../../parser";
import { types as tt, type TokenType } from "../../tokenizer/types";
import * as N from "../../types";
import type { Pos, Position } from "../../util/location";
import type State from "../../tokenizer/state";
import { types as tc } from "../../tokenizer/context";
import * as charCodes from "charcodes";
import { isIteratorStart, isKeyword } from "../../util/identifier";
@@ -154,7 +153,7 @@ function hasTypeImportKind(node: N.Node): boolean {
return node.importKind === "type" || node.importKind === "typeof";
}

function isMaybeDefaultImport(state: State): boolean {
function isMaybeDefaultImport(state: { type: TokenType, value: any }): boolean {
return (
(state.type === tt.name || !!state.type.keyword) && state.value !== "from"
);
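
The parameter type is narrowed here because `lookahead()` no longer returns a full `State`; the check only reads `type` and `value`, so either object fits. A rough standalone sketch (the token-type object is a simplified stand-in, not Babel's real `tt` table):

// Simplified stand-in: only the fields the check reads.
const ttName = { label: "name", keyword: undefined };

function isMaybeDefaultImport(state) {
  return (state.type === ttName || !!state.type.keyword) && state.value !== "from";
}

// Any object exposing { type, value } works, including the lookahead sub-state.
console.log(isMaybeDefaultImport({ type: ttName, value: "foo" }));  // true
console.log(isMaybeDefaultImport({ type: ttName, value: "from" })); // false
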
12 changes: 12 additions & 0 deletions packages/babel-parser/src/plugins/jsx/index.js
@@ -15,6 +15,10 @@ import { isIdentifierChar, isIdentifierStart } from "../../util/identifier";
import type { Position } from "../../util/location";
import { isNewLine } from "../../util/whitespace";
import { Errors, makeErrorTemplates, ErrorCodes } from "../../parser/error";
import type { LookaheadState } from "../../tokenizer/state";
import State from "../../tokenizer/state";

type JSXLookaheadState = LookaheadState & { inPropertyName: boolean };

const HEX_NUMBER = /^[\da-fA-F]+$/;
const DECIMAL_NUMBER = /^\d+$/;
@@ -573,6 +577,14 @@ export default (superClass: Class<Parser>): Class<Parser> =>
}
}

createLookaheadState(state: State): JSXLookaheadState {
const lookaheadState = ((super.createLookaheadState(
state,
): any): JSXLookaheadState);
lookaheadState.inPropertyName = state.inPropertyName;
return lookaheadState;
}

getTokenFromCode(code: number): void {
if (this.state.inPropertyName) return super.getTokenFromCode(code);

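
`createLookaheadState` doubles as an extension point: a plugin that needs extra tokenizer state during lookahead copies it into the sub-state, as the JSX override above does with `inPropertyName`. An illustrative standalone sketch of that pattern (class names are made up, not Babel's):

// Illustrative only: a subclass carrying one extra flag through lookahead.
class BaseTokenizer {
  createLookaheadState(state) {
    return { pos: state.pos, type: state.type, value: null };
  }
}

class JsxLikeTokenizer extends BaseTokenizer {
  createLookaheadState(state) {
    const lookaheadState = super.createLookaheadState(state);
    lookaheadState.inPropertyName = state.inPropertyName; // preserved during lookahead
    return lookaheadState;
  }
}

const sub = new JsxLikeTokenizer().createLookaheadState({ pos: 0, type: null, inPropertyName: true });
console.log(sub.inPropertyName); // true
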
11 changes: 2 additions & 9 deletions packages/babel-parser/src/tokenizer/context.js
@@ -7,22 +7,15 @@
import { types as tt } from "./types";

export class TokContext {
constructor(
token: string,
isExpr?: boolean,
preserveSpace?: boolean,
override?: ?Function, // Takes a Tokenizer as a this-parameter, and returns void.
) {
constructor(token: string, isExpr?: boolean, preserveSpace?: boolean) {
this.token = token;
this.isExpr = !!isExpr;
this.preserveSpace = !!preserveSpace;
this.override = override;
}

token: string;
isExpr: boolean;
preserveSpace: boolean;
override: ?Function;
}

export const types: {
@@ -34,7 +27,7 @@ export const types: {
templateQuasi: new TokContext("${", false),
parenStatement: new TokContext("(", false),
parenExpression: new TokContext("(", true),
template: new TokContext("`", true, true, p => p.readTmplToken()),
template: new TokContext("`", true, true),
functionExpression: new TokContext("function", true),
functionStatement: new TokContext("function", false),
};
77 changes: 59 additions & 18 deletions packages/babel-parser/src/tokenizer/index.js
@@ -19,6 +19,7 @@ import {
skipWhiteSpace,
} from "../util/whitespace";
import State from "./state";
import type { LookaheadState } from "./state";

const VALID_REGEX_FLAGS = new Set(["g", "m", "s", "i", "y", "u"]);

@@ -144,11 +145,9 @@ export default class Tokenizer extends ParserErrors {
// Move to the next token

next(): void {
if (!this.isLookahead) {
this.checkKeywordEscapes();
if (this.options.tokens) {
this.pushToken(new Token(this.state));
}
this.checkKeywordEscapes();
if (this.options.tokens) {
this.pushToken(new Token(this.state));
}

this.state.lastTokEnd = this.state.end;
@@ -175,14 +174,51 @@
return this.state.type === type;
}

// TODO
/**
* Create a LookaheadState from current parser state
*
* @param {State} state
* @returns {LookaheadState}
* @memberof Tokenizer
*/
createLookaheadState(state: State): LookaheadState {
return {
pos: state.pos,
value: null,
type: state.type,
start: state.start,
end: state.end,
lastTokEnd: state.end,
context: [this.curContext()],
exprAllowed: state.exprAllowed,
inType: state.inType,
};
}

lookahead(): State {
/**
* lookahead peeks the next token, skipping changes to token context and
* comment stack. For performance it returns a limited LookaheadState
* instead of full parser state.
*
* The { column, line } Loc info is not included in lookahead since such usage
* is rare. Although it may return other location properties e.g. `curLine` and
* `lineStart`, these properties are not listed in the LookaheadState interface
* and thus the returned value is _NOT_ reliable.
*
* The tokenizer should make best efforts to avoid using any parser state
* other than those defined in LookaheadState
*
* @returns {LookaheadState}
* @memberof Tokenizer
*/
lookahead(): LookaheadState {
const old = this.state;
this.state = old.clone(true);
// For performance we use a simplified tokenizer state structure
// $FlowIgnore
this.state = this.createLookaheadState(old);

this.isLookahead = true;
this.next();
this.nextToken();
this.isLookahead = false;

const curr = this.state;
@@ -247,17 +283,16 @@

nextToken(): void {
const curContext = this.curContext();
if (!curContext?.preserveSpace) this.skipSpace();
if (!curContext.preserveSpace) this.skipSpace();
this.state.start = this.state.pos;
this.state.startLoc = this.state.curPosition();
if (!this.isLookahead) this.state.startLoc = this.state.curPosition();
if (this.state.pos >= this.length) {
this.finishToken(tt.eof);
return;
}

const override = curContext?.override;
if (override) {
override(this);
if (curContext === ct.template) {
this.readTmplToken();
} else {
this.getTokenFromCode(this.codePointAtPos(this.state.pos));
}
@@ -285,7 +320,8 @@
}

skipBlockComment(): void {
const startLoc = this.state.curPosition();
let startLoc;
if (!this.isLookahead) startLoc = this.state.curPosition();
const start = this.state.pos;
const end = this.input.indexOf("*/", this.state.pos + 2);
if (end === -1) throw this.raise(start, Errors.UnterminatedComment);
@@ -304,6 +340,7 @@
// If we are doing a lookahead right now we need to advance the position (above code)
// but we do not want to push the comment to the state.
if (this.isLookahead) return;
/*:: invariant(startLoc) */

this.pushComment(
true,
@@ -317,7 +354,8 @@

skipLineComment(startSkip: number): void {
const start = this.state.pos;
const startLoc = this.state.curPosition();
let startLoc;
if (!this.isLookahead) startLoc = this.state.curPosition();
let ch = this.input.charCodeAt((this.state.pos += startSkip));
if (this.state.pos < this.length) {
while (!isNewLine(ch) && ++this.state.pos < this.length) {
Expand All @@ -328,6 +366,7 @@ export default class Tokenizer extends ParserErrors {
// If we are doing a lookahead right now we need to advance the position (above code)
// but we do not want to push the comment to the state.
if (this.isLookahead) return;
/*:: invariant(startLoc) */

this.pushComment(
false,
@@ -398,12 +437,14 @@

finishToken(type: TokenType, val: any): void {
this.state.end = this.state.pos;
this.state.endLoc = this.state.curPosition();
const prevType = this.state.type;
this.state.type = type;
this.state.value = val;

if (!this.isLookahead) this.updateContext(prevType);
if (!this.isLookahead) {
this.state.endLoc = this.state.curPosition();
this.updateContext(prevType);
}
}

// ### Token reading
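
As the doc comment on `lookahead()` above notes, callers should only rely on the fields listed in `LookaheadState`. A hypothetical call-site sketch (the helper name and the specific token check are illustrative, not from this diff):

// Illustrative helper: peek at the next token without mutating parser state.
// `tokenizer` stands in for a Tokenizer instance, `tt` for Babel's token types.
function nextTokenIsArrow(tokenizer, tt) {
  const { type } = tokenizer.lookahead(); // lightweight sub-state, not a full State
  return type === tt.arrow;
}
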
11 changes: 11 additions & 0 deletions packages/babel-parser/src/tokenizer/state.js
@@ -178,3 +178,14 @@ export default class State {
return state;
}
}

export type LookaheadState = {
pos: number,
value: any,
type: TokenType,
start: number,
end: number,
/* Used only in readSlashToken */
exprAllowed: boolean,
inType: boolean,
};
@@ -0,0 +1,2 @@
/*1*/ export /*2*/ { /*3*/ A /*4*/, /*5*/ B /*6*/ as /*7*/ C /*8*/ } /*9*/ from /*10*/ "foo";
/*1*/ export /*2*/ * /*3*/ from /*4*/ "foo"
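
These fixtures appear to check that comments around `export` statements are still attached correctly now that lookahead skips the comment stack. A rough sketch of verifying that outside the test suite (the options shown are the usual `@babel/parser` ones, not taken from the diff):

// Sketch: all four block comments should still be collected after parsing.
import { parse } from "@babel/parser";

const ast = parse('/*1*/ export /*2*/ * /*3*/ from /*4*/ "foo"', {
  sourceType: "module",
});
console.log(ast.comments.length); // 4
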
