Faster tokenizer lookahead (#13341)
* refactor: simplify token context structure

* add benchmark

* perf: return a sub-state on tokenizer lookahead

* Update packages/babel-parser/src/tokenizer/index.js

Co-authored-by: Brian Ng <bng412@gmail.com>

* Update packages/babel-parser/src/tokenizer/index.js

Co-authored-by: Brian Ng <bng412@gmail.com>

* remove irrelevant comment

* fix: guard curPosition with isLookahead

* add test cases

Co-authored-by: Brian Ng <bng412@gmail.com>
JLHwung and existentialism committed May 26, 2021
1 parent b1f57e5 commit acf2a10
Showing 10 changed files with 684 additions and 29 deletions.
@@ -0,0 +1,22 @@
import Benchmark from "benchmark";
import baseline from "@babel-baseline/parser";
import current from "../../lib/index.js";
import { report } from "../util.mjs";

const suite = new Benchmark.Suite();
function createInput(length) {
return "type A = " + "| (x) => void".repeat(length);
}
function benchCases(name, implementation, options) {
for (const length of [256, 512, 1024, 2048]) {
const input = createInput(length);
suite.add(`${name} ${length} arrow function types`, () => {
implementation.parse(input, options);
});
}
}

benchCases("baseline", baseline, { plugins: ["flow"] });
benchCases("current", current, { plugins: ["flow"] });

suite.on("cycle", report).run();
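
For context, a standalone sketch (not part of the diff) of the input this benchmark generates; each `(x)` is a point where the Flow plugin presumably has to look ahead to tell a parenthesized type from an arrow-function parameter list, which is what makes this a lookahead stress test:

// Mirrors createInput above; runnable on its own.
function createInput(length) {
  return "type A = " + "| (x) => void".repeat(length);
}
console.log(createInput(2));
// -> type A = | (x) => void| (x) => void
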
3 changes: 1 addition & 2 deletions packages/babel-parser/src/plugins/flow/index.js
@@ -9,7 +9,6 @@ import type Parser from "../../parser";
import { types as tt, type TokenType } from "../../tokenizer/types";
import * as N from "../../types";
import type { Pos, Position } from "../../util/location";
import type State from "../../tokenizer/state";
import { types as tc } from "../../tokenizer/context";
import * as charCodes from "charcodes";
import { isIteratorStart, isKeyword } from "../../util/identifier";
@@ -154,7 +153,7 @@ function hasTypeImportKind(node: N.Node): boolean {
return node.importKind === "type" || node.importKind === "typeof";
}

function isMaybeDefaultImport(state: State): boolean {
function isMaybeDefaultImport(state: { type: TokenType, value: any }): boolean {
return (
(state.type === tt.name || !!state.type.keyword) && state.value !== "from"
);
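
The parameter type is narrowed here because `lookahead()` no longer returns a full `State`; the check only reads `type` and `value`, so either object fits. A rough standalone sketch (the token-type object is a simplified stand-in, not Babel's real `tt` table):

// Simplified stand-in: only the fields the check reads.
const ttName = { label: "name", keyword: undefined };

function isMaybeDefaultImport(state) {
  return (state.type === ttName || !!state.type.keyword) && state.value !== "from";
}

// Any object exposing { type, value } works, including the lookahead sub-state.
console.log(isMaybeDefaultImport({ type: ttName, value: "foo" }));  // true
console.log(isMaybeDefaultImport({ type: ttName, value: "from" })); // false
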
12 changes: 12 additions & 0 deletions packages/babel-parser/src/plugins/jsx/index.js
@@ -15,6 +15,10 @@ import { isIdentifierChar, isIdentifierStart } from "../../util/identifier";
import type { Position } from "../../util/location";
import { isNewLine } from "../../util/whitespace";
import { Errors, makeErrorTemplates, ErrorCodes } from "../../parser/error";
import type { LookaheadState } from "../../tokenizer/state";
import State from "../../tokenizer/state";

type JSXLookaheadState = LookaheadState & { inPropertyName: boolean };

const HEX_NUMBER = /^[\da-fA-F]+$/;
const DECIMAL_NUMBER = /^\d+$/;
@@ -573,6 +577,14 @@ export default (superClass: Class<Parser>): Class<Parser> =>
}
}

createLookaheadState(state: State): JSXLookaheadState {
const lookaheadState = ((super.createLookaheadState(
state,
): any): JSXLookaheadState);
lookaheadState.inPropertyName = state.inPropertyName;
return lookaheadState;
}

getTokenFromCode(code: number): void {
if (this.state.inPropertyName) return super.getTokenFromCode(code);

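
`createLookaheadState` doubles as an extension point: a plugin that needs extra tokenizer state during lookahead copies it into the sub-state, as the JSX override above does with `inPropertyName`. An illustrative standalone sketch of that pattern (class names are made up, not Babel's):

// Illustrative only: a subclass carrying one extra flag through lookahead.
class BaseTokenizer {
  createLookaheadState(state) {
    return { pos: state.pos, type: state.type, value: null };
  }
}

class JsxLikeTokenizer extends BaseTokenizer {
  createLookaheadState(state) {
    const lookaheadState = super.createLookaheadState(state);
    lookaheadState.inPropertyName = state.inPropertyName; // preserved during lookahead
    return lookaheadState;
  }
}

const sub = new JsxLikeTokenizer().createLookaheadState({ pos: 0, type: null, inPropertyName: true });
console.log(sub.inPropertyName); // true
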
11 changes: 2 additions & 9 deletions packages/babel-parser/src/tokenizer/context.js
@@ -7,22 +7,15 @@
import { types as tt } from "./types";

export class TokContext {
constructor(
token: string,
isExpr?: boolean,
preserveSpace?: boolean,
override?: ?Function, // Takes a Tokenizer as a this-parameter, and returns void.
) {
constructor(token: string, isExpr?: boolean, preserveSpace?: boolean) {
this.token = token;
this.isExpr = !!isExpr;
this.preserveSpace = !!preserveSpace;
this.override = override;
}

token: string;
isExpr: boolean;
preserveSpace: boolean;
override: ?Function;
}

export const types: {
@@ -34,7 +27,7 @@ export const types: {
templateQuasi: new TokContext("${", false),
parenStatement: new TokContext("(", false),
parenExpression: new TokContext("(", true),
template: new TokContext("`", true, true, p => p.readTmplToken()),
template: new TokContext("`", true, true),
functionExpression: new TokContext("function", true),
functionStatement: new TokContext("function", false),
};
77 changes: 59 additions & 18 deletions packages/babel-parser/src/tokenizer/index.js
@@ -19,6 +19,7 @@ import {
skipWhiteSpace,
} from "../util/whitespace";
import State from "./state";
import type { LookaheadState } from "./state";

const VALID_REGEX_FLAGS = new Set(["g", "m", "s", "i", "y", "u"]);

@@ -144,11 +145,9 @@ export default class Tokenizer extends ParserErrors {
// Move to the next token

next(): void {
if (!this.isLookahead) {
this.checkKeywordEscapes();
if (this.options.tokens) {
this.pushToken(new Token(this.state));
}
this.checkKeywordEscapes();
if (this.options.tokens) {
this.pushToken(new Token(this.state));
}

this.state.lastTokEnd = this.state.end;
@@ -175,14 +174,51 @@
return this.state.type === type;
}

// TODO
/**
* Create a LookaheadState from current parser state
*
* @param {State} state
* @returns {LookaheadState}
* @memberof Tokenizer
*/
createLookaheadState(state: State): LookaheadState {
return {
pos: state.pos,
value: null,
type: state.type,
start: state.start,
end: state.end,
lastTokEnd: state.end,
context: [this.curContext()],
exprAllowed: state.exprAllowed,
inType: state.inType,
};
}

lookahead(): State {
/**
* lookahead peeks the next token, skipping changes to token context and
* comment stack. For performance it returns a limited LookaheadState
* instead of full parser state.
*
* The { column, line } Loc info is not included in lookahead since such usage
* is rare. Although it may return other location properties e.g. `curLine` and
* `lineStart`, these properties are not listed in the LookaheadState interface
* and thus the returned value is _NOT_ reliable.
*
* The tokenizer should make best efforts to avoid using any parser state
* other than those defined in LookaheadState
*
* @returns {LookaheadState}
* @memberof Tokenizer
*/
lookahead(): LookaheadState {
const old = this.state;
this.state = old.clone(true);
// For performance we use a simplified tokenizer state structure
// $FlowIgnore
this.state = this.createLookaheadState(old);

this.isLookahead = true;
this.next();
this.nextToken();
this.isLookahead = false;

const curr = this.state;
@@ -247,17 +283,16 @@

nextToken(): void {
const curContext = this.curContext();
if (!curContext?.preserveSpace) this.skipSpace();
if (!curContext.preserveSpace) this.skipSpace();
this.state.start = this.state.pos;
this.state.startLoc = this.state.curPosition();
if (!this.isLookahead) this.state.startLoc = this.state.curPosition();
if (this.state.pos >= this.length) {
this.finishToken(tt.eof);
return;
}

const override = curContext?.override;
if (override) {
override(this);
if (curContext === ct.template) {
this.readTmplToken();
} else {
this.getTokenFromCode(this.codePointAtPos(this.state.pos));
}
@@ -285,7 +320,8 @@
}

skipBlockComment(): void {
const startLoc = this.state.curPosition();
let startLoc;
if (!this.isLookahead) startLoc = this.state.curPosition();
const start = this.state.pos;
const end = this.input.indexOf("*/", this.state.pos + 2);
if (end === -1) throw this.raise(start, Errors.UnterminatedComment);
@@ -304,6 +340,7 @@
// If we are doing a lookahead right now we need to advance the position (above code)
// but we do not want to push the comment to the state.
if (this.isLookahead) return;
/*:: invariant(startLoc) */

this.pushComment(
true,
@@ -317,7 +354,8 @@

skipLineComment(startSkip: number): void {
const start = this.state.pos;
const startLoc = this.state.curPosition();
let startLoc;
if (!this.isLookahead) startLoc = this.state.curPosition();
let ch = this.input.charCodeAt((this.state.pos += startSkip));
if (this.state.pos < this.length) {
while (!isNewLine(ch) && ++this.state.pos < this.length) {
Expand All @@ -328,6 +366,7 @@ export default class Tokenizer extends ParserErrors {
// If we are doing a lookahead right now we need to advance the position (above code)
// but we do not want to push the comment to the state.
if (this.isLookahead) return;
/*:: invariant(startLoc) */

this.pushComment(
false,
@@ -398,12 +437,14 @@

finishToken(type: TokenType, val: any): void {
this.state.end = this.state.pos;
this.state.endLoc = this.state.curPosition();
const prevType = this.state.type;
this.state.type = type;
this.state.value = val;

if (!this.isLookahead) this.updateContext(prevType);
if (!this.isLookahead) {
this.state.endLoc = this.state.curPosition();
this.updateContext(prevType);
}
}

// ### Token reading
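
As the doc comment on `lookahead()` above notes, callers should only rely on the fields listed in `LookaheadState`. A hypothetical call-site sketch (the helper name and the specific token check are illustrative, not from this diff):

// Illustrative helper: peek at the next token without mutating parser state.
// `tokenizer` stands in for a Tokenizer instance, `tt` for Babel's token types.
function nextTokenIsArrow(tokenizer, tt) {
  const { type } = tokenizer.lookahead(); // lightweight sub-state, not a full State
  return type === tt.arrow;
}
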
11 changes: 11 additions & 0 deletions packages/babel-parser/src/tokenizer/state.js
@@ -178,3 +178,14 @@ export default class State {
return state;
}
}

export type LookaheadState = {
pos: number,
value: any,
type: TokenType,
start: number,
end: number,
/* Used only in readSlashToken */
exprAllowed: boolean,
inType: boolean,
};
@@ -0,0 +1,2 @@
/*1*/ export /*2*/ { /*3*/ A /*4*/, /*5*/ B /*6*/ as /*7*/ C /*8*/ } /*9*/ from /*10*/ "foo";
/*1*/ export /*2*/ * /*3*/ from /*4*/ "foo"
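
These fixtures appear to check that comments around `export` statements are still attached correctly now that lookahead skips the comment stack. A rough sketch of verifying that outside the test suite (the options shown are the usual `@babel/parser` ones, not taken from the diff):

// Sketch: all four block comments should still be collected after parsing.
import { parse } from "@babel/parser";

const ast = parse('/*1*/ export /*2*/ * /*3*/ from /*4*/ "foo"', {
  sourceType: "module",
});
console.log(ast.comments.length); // 4
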
