From 7d7472168e8c6b53ceb09355a30f9ba4b98b73ae Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Sun, 26 Dec 2021 19:06:45 +0300 Subject: [PATCH 1/3] Speed up new backtracking parser --- src/blib2to3/pgen2/parse.py | 66 +++++++++++++-- tests/data/pattern_matching_generic.py | 107 +++++++++++++++++++++++++ tests/test_format.py | 1 + 3 files changed, 169 insertions(+), 5 deletions(-) create mode 100644 tests/data/pattern_matching_generic.py diff --git a/src/blib2to3/pgen2/parse.py b/src/blib2to3/pgen2/parse.py index e5dad3ae766..01b4e38c57b 100644 --- a/src/blib2to3/pgen2/parse.py +++ b/src/blib2to3/pgen2/parse.py @@ -46,6 +46,17 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL: return Node(type=node[0], children=node[3], context=node[2]) +# A placeholder node, used when parser is backtracking. +FAKE_NODE = (-1, None, None, None) + + +def stack_copy( + stack: List[Tuple[DFAS, int, RawNode]] +) -> List[Tuple[DFAS, int, RawNode]]: + """Nodeless stack copy.""" + return [(copy.deepcopy(dfa), label, FAKE_NODE) for dfa, label, _ in stack] + + class Recorder: def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> None: self.parser = parser @@ -54,7 +65,7 @@ def __init__(self, parser: "Parser", ilabels: List[int], context: Context) -> No self._dead_ilabels: Set[int] = set() self._start_point = self.parser.stack - self._points = {ilabel: copy.deepcopy(self._start_point) for ilabel in ilabels} + self._points = {ilabel: stack_copy(self._start_point) for ilabel in ilabels} @property def ilabels(self) -> Set[int]: @@ -62,13 +73,37 @@ def ilabels(self) -> Set[int]: @contextmanager def switch_to(self, ilabel: int) -> Iterator[None]: - self.parser.stack = self._points[ilabel] + with self.patch(): + self.parser.stack = self._points[ilabel] + try: + yield + except ParseError: + self._dead_ilabels.add(ilabel) + finally: + self.parser.stack = self._start_point + + @contextmanager + def patch(self) -> Iterator[None]: + """ + Patch basic state operations 
(push/pop/shift) with node-level + immutable variants. These still will operate on the stack; but + they won't create any new nodes, or modify the contents of any + other existing nodes. + + This saves us a ton of time when we are backtracking, since we + want to restore to the initial state as quick as possible, which + can only be done by having as little mutatations as possible. + """ + original_functions = {} + for name in self.parser.STATE_OPERATIONS: + original_functions[name] = getattr(self.parser, name) + safe_variant = getattr(self.parser, name + "_safe") + setattr(self.parser, name, safe_variant) try: yield - except ParseError: - self._dead_ilabels.add(ilabel) finally: - self.parser.stack = self._start_point + for name, func in original_functions.items(): + setattr(self.parser, name, func) def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None: func: Callable[..., Any] @@ -317,6 +352,8 @@ def classify(self, type: int, value: Text, context: Context) -> List[int]: raise ParseError("bad token", type, value, context) return [ilabel] + STATE_OPERATIONS = ["shift", "push", "pop"] + def shift(self, type: int, value: Text, newstate: int, context: Context) -> None: """Shift a token. 
(Internal)""" dfa, state, node = self.stack[-1] @@ -344,3 +381,22 @@ def pop(self) -> None: else: self.rootnode = newnode self.rootnode.used_names = self.used_names + + def shift_safe( + self, type: int, value: Text, newstate: int, context: Context + ) -> None: + """Immutable (node-level) version of shift()""" + dfa, state, _ = self.stack[-1] + self.stack[-1] = (dfa, newstate, FAKE_NODE) + + def push_safe( + self, type: int, newdfa: DFAS, newstate: int, context: Context + ) -> None: + """Immutable (node-level) version of push()""" + dfa, state, _ = self.stack[-1] + self.stack[-1] = (dfa, newstate, FAKE_NODE) + self.stack.append((newdfa, 0, FAKE_NODE)) + + def pop_safe(self) -> None: + """Immutable (node-level) version of pop()""" + self.stack.pop() diff --git a/tests/data/pattern_matching_generic.py b/tests/data/pattern_matching_generic.py new file mode 100644 index 00000000000..00a0e4a677d --- /dev/null +++ b/tests/data/pattern_matching_generic.py @@ -0,0 +1,107 @@ +re.match() +match = a +with match() as match: + match = f"{match}" + +re.match() +match = a +with match() as match: + match = f"{match}" + + +def get_grammars(target_versions: Set[TargetVersion]) -> List[Grammar]: + if not target_versions: + # No target_version specified, so try all grammars. + return [ + # Python 3.7+ + pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords, + # Python 3.0-3.6 + pygram.python_grammar_no_print_statement_no_exec_statement, + # Python 2.7 with future print_function import + pygram.python_grammar_no_print_statement, + # Python 2.7 + pygram.python_grammar, + ] + + match match: + case case: + match match: + case case: + pass + + if all(version.is_python2() for version in target_versions): + # Python 2-only code, so try Python 2 grammars. 
+ return [ + # Python 2.7 with future print_function import + pygram.python_grammar_no_print_statement, + # Python 2.7 + pygram.python_grammar, + ] + + re.match() + match = a + with match() as match: + match = f"{match}" + + def test_patma_139(self): + x = False + match x: + case bool(z): + y = 0 + self.assertIs(x, False) + self.assertEqual(y, 0) + self.assertIs(z, x) + + # Python 3-compatible code, so only try Python 3 grammar. + grammars = [] + if supports_feature(target_versions, Feature.PATTERN_MATCHING): + # Python 3.10+ + grammars.append(pygram.python_grammar_soft_keywords) + # If we have to parse both, try to parse async as a keyword first + if not supports_feature( + target_versions, Feature.ASYNC_IDENTIFIERS + ) and not supports_feature(target_versions, Feature.PATTERN_MATCHING): + # Python 3.7-3.9 + grammars.append( + pygram.python_grammar_no_print_statement_no_exec_statement_async_keywords + ) + if not supports_feature(target_versions, Feature.ASYNC_KEYWORDS): + # Python 3.0-3.6 + grammars.append(pygram.python_grammar_no_print_statement_no_exec_statement) + + def test_patma_155(self): + x = 0 + y = None + match x: + case 1e1000: + y = 0 + self.assertEqual(x, 0) + self.assertIs(y, None) + + x = range(3) + match x: + case [y, case as x, z]: + w = 0 + + # At least one of the above branches must have been taken, because every Python + # version has exactly one of the two 'ASYNC_*' flags + return grammars + + +def lib2to3_parse(src_txt: str, target_versions: Iterable[TargetVersion] = ()) -> Node: + """Given a string with source, return the lib2to3 Node.""" + if not src_txt.endswith("\n"): + src_txt += "\n" + + grammars = get_grammars(set(target_versions)) + + +re.match() +match = a +with match() as match: + match = f"{match}" + +re.match() +match = a +with match() as match: + match = f"{match}" diff --git a/tests/test_format.py b/tests/test_format.py index 6651272a87c..db39678cdfe 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -69,6 +69,7 @@ 
"pattern_matching_complex", "pattern_matching_extras", "pattern_matching_style", + "pattern_matching_generic", "parenthesized_context_managers", ] From 4bc06fff40e17e3134f6f882a7bf0edbbc579257 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 10 Jan 2022 20:51:04 +0300 Subject: [PATCH 2/3] Unify the implementation of push/pop/shift --- src/blib2to3/pgen2/parse.py | 82 ++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/src/blib2to3/pgen2/parse.py b/src/blib2to3/pgen2/parse.py index 01b4e38c57b..e25f583d11c 100644 --- a/src/blib2to3/pgen2/parse.py +++ b/src/blib2to3/pgen2/parse.py @@ -47,14 +47,14 @@ def lam_sub(grammar: Grammar, node: RawNode) -> NL: # A placeholder node, used when parser is backtracking. -FAKE_NODE = (-1, None, None, None) +DUMMY_NODE = (-1, None, None, None) def stack_copy( stack: List[Tuple[DFAS, int, RawNode]] ) -> List[Tuple[DFAS, int, RawNode]]: """Nodeless stack copy.""" - return [(copy.deepcopy(dfa), label, FAKE_NODE) for dfa, label, _ in stack] + return [(copy.deepcopy(dfa), label, DUMMY_NODE) for dfa, label, _ in stack] class Recorder: @@ -94,16 +94,12 @@ def patch(self) -> Iterator[None]: want to restore to the initial state as quick as possible, which can only be done by having as little mutatations as possible. 
""" - original_functions = {} - for name in self.parser.STATE_OPERATIONS: - original_functions[name] = getattr(self.parser, name) - safe_variant = getattr(self.parser, name + "_safe") - setattr(self.parser, name, safe_variant) + is_backtracking = self.parser.is_backtracking try: + self.parser.is_backtracking = True yield finally: - for name, func in original_functions.items(): - setattr(self.parser, name, func) + self.parser.is_backtracking = is_backtracking def add_token(self, tok_type: int, tok_val: Text, raw: bool = False) -> None: func: Callable[..., Any] @@ -214,6 +210,7 @@ def __init__(self, grammar: Grammar, convert: Optional[Convert] = None) -> None: self.grammar = grammar # See note in docstring above. TL;DR this is ignored. self.convert = convert or lam_sub + self.is_backtracking = False def setup(self, proxy: "TokenProxy", start: Optional[int] = None) -> None: """Prepare for parsing. @@ -356,47 +353,40 @@ def classify(self, type: int, value: Text, context: Context) -> List[int]: def shift(self, type: int, value: Text, newstate: int, context: Context) -> None: """Shift a token. (Internal)""" - dfa, state, node = self.stack[-1] - rawnode: RawNode = (type, value, context, None) - newnode = convert(self.grammar, rawnode) - assert node[-1] is not None - node[-1].append(newnode) - self.stack[-1] = (dfa, newstate, node) + if self.is_backtracking: + dfa, state, _ = self.stack[-1] + self.stack[-1] = (dfa, newstate, DUMMY_NODE) + else: + dfa, state, node = self.stack[-1] + rawnode: RawNode = (type, value, context, None) + newnode = convert(self.grammar, rawnode) + assert node[-1] is not None + node[-1].append(newnode) + self.stack[-1] = (dfa, newstate, node) def push(self, type: int, newdfa: DFAS, newstate: int, context: Context) -> None: """Push a nonterminal. 
(Internal)""" - dfa, state, node = self.stack[-1] - newnode: RawNode = (type, None, context, []) - self.stack[-1] = (dfa, newstate, node) - self.stack.append((newdfa, 0, newnode)) + if self.is_backtracking: + dfa, state, _ = self.stack[-1] + self.stack[-1] = (dfa, newstate, DUMMY_NODE) + self.stack.append((newdfa, 0, DUMMY_NODE)) + else: + dfa, state, node = self.stack[-1] + newnode: RawNode = (type, None, context, []) + self.stack[-1] = (dfa, newstate, node) + self.stack.append((newdfa, 0, newnode)) def pop(self) -> None: """Pop a nonterminal. (Internal)""" - popdfa, popstate, popnode = self.stack.pop() - newnode = convert(self.grammar, popnode) - if self.stack: - dfa, state, node = self.stack[-1] - assert node[-1] is not None - node[-1].append(newnode) + if self.is_backtracking: + self.stack.pop() else: - self.rootnode = newnode - self.rootnode.used_names = self.used_names - - def shift_safe( - self, type: int, value: Text, newstate: int, context: Context - ) -> None: - """Immutable (node-level) version of shift()""" - dfa, state, _ = self.stack[-1] - self.stack[-1] = (dfa, newstate, FAKE_NODE) - - def push_safe( - self, type: int, newdfa: DFAS, newstate: int, context: Context - ) -> None: - """Immutable (node-level) version of push()""" - dfa, state, _ = self.stack[-1] - self.stack[-1] = (dfa, newstate, FAKE_NODE) - self.stack.append((newdfa, 0, FAKE_NODE)) - - def pop_safe(self) -> None: - """Immutable (node-level) version of pop()""" - self.stack.pop() + popdfa, popstate, popnode = self.stack.pop() + newnode = convert(self.grammar, popnode) + if self.stack: + dfa, state, node = self.stack[-1] + assert node[-1] is not None + node[-1].append(newnode) + else: + self.rootnode = newnode + self.rootnode.used_names = self.used_names From ac1d8d84f463ad4f4eb4141df164d8a749af4456 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Mon, 10 Jan 2022 21:00:19 +0300 Subject: [PATCH 3/3] Add changelog / some nits --- CHANGES.md | 2 ++ src/blib2to3/pgen2/parse.py | 13 
+++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index f6e8343ed00..a1c8ccb0b7d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,6 +24,8 @@ at least one pre-existing blank line (#2736) - Verbose mode also now describes how a project root was discovered and which paths will be formatted. (#2526) +- Speed up the new backtracking parser by about 4X in general (enabled when + `--target-version` is set to 3.10 and higher). (#2728) ### Packaging diff --git a/src/blib2to3/pgen2/parse.py b/src/blib2to3/pgen2/parse.py index e25f583d11c..8fe96672897 100644 --- a/src/blib2to3/pgen2/parse.py +++ b/src/blib2to3/pgen2/parse.py @@ -73,7 +73,7 @@ def ilabels(self) -> Set[int]: @contextmanager def switch_to(self, ilabel: int) -> Iterator[None]: - with self.patch(): + with self.backtrack(): self.parser.stack = self._points[ilabel] try: yield @@ -83,12 +83,11 @@ def switch_to(self, ilabel: int) -> Iterator[None]: self.parser.stack = self._start_point @contextmanager - def patch(self) -> Iterator[None]: + def backtrack(self) -> Iterator[None]: """ - Patch basic state operations (push/pop/shift) with node-level - immutable variants. These still will operate on the stack; but - they won't create any new nodes, or modify the contents of any - other existing nodes. + Use node-level invariant variants of the basic parsing operations (push/pop/shift). + These will still operate on the stack, but they won't create any new nodes or + modify the contents of any other existing nodes. This saves us a ton of time when we are backtracking, since we want to restore to the initial state as quick as possible, which @@ -349,8 +348,6 @@ def classify(self, type: int, value: Text, context: Context) -> List[int]: raise ParseError("bad token", type, value, context) return [ilabel] - STATE_OPERATIONS = ["shift", "push", "pop"] - def shift(self, type: int, value: Text, newstate: int, context: Context) -> None: """Shift a token. 
(Internal)""" if self.is_backtracking: