From 5e2bb528e09df368ed7dea6b7fb9c53e799a569f Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Tue, 30 Nov 2021 18:01:36 -0800 Subject: [PATCH] Reduce usage of regex (#2644) This removes all but one usage of the `regex` dependency. Tricky bits included: - A bug in test_black.py where we were incorrectly using a character range. Fix also submitted separately in #2643. - `tokenize.py` was the original use case for regex (#1047). The important bit is that we rely on `\w` to match anything valid in an identifier, and `re` fails to match a few characters as part of identifiers. My solution is to instead match all characters *except* those we know to mean something else in Python: whitespace and ASCII punctuation. This will make Black able to parse some invalid Python programs, like those that contain non-ASCII punctuation in the place of an identifier, but that seems fine to me. - One import of `regex` remains, in `trans.py`. We use a recursive regex to parse f-strings, and only `regex` supports that. I haven't thought of a better fix there (except maybe writing a manual parser), so I'm leaving that for now. My goal is to remove the `regex` dependency to reduce the risk of breakage due to dependencies and make life easier for users on platforms without wheels. --- CHANGES.md | 9 +++++---- src/black/__init__.py | 2 +- src/black/comments.py | 2 +- src/black/strings.py | 4 ++-- src/black/trans.py | 2 +- src/blib2to3/pgen2/conv.py | 2 +- src/blib2to3/pgen2/tokenize.py | 4 ++-- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 85feb1a7600..7214405c429 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -7,12 +7,13 @@ - Cell magics are now only processed if they are known Python cell magics. Earlier, all cell magics were tokenized, leading to possible indentation errors e.g. with `%%writefile`. (#2630) -- Fixed Python 3.10 support on platforms without ProcessPoolExecutor (#2631) -- Fixed `match` statements with open sequence subjects, like `match a, b:` or +- Fix Python 3.10 support on platforms without ProcessPoolExecutor (#2631) +- Reduce usage of the `regex` dependency (#2644) +- Fix `match` statements with open sequence subjects, like `match a, b:` or `match a, *b:` (#2639) (#2659) -- Fixed `match`/`case` statements that contain `match`/`case` soft keywords multiple +- Fix `match`/`case` statements that contain `match`/`case` soft keywords multiple times, like `match re.match()` (#2661) -- Fixed assignment to environment variables in Jupyter Notebooks (#2642) +- Fix assignment to environment variables in Jupyter Notebooks (#2642) - Add `flake8-simplify` and `flake8-comprehensions` plugins (#2653) ## 21.11b1 diff --git a/src/black/__init__.py b/src/black/__init__.py index c2b52e6eadb..1923c069ede 100644 --- a/src/black/__init__.py +++ b/src/black/__init__.py @@ -10,7 +10,7 @@ import os from pathlib import Path from pathspec.patterns.gitwildmatch import GitWildMatchPatternError -import regex as re +import re import signal import sys import tokenize diff --git a/src/black/comments.py b/src/black/comments.py index a8152d687a3..28b9117101d 100644 --- a/src/black/comments.py +++ b/src/black/comments.py @@ -1,7 +1,7 @@ import sys from dataclasses import dataclass from functools import lru_cache -import regex as re +import re from typing import Iterator, List, Optional, Union if sys.version_info >= (3, 8): diff --git a/src/black/strings.py b/src/black/strings.py index 97debe3b5de..06a5da01f0c 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -2,7 +2,7 @@ Simple formatting on strings. Further string formatting code is in trans.py. """ -import regex as re +import re import sys from functools import lru_cache from typing import List, Pattern @@ -156,7 +156,7 @@ def normalize_string_prefix(s: str, remove_u_prefix: bool = False) -> str: # performance on a long list literal of strings by 5-9% since lru_cache's # caching overhead is much lower. @lru_cache(maxsize=64) -def _cached_compile(pattern: str) -> re.Pattern: +def _cached_compile(pattern: str) -> Pattern[str]: return re.compile(pattern) diff --git a/src/black/trans.py b/src/black/trans.py index d918ef111a2..a4d1e6fbc79 100644 --- a/src/black/trans.py +++ b/src/black/trans.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from dataclasses import dataclass -import regex as re +import regex as re # We need recursive patterns here (?R) from typing import ( Any, Callable, diff --git a/src/blib2to3/pgen2/conv.py b/src/blib2to3/pgen2/conv.py index 78165217a1b..fa9825e54d6 100644 --- a/src/blib2to3/pgen2/conv.py +++ b/src/blib2to3/pgen2/conv.py @@ -29,7 +29,7 @@ """ # Python imports -import regex as re +import re # Local imports from pgen2 import grammar, token diff --git a/src/blib2to3/pgen2/tokenize.py b/src/blib2to3/pgen2/tokenize.py index 283fac2d537..a7e17df1e8f 100644 --- a/src/blib2to3/pgen2/tokenize.py +++ b/src/blib2to3/pgen2/tokenize.py @@ -52,7 +52,7 @@ __author__ = "Ka-Ping Yee " __credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" -import regex as re +import re from codecs import BOM_UTF8, lookup from blib2to3.pgen2.token import * @@ -86,7 +86,7 @@ def _combinations(*l): Comment = r"#[^\r\n]*" Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment) Name = ( # this is invalid but it's fine because Name comes after Number in all groups - r"\w+" + r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+" ) Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"