From 0f9d15b2e4162b74f02c59e37daa09013765fd3a Mon Sep 17 00:00:00 2001 From: Anthony Sottile Date: Wed, 22 Sep 2021 22:12:53 -0400 Subject: [PATCH] handle named escape sequences in format upgrades --- pyupgrade/_main.py | 39 +++++++++++++++++--------- pyupgrade/_plugins/percent_format.py | 12 ++------ pyupgrade/_string_helpers.py | 13 +++++++++ tests/features/format_literals_test.py | 15 ++++++++-- tests/features/fstrings_test.py | 7 +++-- tests/features/percent_format_test.py | 12 ++++++-- 6 files changed, 68 insertions(+), 30 deletions(-) diff --git a/pyupgrade/_main.py b/pyupgrade/_main.py index 291fc47a..2675619e 100644 --- a/pyupgrade/_main.py +++ b/pyupgrade/_main.py @@ -31,8 +31,10 @@ from pyupgrade._data import Settings from pyupgrade._data import Version from pyupgrade._data import visit +from pyupgrade._string_helpers import curly_escape from pyupgrade._string_helpers import is_ascii from pyupgrade._string_helpers import is_codec +from pyupgrade._string_helpers import NAMED_UNICODE_RE from pyupgrade._token_helpers import CLOSING from pyupgrade._token_helpers import KEYWORDS from pyupgrade._token_helpers import OPENING @@ -47,21 +49,34 @@ def parse_format(s: str) -> Tuple[DotFormatPart, ...]: - """Makes the empty string not a special case. In the stdlib, there's - loss of information (the type) on the empty string. 
- """ - parsed = tuple(_stdlib_parse_format(s)) - if not parsed: - return ((s, None, None, None),) - else: - return parsed + """handle named escape sequences""" + ret: List[DotFormatPart] = [] + + for part in NAMED_UNICODE_RE.split(s): + if NAMED_UNICODE_RE.fullmatch(part): + if not ret: + ret.append((part, None, None, None)) + else: + ret[-1] = (ret[-1][0] + part, None, None, None) + else: + first = True + for tup in _stdlib_parse_format(part): + if not first or not ret: + ret.append(tup) + else: + ret[-1] = (ret[-1][0] + tup[0], *tup[1:]) + first = False + + if not ret: + ret.append((s, None, None, None)) + + return tuple(ret) def unparse_parsed_string(parsed: Sequence[DotFormatPart]) -> str: def _convert_tup(tup: DotFormatPart) -> str: ret, field_name, format_spec, conversion = tup - ret = ret.replace('{', '{{') - ret = ret.replace('}', '}}') + ret = curly_escape(ret) if field_name is not None: ret += '{' + field_name if conversion: @@ -786,10 +801,6 @@ def _fix_py36_plus(contents_text: str, *, min_version: Version) -> str: return contents_text for i, token in reversed_enumerate(tokens): if token.offset in visitor.fstrings: - # TODO: handle \N escape sequences - if r'\N' in token.src: - continue - paren = i + 3 if tokens_to_src(tokens[i + 1:paren + 1]) != '.format(': continue diff --git a/pyupgrade/_plugins/percent_format.py b/pyupgrade/_plugins/percent_format.py index 1b0599df..c75d745a 100644 --- a/pyupgrade/_plugins/percent_format.py +++ b/pyupgrade/_plugins/percent_format.py @@ -18,6 +18,7 @@ from pyupgrade._data import register from pyupgrade._data import State from pyupgrade._data import TokenFunc +from pyupgrade._string_helpers import curly_escape from pyupgrade._token_helpers import KEYWORDS from pyupgrade._token_helpers import remove_brace from pyupgrade._token_helpers import victims @@ -120,7 +121,8 @@ def _simplify_conversion_flag(flag: str) -> str: def _percent_to_format(s: str) -> str: def _handle_part(part: PercentFormat) -> str: s, fmt = part - s 
= s.replace('{', '{{').replace('}', '}}') + s = curly_escape(s) + if fmt is None: return s else: @@ -155,10 +157,6 @@ def _fix_percent_format_tuple( *, node_right: ast.Tuple, ) -> None: - # TODO: handle \N escape sequences - if r'\N' in tokens[i].src: - return - # TODO: this is overly timid paren = i + 4 if tokens_to_src(tokens[i + 1:paren + 1]) != ' % (': @@ -181,10 +179,6 @@ def _fix_percent_format_dict( *, node_right: ast.Dict, ) -> None: - # TODO: handle \N escape sequences - if r'\N' in tokens[i].src: - return - seen_keys: Set[str] = set() keys = {} diff --git a/pyupgrade/_string_helpers.py b/pyupgrade/_string_helpers.py index cae45270..aac52cb0 100644 --- a/pyupgrade/_string_helpers.py +++ b/pyupgrade/_string_helpers.py @@ -1,4 +1,5 @@ import codecs +import re import string import sys @@ -8,6 +9,18 @@ def is_ascii(s: str) -> bool: return all(c in string.printable for c in s) +NAMED_UNICODE_RE = re.compile(r'(?<!\\)(?:\\\\)*(\\N\{[^}]+\})') + + +def curly_escape(s: str) -> str: + parts = NAMED_UNICODE_RE.split(s) + return ''.join( + part.replace('{', '{{').replace('}', '}}') + if not NAMED_UNICODE_RE.fullmatch(part) + else part + for part in parts + ) + def is_codec(encoding: str, name: str) -> bool: try: diff --git a/tests/features/format_literals_test.py b/tests/features/format_literals_test.py index bc00bd17..391fb40d 100644 --- a/tests/features/format_literals_test.py +++ b/tests/features/format_literals_test.py @@ -16,6 +16,14 @@ def test_roundtrip_text(s): assert unparse_parsed_string(parse_format(s)) == s +def test_parse_format_starts_with_named(): + # technically not possible since our string always starts with quotes + assert parse_format(r'\N{snowman} hi {0} hello') == ( + (r'\N{snowman} hi ', '0', '', None), + (' hello', None, None, None), + ) + + @pytest.mark.parametrize( ('s', 'expected'), ( @@ -49,8 +57,6 @@ def test_intentionally_not_round_trip(s, expected): "'{' '0}'.format(1)", # comment looks like placeholder but is not! 
'("{0}" # {1}\n"{2}").format(1, 2, 3)', - # TODO: this works by accident (extended escape treated as placeholder) - r'"\N{snowman} {}".format(1)', # don't touch f-strings (these are wrong but don't make it worse) 'f"{0}".format(a)', ), @@ -101,6 +107,11 @@ def test_format_literals_noop(s): ), # parenthesized string literals ('("{0}").format(1)', '("{}").format(1)'), + pytest.param( + r'"\N{snowman} {0}".format(1)', + r'"\N{snowman} {}".format(1)', + id='named escape sequence', + ), ), ) def test_format_literals(s, expected): diff --git a/tests/features/fstrings_test.py b/tests/features/fstrings_test.py index 3fd54c37..61c2cc1c 100644 --- a/tests/features/fstrings_test.py +++ b/tests/features/fstrings_test.py @@ -26,8 +26,6 @@ '"{:{}}".format(x, y)', '"{a[b]}".format(a=a)', '"{a.a[b]}".format(a=a)', - # TODO: handle \N escape sequences - r'"\N{snowman} {}".format(a)', # not enough placeholders / placeholders missing '"{}{}".format(a)', '"{a}{b}".format(a=a)', # backslashes and quotes cannot nest @@ -58,6 +56,11 @@ def test_fix_fstrings_noop(s): ('"{}{{}}{}".format(escaped, y)', 'f"{escaped}{{}}{y}"'), ('"{}{b}{}".format(a, c, b=b)', 'f"{a}{b}{c}"'), ('"{}".format(0x0)', 'f"{0x0}"'), + pytest.param( + r'"\N{snowman} {}".format(a)', + r'f"\N{snowman} {a}"', + id='named escape sequences', + ), # TODO: poor man's f-strings? 
# '"{foo}".format(**locals())' ), diff --git a/tests/features/percent_format_test.py b/tests/features/percent_format_test.py index 0d620ce0..536435e9 100644 --- a/tests/features/percent_format_test.py +++ b/tests/features/percent_format_test.py @@ -178,9 +178,6 @@ def test_simplify_conversion_flag(s, expected): '"%(and)s" % {"and": 2}', # invalid string formats '"%" % {}', '"%(hi)" % {}', '"%2" % {}', - # TODO: handle \N escape sequences - r'"%s \N{snowman}" % (a,)', - r'"%(foo)s \N{snowman}" % {"foo": 1}', ), ) def test_percent_format_noop(s): @@ -223,6 +220,15 @@ def test_percent_format_noop_if_bug_16806(): # dict ('"%(k)s" % {"k": "v"}', '"{k}".format(k="v")'), ('"%(to_list)s" % {"to_list": []}', '"{to_list}".format(to_list=[])'), + # \N escapes + ( + r'"%s \N{snowman}" % (a,)', + r'"{} \N{snowman}".format(a)', + ), + ( + r'"%(foo)s \N{snowman}" % {"foo": 1}', + r'"{foo} \N{snowman}".format(foo=1)', + ), ), ) def test_percent_format(s, expected):