Merge pull request #542 from asottile/named_escapes

handle named escape sequences in format upgrades
asottile · Sep 23, 2021 · 0981e83
2 parents 748a1f0 + 0f9d15b
commit 0981e83
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 30 deletions.
diff --git a/pyupgrade/_main.py b/pyupgrade/_main.py
@@ -31,8 +31,10 @@
 from pyupgrade._data import Settings
 from pyupgrade._data import Version
 from pyupgrade._data import visit
+from pyupgrade._string_helpers import curly_escape
 from pyupgrade._string_helpers import is_ascii
 from pyupgrade._string_helpers import is_codec
+from pyupgrade._string_helpers import NAMED_UNICODE_RE
 from pyupgrade._token_helpers import CLOSING
 from pyupgrade._token_helpers import KEYWORDS
 from pyupgrade._token_helpers import OPENING
@@ -47,21 +49,34 @@
 
 
 def parse_format(s: str) -> Tuple[DotFormatPart, ...]:
-    """Makes the empty string not a special case.  In the stdlib, there's
-    loss of information (the type) on the empty string.
-    """
-    parsed = tuple(_stdlib_parse_format(s))
-    if not parsed:
-        return ((s, None, None, None),)
-    else:
-        return parsed
+    """handle named escape sequences"""
+    ret: List[DotFormatPart] = []
+
+    for part in NAMED_UNICODE_RE.split(s):
+        if NAMED_UNICODE_RE.fullmatch(part):
+            if not ret:
+                ret.append((part, None, None, None))
+            else:
+                ret[-1] = (ret[-1][0] + part, None, None, None)
+        else:
+            first = True
+            for tup in _stdlib_parse_format(part):
+                if not first or not ret:
+                    ret.append(tup)
+                else:
+                    ret[-1] = (ret[-1][0] + tup[0], *tup[1:])
+                first = False
+
+    if not ret:
+        ret.append((s, None, None, None))
+
+    return tuple(ret)
 
 
 def unparse_parsed_string(parsed: Sequence[DotFormatPart]) -> str:
     def _convert_tup(tup: DotFormatPart) -> str:
         ret, field_name, format_spec, conversion = tup
-        ret = ret.replace('{', '{{')
-        ret = ret.replace('}', '}}')
+        ret = curly_escape(ret)
         if field_name is not None:
             ret += '{' + field_name
             if conversion:
@@ -786,10 +801,6 @@ def _fix_py36_plus(contents_text: str, *, min_version: Version) -> str:
         return contents_text
     for i, token in reversed_enumerate(tokens):
         if token.offset in visitor.fstrings:
-            # TODO: handle \N escape sequences
-            if r'\N' in token.src:
-                continue
-
             paren = i + 3
             if tokens_to_src(tokens[i + 1:paren + 1]) != '.format(':
                 continue

diff --git a/pyupgrade/_plugins/percent_format.py b/pyupgrade/_plugins/percent_format.py
@@ -18,6 +18,7 @@
 from pyupgrade._data import register
 from pyupgrade._data import State
 from pyupgrade._data import TokenFunc
+from pyupgrade._string_helpers import curly_escape
 from pyupgrade._token_helpers import KEYWORDS
 from pyupgrade._token_helpers import remove_brace
 from pyupgrade._token_helpers import victims
@@ -120,7 +121,8 @@ def _simplify_conversion_flag(flag: str) -> str:
 def _percent_to_format(s: str) -> str:
     def _handle_part(part: PercentFormat) -> str:
         s, fmt = part
-        s = s.replace('{', '{{').replace('}', '}}')
+        s = curly_escape(s)
+
         if fmt is None:
             return s
         else:
@@ -155,10 +157,6 @@ def _fix_percent_format_tuple(
         *,
         node_right: ast.Tuple,
 ) -> None:
-    # TODO: handle \N escape sequences
-    if r'\N' in tokens[i].src:
-        return
-
     # TODO: this is overly timid
     paren = i + 4
     if tokens_to_src(tokens[i + 1:paren + 1]) != ' % (':
@@ -181,10 +179,6 @@ def _fix_percent_format_dict(
         *,
         node_right: ast.Dict,
 ) -> None:
-    # TODO: handle \N escape sequences
-    if r'\N' in tokens[i].src:
-        return
-
     seen_keys: Set[str] = set()
     keys = {}
 

diff --git a/pyupgrade/_string_helpers.py b/pyupgrade/_string_helpers.py
@@ -1,4 +1,5 @@
 import codecs
+import re
 import string
 import sys
 
@@ -8,6 +9,18 @@
     def is_ascii(s: str) -> bool:
         return all(c in string.printable for c in s)
 
+NAMED_UNICODE_RE = re.compile(r'(?<!\\)(?:\\\\)*(\\N\{[^}]+\})')
+
+
+def curly_escape(s: str) -> str:
+    parts = NAMED_UNICODE_RE.split(s)
+    return ''.join(
+        part.replace('{', '{{').replace('}', '}}')
+        if not NAMED_UNICODE_RE.fullmatch(part)
+        else part
+        for part in parts
+    )
+
 
 def is_codec(encoding: str, name: str) -> bool:
     try:

diff --git a/tests/features/format_literals_test.py b/tests/features/format_literals_test.py
@@ -16,6 +16,14 @@ def test_roundtrip_text(s):
     assert unparse_parsed_string(parse_format(s)) == s
 
 
+def test_parse_format_starts_with_named():
+    # technically not possible since our string always starts with quotes
+    assert parse_format(r'\N{snowman} hi {0} hello') == (
+        (r'\N{snowman} hi ', '0', '', None),
+        (' hello', None, None, None),
+    )
+
+
 @pytest.mark.parametrize(
     ('s', 'expected'),
     (
@@ -49,8 +57,6 @@ def test_intentionally_not_round_trip(s, expected):
         "'{' '0}'.format(1)",
         # comment looks like placeholder but is not!
         '("{0}" # {1}\n"{2}").format(1, 2, 3)',
-        # TODO: this works by accident (extended escape treated as placeholder)
-        r'"\N{snowman} {}".format(1)',
         # don't touch f-strings (these are wrong but don't make it worse)
         'f"{0}".format(a)',
     ),
@@ -101,6 +107,11 @@ def test_format_literals_noop(s):
         ),
         # parenthesized string literals
         ('("{0}").format(1)', '("{}").format(1)'),
+        pytest.param(
+            r'"\N{snowman} {0}".format(1)',
+            r'"\N{snowman} {}".format(1)',
+            id='named escape sequence',
+        ),
     ),
 )
 def test_format_literals(s, expected):

diff --git a/tests/features/fstrings_test.py b/tests/features/fstrings_test.py
@@ -26,8 +26,6 @@
         '"{:{}}".format(x, y)',
         '"{a[b]}".format(a=a)',
         '"{a.a[b]}".format(a=a)',
-        # TODO: handle \N escape sequences
-        r'"\N{snowman} {}".format(a)',
         # not enough placeholders / placeholders missing
         '"{}{}".format(a)', '"{a}{b}".format(a=a)',
         # backslashes and quotes cannot nest
@@ -58,6 +56,11 @@ def test_fix_fstrings_noop(s):
         ('"{}{{}}{}".format(escaped, y)', 'f"{escaped}{{}}{y}"'),
         ('"{}{b}{}".format(a, c, b=b)', 'f"{a}{b}{c}"'),
         ('"{}".format(0x0)', 'f"{0x0}"'),
+        pytest.param(
+            r'"\N{snowman} {}".format(a)',
+            r'f"\N{snowman} {a}"',
+            id='named escape sequences',
+        ),
         # TODO: poor man's f-strings?
         # '"{foo}".format(**locals())'
     ),

diff --git a/tests/features/percent_format_test.py b/tests/features/percent_format_test.py
@@ -178,9 +178,6 @@ def test_simplify_conversion_flag(s, expected):
         '"%(and)s" % {"and": 2}',
         # invalid string formats
         '"%" % {}', '"%(hi)" % {}', '"%2" % {}',
-        # TODO: handle \N escape sequences
-        r'"%s \N{snowman}" % (a,)',
-        r'"%(foo)s \N{snowman}" % {"foo": 1}',
     ),
 )
 def test_percent_format_noop(s):
@@ -223,6 +220,15 @@ def test_percent_format_noop_if_bug_16806():
         # dict
         ('"%(k)s" % {"k": "v"}', '"{k}".format(k="v")'),
         ('"%(to_list)s" % {"to_list": []}', '"{to_list}".format(to_list=[])'),
+        # \N escapes
+        (
+            r'"%s \N{snowman}" % (a,)',
+            r'"{} \N{snowman}".format(a)',
+        ),
+        (
+            r'"%(foo)s \N{snowman}" % {"foo": 1}',
+            r'"{foo} \N{snowman}".format(foo=1)',
+        ),
     ),
 )
 def test_percent_format(s, expected):