Format hex code in unicode escape sequences in string literals (#2916)

Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
psf · Jan 22, 2023 · eabff67 · eabff67
1 parent 1557f7d
commit eabff67
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 1 deletion.
diff --git a/CHANGES.md b/CHANGES.md
@@ -16,6 +16,7 @@
 
 <!-- Changes that affect Black's preview style -->
 
+- Format hex codes in Unicode escape sequences in string literals (#2916)
 - Add parentheses around `if`-`else` expressions (#2278)
 - Improve the performance on large expressions that contain many strings (#3467)
 - Fix a crash in preview style with assert + parenthesized string (#3415)

diff --git a/src/black/linegen.py b/src/black/linegen.py
@@ -59,6 +59,7 @@
     get_string_prefix,
     normalize_string_prefix,
     normalize_string_quotes,
+    normalize_unicode_escape_sequences,
 )
 from black.trans import (
     CannotTransform,
@@ -368,6 +369,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
         yield from self.visit_default(node)
 
     def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
+        if Preview.hex_codes_in_unicode_sequences in self.mode:
+            normalize_unicode_escape_sequences(leaf)
+
         if is_docstring(leaf) and "\\\n" not in leaf.value:
             # We're ignoring docstrings with backslash newline escapes because changing
             # indentation of those changes the AST representation of the code.

diff --git a/src/black/mode.py b/src/black/mode.py
@@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b
 class Preview(Enum):
     """Individual preview style features."""
 
+    hex_codes_in_unicode_sequences = auto()
     annotation_parens = auto()
     empty_lines_before_class_or_def_with_leading_comments = auto()
     handle_trailing_commas_in_head = auto()

diff --git a/src/black/strings.py b/src/black/strings.py
@@ -5,7 +5,9 @@
 import re
 import sys
 from functools import lru_cache
-from typing import List, Pattern
+from typing import List, Match, Pattern
+
+from blib2to3.pytree import Leaf
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -18,6 +20,15 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+# Matches a run of one or more backslashes followed by the body of a \x, \u, \U
+# or \N{} escape sequence. The run is captured as "backslashes" and everything
+# after it as "body". The named groups "u", "U" and "x" hold only the hex digits,
+# and "N" holds only the character name, without the surrounding braces.
+# A match alone does not prove that an escape sequence is present: the length of
+# the backslash run decides whether the final backslash is itself escaped.
+UNICODE_ESCAPE_RE: Final = re.compile(
+    r"(?P<backslashes>\\+)(?P<body>"
+    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
+    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
+    r")",
+    re.VERBOSE,
+)
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
         return s  # Prefer double quotes
 
     return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
+    """Normalize the casing of escape sequences in a string literal leaf.
+
+    Hex digits in `\\xhh`, `\\uxxxx` and `\\Uxxxxxxxx` escapes are lowercased and
+    character names in `\\N{...}` escapes are uppercased. The result is written
+    back to `leaf.value`; nothing is returned.
+
+    Raw literals are left untouched because they contain no escape sequences.
+    In bytes literals only `\\xhh` is rewritten: `\\u`, `\\U` and `\\N{}` are not
+    escape sequences there, so changing their case would change the value of the
+    literal.
+    """
+    text = leaf.value
+    prefix = get_string_prefix(text).lower()
+    if "r" in prefix:
+        return  # There is no escaping in raw strings.
+
+    is_bytes = "b" in prefix
+
+    def replace(m: Match[str]) -> str:
+        groups = m.groupdict()
+        back_slashes = groups["backslashes"]
+
+        if len(back_slashes) % 2 == 0:
+            # Every backslash is escaped by the one before it, so the body is
+            # plain text rather than an escape sequence.
+            return back_slashes + groups["body"]
+
+        if groups["x"]:
+            # \x is an escape sequence in both str and bytes literals.
+            return back_slashes + "x" + groups["x"].lower()
+
+        if is_bytes:
+            # \u, \U and \N{} are ordinary characters in bytes literals; rewriting
+            # them would alter the bytes the literal evaluates to.
+            return m.group(0)
+
+        if groups["u"]:
+            # \u
+            return back_slashes + "u" + groups["u"].lower()
+        if groups["U"]:
+            # \U
+            return back_slashes + "U" + groups["U"].lower()
+
+        assert groups["N"], f"Unexpected match: {m}"
+        # \N{}
+        return back_slashes + "N{" + groups["N"].upper() + "}"
+
+    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py
@@ -0,0 +1,33 @@
+x = "\x1F"
+x = "\\x1B"
+x = "\\\x1B"
+x = "\U0001F60E"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xA3"
+x = "\u2717"
+x = "\uFaCe"
+x = "\N{ox}\N{OX}"
+x = "\N{lAtIn smaLL letteR x}"
+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1Fdon't byte"
+x = rb"\x1Fdon't format"
+
+# output
+
+x = "\x1f"
+x = "\\x1B"
+x = "\\\x1b"
+x = "\U0001f60e"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xa3"
+x = "\u2717"
+x = "\uface"
+x = "\N{OX}\N{OX}"
+x = "\N{LATIN SMALL LETTER X}"
+x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1fdon't byte"
+x = rb"\x1Fdon't format"