From add30b8a28403b09927bdcf7dec64633fef7826a Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Sun, 30 Jan 2022 12:43:19 +0530
Subject: [PATCH 01/18] Format hex code in unicode escape sequences in string
 literals

---
 src/black/linegen.py                    | 11 +++++++++--
 src/black/strings.py                    | 16 +++++++++++++++-
 tests/data/format_unicode_escape_seq.py | 18 ++++++++++++++++++
 3 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 tests/data/format_unicode_escape_seq.py

diff --git a/src/black/linegen.py b/src/black/linegen.py
index 4dc242a1dfe..4b5b160ada8 100644
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -19,8 +19,13 @@
 from black.lines import can_omit_invisible_parens, can_be_split, append_leaves
 from black.comments import generate_comments, list_comments, FMT_OFF
 from black.numerics import normalize_numeric_literal
-from black.strings import get_string_prefix, fix_docstring
-from black.strings import normalize_string_prefix, normalize_string_quotes
+from black.strings import (
+    get_string_prefix,
+    fix_docstring,
+    normalize_string_prefix,
+    normalize_string_quotes,
+    normalize_unicode_escape_sequences,
+)
 from black.trans import Transformer, CannotTransform, StringMerger, StringSplitter
 from black.trans import StringParenWrapper, StringParenStripper, hug_power_op
 from black.mode import Mode, Feature, Preview
@@ -255,6 +260,8 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
         yield from self.visit_default(node)
 
     def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
+        normalize_unicode_escape_sequences(leaf)
+
         if is_docstring(leaf) and "\\\n" not in leaf.value:
             # We're ignoring docstrings with backslash newline escapes because changing
             # indentation of those changes the AST representation of the code.
diff --git a/src/black/strings.py b/src/black/strings.py
index 9d0e2eb8430..e2b7534a3f2 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -5,7 +5,9 @@
 import re
 import sys
 from functools import lru_cache
-from typing import List, Pattern
+from typing import List, Pattern, AnyStr, Match
+
+from blib2to3.pytree import Leaf
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -18,6 +20,7 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
+UNICODE_RE = re.compile(r"(\\+)(u|U|x)([a-zA-Z0-9]+)")
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -236,3 +239,14 @@ def normalize_string_quotes(s: str) -> str:
         return s  # Prefer double quotes
 
     return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
+    """Replace hex codes in Unicode escape sequences with lowercase representation."""
+    text = leaf.value
+
+    def replace(m: Match[AnyStr]) -> AnyStr:
+        groups = m.groups()
+        return groups[0] + groups[1] + groups[2].lower()
+
+    leaf.value = re.sub(UNICODE_RE, replace, text)
diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py
new file mode 100644
index 00000000000..288db53c42f
--- /dev/null
+++ b/tests/data/format_unicode_escape_seq.py
@@ -0,0 +1,18 @@
+x = "\x1B"
+x = "\\\U0001f60e"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xhhhhh"
+x = "\uhhhhh"
+
+
+# Output
+
+x = "\x1b"
+x = "\\\U0001f60e"
+x = "\u0001f60e"
+x = r"\u0001f60e"
+x = "don't format me"
+x = "\xhhhhh"
+x = "\uhhhhh"

From 483fc150faccd7ae56aa8ef8906509447d695ffe Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Mon, 31 Jan 2022 07:27:11 +0530
Subject: [PATCH 02/18] Format \N character name escapes with uppercased
 literals

---
 src/black/strings.py                    | 9 +++++++--
 tests/data/format_unicode_escape_seq.py | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index e2b7534a3f2..00c754c14e4 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -20,7 +20,7 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
-UNICODE_RE = re.compile(r"(\\+)(u|U|x)([a-zA-Z0-9]+)")
+UNICODE_RE = re.compile(r"(\\+)(u|U|x|N)(([a-zA-Z0-9]+)|\{([a-zA-Z0-9]+)\})")
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -247,6 +247,11 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
 
     def replace(m: Match[AnyStr]) -> AnyStr:
         groups = m.groups()
-        return groups[0] + groups[1] + groups[2].lower()
+        if m.group(4):
+            # \\U or \\u or \\x
+            return groups[0] + groups[1] + groups[2].lower()
+        else:
+            # \\N{}
+            return groups[0] + groups[1] + groups[2].upper()
 
     leaf.value = re.sub(UNICODE_RE, replace, text)
diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py
index 288db53c42f..3a8bc20f15d 100644
--- a/tests/data/format_unicode_escape_seq.py
+++ b/tests/data/format_unicode_escape_seq.py
@@ -5,7 +5,7 @@
 x = "don't format me"
 x = "\xhhhhh"
 x = "\uhhhhh"
-
+x = "\N{ox}\N{OX}"
 
 # Output
 
@@ -16,3 +16,4 @@
 x = "don't format me"
 x = "\xhhhhh"
 x = "\uhhhhh"
+x = "\N{OX}\N{OX}"

From cc48d2df1b408824f94f1fc46394f8a7ea9002a5 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Sun, 13 Mar 2022 14:52:44 +0530
Subject: [PATCH 03/18] Fix formatting with correct length for each format

According to the table at https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
---
 src/black/strings.py                    | 30 ++++++++++++++++++++-----
 tests/data/format_unicode_escape_seq.py | 24 +++++++++++---------
 tests/test_format.py                    |  1 +
 3 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 00c754c14e4..1302a40f42e 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -20,7 +20,14 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
-UNICODE_RE = re.compile(r"(\\+)(u|U|x|N)(([a-zA-Z0-9]+)|\{([a-zA-Z0-9]+)\})")
+UNICODE_RE = re.compile(
+    r"(\\+)("
+    r"(u([a-zA-Z0-9]{4}))"
+    r"|(U([a-zA-Z0-9]{0,8}))"
+    r"|(x([a-zA-Z0-9]{2}))"
+    r"|(N\{([a-zA-Z0-9]{2})\})"
+    r")"
+)
 
 
 def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
@@ -244,14 +251,25 @@ def normalize_string_quotes(s: str) -> str:
 def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
     """Replace hex codes in Unicode escape sequences with lowercase representation."""
     text = leaf.value
+    prefix = get_string_prefix(text)
 
     def replace(m: Match[AnyStr]) -> AnyStr:
         groups = m.groups()
-        if m.group(4):
-            # \\U or \\u or \\x
-            return groups[0] + groups[1] + groups[2].lower()
+
+        if len(groups[0]) % 2 == 0 or prefix == "r":
+            return groups[0] + groups[1]
+
+        if groups[2]:
+            # \u
+            return groups[0] + "u" + groups[3].lower()
+        elif groups[4]:
+            # \U
+            return groups[0] + "U" + groups[5].lower()
+        elif groups[6]:
+            # \x
+            return groups[0] + "x" + groups[7].lower()
         else:
-            # \\N{}
-            return groups[0] + groups[1] + groups[2].upper()
+            # \N{}
+            return groups[0] + "N{" + groups[9].upper() + "}"
 
     leaf.value = re.sub(UNICODE_RE, replace, text)
diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py
index 3a8bc20f15d..25a7994f197 100644
--- a/tests/data/format_unicode_escape_seq.py
+++ b/tests/data/format_unicode_escape_seq.py
@@ -1,19 +1,23 @@
-x = "\x1B"
-x = "\\\U0001f60e"
+x = "\x1F"
+x = "\\x1B"
+x = "\\\x1B"
+x = "\U0001F60E"
 x = "\u0001F60E"
 x = r"\u0001F60E"
 x = "don't format me"
-x = "\xhhhhh"
-x = "\uhhhhh"
+x = "\xA3"
+x = "\u2717"
 x = "\N{ox}\N{OX}"
 
 # Output
 
-x = "\x1b"
-x = "\\\U0001f60e"
-x = "\u0001f60e"
-x = r"\u0001f60e"
+x = "\x1f"
+x = "\\x1B"
+x = "\\\x1b"
+x = "\U0001f60e"
+x = "\u0001F60E"
+x = r"\u0001F60E"
 x = "don't format me"
-x = "\xhhhhh"
-x = "\uhhhhh"
+x = "\xa3"
+x = "\u2717"
 x = "\N{OX}\N{OX}"
diff --git a/tests/test_format.py b/tests/test_format.py
index 04eda43d5cf..799c599c9ba 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -44,6 +44,7 @@
     "fmtskip4",
     "fmtskip5",
     "fmtskip6",
+    "format_unicode_escape_seq",
     "fstring",
     "function",
     "function2",

From f1dbc964af5427b2aed1b2536d1aa2fe5a5cf843 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Sun, 13 Mar 2022 14:56:06 +0530
Subject: [PATCH 04/18] Add changelog

---
 CHANGES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index edca0dcdad4..f4c26bb6194 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -12,7 +12,7 @@
 
 ### Preview style
 
-<!-- Changes that affect Black's preview style -->
+- Format hex code in unicode escape sequences in string literals (#2916)
 
 ### _Blackd_
 

From ef442a6d83577348c71317826fc01fff7fd31a35 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Sun, 13 Mar 2022 16:07:49 +0530
Subject: [PATCH 05/18] Move feature to preview styling only

---
 src/black/linegen.py | 4 +++-
 src/black/mode.py    | 1 +
 tests/test_format.py | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/black/linegen.py b/src/black/linegen.py
index 4b5b160ada8..96dea20dea3 100644
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -260,7 +260,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
         yield from self.visit_default(node)
 
     def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
-        normalize_unicode_escape_sequences(leaf)
+        if Preview.hex_codes_in_unicode_sequences in self.mode:
+            # Preview style only
+            normalize_unicode_escape_sequences(leaf)
 
         if is_docstring(leaf) and "\\\n" not in leaf.value:
             # We're ignoring docstrings with backslash newline escapes because changing
diff --git a/src/black/mode.py b/src/black/mode.py
index 455ed36e27e..59715db5d07 100644
--- a/src/black/mode.py
+++ b/src/black/mode.py
@@ -128,6 +128,7 @@ class Preview(Enum):
 
     string_processing = auto()
     hug_simple_powers = auto()
+    hex_codes_in_unicode_sequences = auto()
 
 
 class Deprecated(UserWarning):
diff --git a/tests/test_format.py b/tests/test_format.py
index 799c599c9ba..3fe77eb7e15 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -44,7 +44,6 @@
     "fmtskip4",
     "fmtskip5",
     "fmtskip6",
-    "format_unicode_escape_seq",
     "fstring",
     "function",
     "function2",
@@ -76,6 +75,7 @@
     # string processing
     "cantfit",
     "comments7",
+    "format_unicode_escape_seq",
     "long_strings",
     "long_strings__edge_case",
     "long_strings__regression",

From 2ada012cae1e7a465506c5f725851eb739cc985c Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Sun, 13 Mar 2022 16:23:23 +0530
Subject: [PATCH 06/18] Fix typo

---
 tests/data/format_unicode_escape_seq.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py
index 25a7994f197..2be38d2402d 100644
--- a/tests/data/format_unicode_escape_seq.py
+++ b/tests/data/format_unicode_escape_seq.py
@@ -9,7 +9,7 @@
 x = "\u2717"
 x = "\N{ox}\N{OX}"
 
-# Output
+# output
 
 x = "\x1f"
 x = "\\x1B"

From 125ebec401bcbd10b944271032552365ab558683 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Wed, 16 Mar 2022 06:44:15 +0530
Subject: [PATCH 07/18] Change Match[AnyStr] to Match[str]

---
 src/black/linegen.py |  1 -
 src/black/strings.py | 25 +++++++++++++------------
 test.py              | 10 ++++++++++
 3 files changed, 23 insertions(+), 13 deletions(-)
 create mode 100644 test.py

diff --git a/src/black/linegen.py b/src/black/linegen.py
index 96dea20dea3..9df5122549e 100644
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -261,7 +261,6 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
 
     def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
         if Preview.hex_codes_in_unicode_sequences in self.mode:
-            # Preview style only
             normalize_unicode_escape_sequences(leaf)
 
         if is_docstring(leaf) and "\\\n" not in leaf.value:
diff --git a/src/black/strings.py b/src/black/strings.py
index 1302a40f42e..e578393f457 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -5,7 +5,7 @@
 import re
 import sys
 from functools import lru_cache
-from typing import List, Pattern, AnyStr, Match
+from typing import List, Pattern, Match
 
 from blib2to3.pytree import Leaf
 
@@ -22,10 +22,10 @@
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
 UNICODE_RE = re.compile(
     r"(\\+)("
-    r"(u([a-zA-Z0-9]{4}))"
-    r"|(U([a-zA-Z0-9]{0,8}))"
-    r"|(x([a-zA-Z0-9]{2}))"
-    r"|(N\{([a-zA-Z0-9]{2})\})"
+    r"(u([a-zA-Z0-9]{4}))"  # Formatting 16-bit unicodes i.e. \uxxxx
+    r"|(U([a-zA-Z0-9]{0,8}))"  # Formatting 32-bit unicodes i.e. \Uxxxxxxxx
+    r"|(x([a-zA-Z0-9]{2}))"  # Formatting unicodes in format of \xhh
+    r"|(N\{([a-zA-Z0-9]{2})\})"  # Formatting named unicodes in format of \N{name}
     r")"
 )
 
@@ -253,23 +253,24 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
     text = leaf.value
     prefix = get_string_prefix(text)
 
-    def replace(m: Match[AnyStr]) -> AnyStr:
+    def replace(m: Match[str]) -> str:
         groups = m.groups()
+        back_slashes = groups[0]
 
-        if len(groups[0]) % 2 == 0 or prefix == "r":
-            return groups[0] + groups[1]
+        if len(back_slashes) % 2 == 0 or prefix == "r":
+            return back_slashes + groups[1]
 
         if groups[2]:
             # \u
-            return groups[0] + "u" + groups[3].lower()
+            return back_slashes + "u" + groups[3].lower()
         elif groups[4]:
             # \U
-            return groups[0] + "U" + groups[5].lower()
+            return back_slashes + "U" + groups[5].lower()
         elif groups[6]:
             # \x
-            return groups[0] + "x" + groups[7].lower()
+            return back_slashes + "x" + groups[7].lower()
         else:
             # \N{}
-            return groups[0] + "N{" + groups[9].upper() + "}"
+            return back_slashes + "N{" + groups[9].upper() + "}"
 
     leaf.value = re.sub(UNICODE_RE, replace, text)
diff --git a/test.py b/test.py
new file mode 100644
index 00000000000..7e715cfd8c8
--- /dev/null
+++ b/test.py
@@ -0,0 +1,10 @@
+x = "\x1f"
+x = "\\x1B"
+x = "\\\x1b"
+x = "\U0001f60e"
+x = "\u0001F60E"
+x = r"\u0001F60E"
+x = "don't format me"
+x = "\xa3"
+x = "\u2717"
+x = "\N{OX}\N{OX}"

From af86102e2507b8fb35c158fa9255606567353045 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Wed, 16 Mar 2022 09:27:18 +0530
Subject: [PATCH 08/18] Make UNICODE_RE Final and accept multiline strings

---
 src/black/strings.py |  5 +++--
 test.py              | 10 ----------
 2 files changed, 3 insertions(+), 12 deletions(-)
 delete mode 100644 test.py

diff --git a/src/black/strings.py b/src/black/strings.py
index e578393f457..93c54b69f1b 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -20,13 +20,14 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
-UNICODE_RE = re.compile(
+UNICODE_RE: Final = re.compile(
     r"(\\+)("
     r"(u([a-zA-Z0-9]{4}))"  # Formatting 16-bit unicodes i.e. \uxxxx
     r"|(U([a-zA-Z0-9]{0,8}))"  # Formatting 32-bit unicodes i.e. \Uxxxxxxxx
     r"|(x([a-zA-Z0-9]{2}))"  # Formatting unicodes in format of \xhh
     r"|(N\{([a-zA-Z0-9]{2})\})"  # Formatting named unicodes in format of \N{name}
-    r")"
+    r")",
+    re.MULTILINE,
 )
 
 
diff --git a/test.py b/test.py
deleted file mode 100644
index 7e715cfd8c8..00000000000
--- a/test.py
+++ /dev/null
@@ -1,10 +0,0 @@
-x = "\x1f"
-x = "\\x1B"
-x = "\\\x1b"
-x = "\U0001f60e"
-x = "\u0001F60E"
-x = r"\u0001F60E"
-x = "don't format me"
-x = "\xa3"
-x = "\u2717"
-x = "\N{OX}\N{OX}"

From 69c9664f1cd0d1c6a4722d88fbba308e8655a34a Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Wed, 16 Mar 2022 09:28:13 +0530
Subject: [PATCH 09/18] Reword regex comments to use 'character'

---
 src/black/strings.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 93c54b69f1b..4cfca8173d3 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -22,10 +22,10 @@
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
 UNICODE_RE: Final = re.compile(
     r"(\\+)("
-    r"(u([a-zA-Z0-9]{4}))"  # Formatting 16-bit unicodes i.e. \uxxxx
-    r"|(U([a-zA-Z0-9]{0,8}))"  # Formatting 32-bit unicodes i.e. \Uxxxxxxxx
-    r"|(x([a-zA-Z0-9]{2}))"  # Formatting unicodes in format of \xhh
-    r"|(N\{([a-zA-Z0-9]{2})\})"  # Formatting named unicodes in format of \N{name}
+    r"(u([a-zA-Z0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U([a-zA-Z0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x([a-zA-Z0-9]{2}))"  # Character with hex value hh
+    r"|(N\{([a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
     r")",
     re.MULTILINE,
 )

From 52bd904d44a9832d83042d3fbfb1c69b1e9c5b67 Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Wed, 16 Mar 2022 10:04:27 +0530
Subject: [PATCH 10/18] Use re.VERBOSE instead of re.MULTILINE

---
 src/black/strings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 4cfca8173d3..6d0b6add08a 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -27,7 +27,7 @@
     r"|(x([a-zA-Z0-9]{2}))"  # Character with hex value hh
     r"|(N\{([a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
     r")",
-    re.MULTILINE,
+    re.VERBOSE,
 )
 
 

From 221995eb18b40ba9df2735b5c85272d19813075a Mon Sep 17 00:00:00 2001
From: Shivansh-007 <shivansh-007@outlook.com>
Date: Thu, 24 Mar 2022 15:59:18 +0530
Subject: [PATCH 11/18] Update CHANGES.md

Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
---
 CHANGES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 853ba3fc86f..b9fc6e5cb6e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -12,6 +12,8 @@
 
 ### Preview style
 
+<!-- Changes that affect Black's preview style -->
+
 - Format hex code in unicode escape sequences in string literals (#2916)
 
 - Code cell separators `#%%` are now standardised to `# %%` (#2919)

From 77a48e67e382f7bdba7fb50e0b2410805d23058c Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 07:59:01 -0800
Subject: [PATCH 12/18] CR improvements

Also, use named groups
---
 src/black/strings.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 6d0b6add08a..77cbbdcf259 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -20,12 +20,12 @@
     r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
 )
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
-UNICODE_RE: Final = re.compile(
-    r"(\\+)("
-    r"(u([a-zA-Z0-9]{4}))"  # Character with 16-bit hex value xxxx
-    r"|(U([a-zA-Z0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
-    r"|(x([a-zA-Z0-9]{2}))"  # Character with hex value hh
-    r"|(N\{([a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
+UNICODE_ESCAPE_RE: Final = re.compile(
+    r"(?P<backslashes>\\+)(?P<body>"
+    r"(?P<u>u([a-zA-Z0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(?P<U>U([a-zA-Z0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(?P<x>x([a-zA-Z0-9]{2}))"  # Character with hex value hh
+    r"|(?P<N>N\{([a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
     r")",
     re.VERBOSE,
 )
@@ -253,25 +253,27 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
     """Replace hex codes in Unicode escape sequences with lowercase representation."""
     text = leaf.value
     prefix = get_string_prefix(text)
+    if "r" in prefix.lower():
+        return
 
     def replace(m: Match[str]) -> str:
         groups = m.groups()
-        back_slashes = groups[0]
+        back_slashes = groups["backslashes"]
 
-        if len(back_slashes) % 2 == 0 or prefix == "r":
-            return back_slashes + groups[1]
+        if len(back_slashes) % 2 == 0:
+            return back_slashes + groups["body"]
 
-        if groups[2]:
+        if groups["u"]:
             # \u
-            return back_slashes + "u" + groups[3].lower()
-        elif groups[4]:
+            return back_slashes + "u" + groups["u"].lower()
+        elif groups["U"]:
             # \U
-            return back_slashes + "U" + groups[5].lower()
-        elif groups[6]:
+            return back_slashes + "U" + groups["U"].lower()
+        elif groups["x"]:
             # \x
-            return back_slashes + "x" + groups[7].lower()
-        else:
+            return back_slashes + "x" + groups["x"].lower()
+        elif groups["N"]:
             # \N{}
-            return back_slashes + "N{" + groups[9].upper() + "}"
+            return back_slashes + "N{" + groups["N"].upper() + "}"
 
     leaf.value = re.sub(UNICODE_RE, replace, text)

From 1b9d5fda4eb7cfef83d8242becabb063583b592e Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:00:20 -0800
Subject: [PATCH 13/18] fix lint

---
 src/black/strings.py | 2 +-
 tests/test_format.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 77cbbdcf259..e5cc736480b 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -5,7 +5,7 @@
 import re
 import sys
 from functools import lru_cache
-from typing import List, Pattern, Match
+from typing import List, Match, Pattern
 
 from blib2to3.pytree import Leaf
 
diff --git a/tests/test_format.py b/tests/test_format.py
index ae9740727ff..01cd61eef63 100644
--- a/tests/test_format.py
+++ b/tests/test_format.py
@@ -14,6 +14,7 @@
     read_data,
 )
 
+
 @pytest.fixture(autouse=True)
 def patch_dump_to_file(request: Any) -> Iterator[None]:
     with patch("black.dump_to_file", dump_to_stderr):

From 3c24427b6c54d7ab8a884772ccb90720e55355f2 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:09:07 -0800
Subject: [PATCH 14/18] fix my sloppy code

---
 src/black/strings.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index e5cc736480b..16e17852aef 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -22,10 +22,10 @@
 FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
 UNICODE_ESCAPE_RE: Final = re.compile(
     r"(?P<backslashes>\\+)(?P<body>"
-    r"(?P<u>u([a-zA-Z0-9]{4}))"  # Character with 16-bit hex value xxxx
-    r"|(?P<U>U([a-zA-Z0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
-    r"|(?P<x>x([a-zA-Z0-9]{2}))"  # Character with hex value hh
-    r"|(?P<N>N\{([a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
+    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
+    r"|(U(?P<U>[a-fA-F0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
+    r"|(N\{(?P<N>[a-fA-F0-9]{2})\})"  # Character named name in the Unicode database
     r")",
     re.VERBOSE,
 )
@@ -257,7 +257,7 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
         return
 
     def replace(m: Match[str]) -> str:
-        groups = m.groups()
+        groups = m.groupdict()
         back_slashes = groups["backslashes"]
 
         if len(back_slashes) % 2 == 0:
@@ -272,8 +272,9 @@ def replace(m: Match[str]) -> str:
         elif groups["x"]:
             # \x
             return back_slashes + "x" + groups["x"].lower()
-        elif groups["N"]:
+        else:
+            assert groups["N"], f"Unexpected match: {m}"
             # \N{}
             return back_slashes + "N{" + groups["N"].upper() + "}"
 
-    leaf.value = re.sub(UNICODE_RE, replace, text)
+    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)

From 9f35b61bdff21671576eafc952b8cfd76c8a9e9e Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:13:08 -0800
Subject: [PATCH 15/18] fix the new test; \U requires exactly 8 digits

---
 src/black/strings.py                                  | 4 ++--
 tests/data/{ => preview}/format_unicode_escape_seq.py | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename tests/data/{ => preview}/format_unicode_escape_seq.py (100%)

diff --git a/src/black/strings.py b/src/black/strings.py
index 16e17852aef..52f8b8e786e 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -23,9 +23,9 @@
 UNICODE_ESCAPE_RE: Final = re.compile(
     r"(?P<backslashes>\\+)(?P<body>"
     r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
-    r"|(U(?P<U>[a-fA-F0-9]{0,8}))"  # Character with 32-bit hex value xxxxxxxx
+    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
     r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
-    r"|(N\{(?P<N>[a-fA-F0-9]{2})\})"  # Character named name in the Unicode database
+    r"|(N\{(?P<N>[a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
     r")",
     re.VERBOSE,
 )
diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py
similarity index 100%
rename from tests/data/format_unicode_escape_seq.py
rename to tests/data/preview/format_unicode_escape_seq.py

From 27d2d865fef6a6c9325f2a774717eb7b507e8313 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:21:51 -0800
Subject: [PATCH 16/18] fix \N escapes

---
 src/black/strings.py                            | 2 +-
 tests/data/preview/format_unicode_escape_seq.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/black/strings.py b/src/black/strings.py
index 52f8b8e786e..3e3bc12fe72 100644
--- a/src/black/strings.py
+++ b/src/black/strings.py
@@ -25,7 +25,7 @@
     r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
     r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
     r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
-    r"|(N\{(?P<N>[a-zA-Z0-9]{2})\})"  # Character named name in the Unicode database
+    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
     r")",
     re.VERBOSE,
 )
diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py
index 2be38d2402d..3e0d8c132a8 100644
--- a/tests/data/preview/format_unicode_escape_seq.py
+++ b/tests/data/preview/format_unicode_escape_seq.py
@@ -8,6 +8,8 @@
 x = "\xA3"
 x = "\u2717"
 x = "\N{ox}\N{OX}"
+x = "\N{lAtIn smaLL letteR x}"
+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
 
 # output
 
@@ -21,3 +23,5 @@
 x = "\xa3"
 x = "\u2717"
 x = "\N{OX}\N{OX}"
+x = "\N{LATIN SMALL LETTER X}"
+x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"

From 420a8f9ee1c61797b3dee7b0d92f1df286bf7039 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:27:08 -0800
Subject: [PATCH 17/18] add a test

---
 tests/data/preview/format_unicode_escape_seq.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py
index 3e0d8c132a8..be5651f981c 100644
--- a/tests/data/preview/format_unicode_escape_seq.py
+++ b/tests/data/preview/format_unicode_escape_seq.py
@@ -7,6 +7,7 @@
 x = "don't format me"
 x = "\xA3"
 x = "\u2717"
+x = "\uFaCe"
 x = "\N{ox}\N{OX}"
 x = "\N{lAtIn smaLL letteR x}"
 x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
@@ -22,6 +23,7 @@
 x = "don't format me"
 x = "\xa3"
 x = "\u2717"
+x = "\uface"
 x = "\N{OX}\N{OX}"
 x = "\N{LATIN SMALL LETTER X}"
 x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"

From 296cdb93a22efc3ac8f3bbc931fb522441249ee5 Mon Sep 17 00:00:00 2001
From: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Date: Sun, 18 Dec 2022 08:34:57 -0800
Subject: [PATCH 18/18] bytes tests

---
 tests/data/preview/format_unicode_escape_seq.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py
index be5651f981c..3440696c303 100644
--- a/tests/data/preview/format_unicode_escape_seq.py
+++ b/tests/data/preview/format_unicode_escape_seq.py
@@ -11,6 +11,8 @@
 x = "\N{ox}\N{OX}"
 x = "\N{lAtIn smaLL letteR x}"
 x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1Fdon't byte"
+x = rb"\x1Fdon't format"
 
 # output
 
@@ -27,3 +29,5 @@
 x = "\N{OX}\N{OX}"
 x = "\N{LATIN SMALL LETTER X}"
 x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
+x = b"\x1fdon't byte"
+x = rb"\x1Fdon't format"