From add30b8a28403b09927bdcf7dec64633fef7826a Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 30 Jan 2022 12:43:19 +0530 Subject: [PATCH 01/18] Format hex code in unicode escape sequences in string literals --- src/black/linegen.py | 11 +++++++++-- src/black/strings.py | 16 +++++++++++++++- tests/data/format_unicode_escape_seq.py | 18 ++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 tests/data/format_unicode_escape_seq.py diff --git a/src/black/linegen.py b/src/black/linegen.py index 4dc242a1dfe..4b5b160ada8 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -19,8 +19,13 @@ from black.lines import can_omit_invisible_parens, can_be_split, append_leaves from black.comments import generate_comments, list_comments, FMT_OFF from black.numerics import normalize_numeric_literal -from black.strings import get_string_prefix, fix_docstring -from black.strings import normalize_string_prefix, normalize_string_quotes +from black.strings import ( + get_string_prefix, + fix_docstring, + normalize_string_prefix, + normalize_string_quotes, + normalize_unicode_escape_sequences, +) from black.trans import Transformer, CannotTransform, StringMerger, StringSplitter from black.trans import StringParenWrapper, StringParenStripper, hug_power_op from black.mode import Mode, Feature, Preview @@ -255,6 +260,8 @@ def visit_factor(self, node: Node) -> Iterator[Line]: yield from self.visit_default(node) def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: + normalize_unicode_escape_sequences(leaf) + if is_docstring(leaf) and "\\\n" not in leaf.value: # We're ignoring docstrings with backslash newline escapes because changing # indentation of those changes the AST representation of the code. 
diff --git a/src/black/strings.py b/src/black/strings.py index 9d0e2eb8430..e2b7534a3f2 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -5,7 +5,9 @@ import re import sys from functools import lru_cache -from typing import List, Pattern +from typing import List, Pattern, AnyStr, Match + +from blib2to3.pytree import Leaf if sys.version_info < (3, 8): from typing_extensions import Final @@ -18,6 +20,7 @@ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") +UNICODE_RE = re.compile(r"(\\+)(u|U|x)([a-zA-Z0-9]+)") def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -236,3 +239,14 @@ def normalize_string_quotes(s: str) -> str: return s # Prefer double quotes return f"{prefix}{new_quote}{new_body}{new_quote}" + + +def normalize_unicode_escape_sequences(leaf: Leaf) -> None: + """Replace hex codes in Unicode escape sequences with lowercase representation.""" + text = leaf.value + + def replace(m: Match[AnyStr]) -> AnyStr: + groups = m.groups() + return groups[0] + groups[1] + groups[2].lower() + + leaf.value = re.sub(UNICODE_RE, replace, text) diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py new file mode 100644 index 00000000000..288db53c42f --- /dev/null +++ b/tests/data/format_unicode_escape_seq.py @@ -0,0 +1,18 @@ +x = "\x1B" +x = "\\\U0001f60e" +x = "\u0001F60E" +x = r"\u0001F60E" +x = "don't format me" +x = "\xhhhhh" +x = "\uhhhhh" + + +# Output + +x = "\x1b" +x = "\\\U0001f60e" +x = "\u0001f60e" +x = r"\u0001f60e" +x = "don't format me" +x = "\xhhhhh" +x = "\uhhhhh" From 483fc150faccd7ae56aa8ef8906509447d695ffe Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Mon, 31 Jan 2022 07:27:11 +0530 Subject: [PATCH 02/18] Format \N character name escapes with uppercased literals --- src/black/strings.py | 9 +++++++-- tests/data/format_unicode_escape_seq.py | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/src/black/strings.py b/src/black/strings.py index e2b7534a3f2..00c754c14e4 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -20,7 +20,7 @@ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") -UNICODE_RE = re.compile(r"(\\+)(u|U|x)([a-zA-Z0-9]+)") +UNICODE_RE = re.compile(r"(\\+)(u|U|x|N)(([a-zA-Z0-9]+)|\{([a-zA-Z0-9]+)\})") def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -247,6 +247,11 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None: def replace(m: Match[AnyStr]) -> AnyStr: groups = m.groups() - return groups[0] + groups[1] + groups[2].lower() + if m.group(4): + # \\U or \\u or \\x + return groups[0] + groups[1] + groups[2].lower() + else: + # \\N{} + return groups[0] + groups[1] + groups[2].upper() leaf.value = re.sub(UNICODE_RE, replace, text) diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py index 288db53c42f..3a8bc20f15d 100644 --- a/tests/data/format_unicode_escape_seq.py +++ b/tests/data/format_unicode_escape_seq.py @@ -5,7 +5,7 @@ x = "don't format me" x = "\xhhhhh" x = "\uhhhhh" - +x = "\N{ox}\N{OX}" # Output @@ -16,3 +16,4 @@ x = "don't format me" x = "\xhhhhh" x = "\uhhhhh" +x = "\N{OX}\N{OX}" From cc48d2df1b408824f94f1fc46394f8a7ea9002a5 Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 13 Mar 2022 14:52:44 +0530 Subject: [PATCH 03/18] Fix formatting with correct length for each format According to the table at https://docs.python.org/3/reference/lexical_analysis.html\#string-and-bytes-literals --- src/black/strings.py | 30 ++++++++++++++++++++----- tests/data/format_unicode_escape_seq.py | 24 +++++++++++--------- tests/test_format.py | 1 + 3 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/black/strings.py b/src/black/strings.py index 00c754c14e4..1302a40f42e 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -20,7 +20,14 @@ r"^([" + 
STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") -UNICODE_RE = re.compile(r"(\\+)(u|U|x|N)(([a-zA-Z0-9]+)|\{([a-zA-Z0-9]+)\})") +UNICODE_RE = re.compile( + r"(\\+)(" + r"(u([a-zA-Z0-9]{4}))" + r"|(U([a-zA-Z0-9]{0,8}))" + r"|(x([a-zA-Z0-9]{2}))" + r"|(N\{([a-zA-Z0-9]{2})\})" + r")" +) def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -244,14 +251,25 @@ def normalize_string_quotes(s: str) -> str: def normalize_unicode_escape_sequences(leaf: Leaf) -> None: """Replace hex codes in Unicode escape sequences with lowercase representation.""" text = leaf.value + prefix = get_string_prefix(text) def replace(m: Match[AnyStr]) -> AnyStr: groups = m.groups() - if m.group(4): - # \\U or \\u or \\x - return groups[0] + groups[1] + groups[2].lower() + + if len(groups[0]) % 2 == 0 or prefix == "r": + return groups[0] + groups[1] + + if groups[2]: + # \u + return groups[0] + "u" + groups[3].lower() + elif groups[4]: + # \U + return groups[0] + "U" + groups[5].lower() + elif groups[6]: + # \x + return groups[0] + "x" + groups[7].lower() else: - # \\N{} - return groups[0] + groups[1] + groups[2].upper() + # \N{} + return groups[0] + "N{" + groups[9].upper() + "}" leaf.value = re.sub(UNICODE_RE, replace, text) diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py index 3a8bc20f15d..25a7994f197 100644 --- a/tests/data/format_unicode_escape_seq.py +++ b/tests/data/format_unicode_escape_seq.py @@ -1,19 +1,23 @@ -x = "\x1B" -x = "\\\U0001f60e" +x = "\x1F" +x = "\\x1B" +x = "\\\x1B" +x = "\U0001F60E" x = "\u0001F60E" x = r"\u0001F60E" x = "don't format me" -x = "\xhhhhh" -x = "\uhhhhh" +x = "\xA3" +x = "\u2717" x = "\N{ox}\N{OX}" # Output -x = "\x1b" -x = "\\\U0001f60e" -x = "\u0001f60e" -x = r"\u0001f60e" +x = "\x1f" +x = "\\x1B" +x = "\\\x1b" +x = "\U0001f60e" +x = "\u0001F60E" +x = r"\u0001F60E" x = "don't format me" -x = "\xhhhhh" -x = "\uhhhhh" +x = "\xa3" 
+x = "\u2717" x = "\N{OX}\N{OX}" diff --git a/tests/test_format.py b/tests/test_format.py index 04eda43d5cf..799c599c9ba 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -44,6 +44,7 @@ "fmtskip4", "fmtskip5", "fmtskip6", + "format_unicode_escape_seq", "fstring", "function", "function2", From f1dbc964af5427b2aed1b2536d1aa2fe5a5cf843 Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 13 Mar 2022 14:56:06 +0530 Subject: [PATCH 04/18] Add changelog --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index edca0dcdad4..f4c26bb6194 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,7 +12,7 @@ ### Preview style - +- Format hex code in unicode escape sequences in string literals (#2916) ### _Blackd_ From ef442a6d83577348c71317826fc01fff7fd31a35 Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 13 Mar 2022 16:07:49 +0530 Subject: [PATCH 05/18] Move feature to preview styling only --- src/black/linegen.py | 4 +++- src/black/mode.py | 1 + tests/test_format.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/black/linegen.py b/src/black/linegen.py index 4b5b160ada8..96dea20dea3 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -260,7 +260,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]: yield from self.visit_default(node) def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: - normalize_unicode_escape_sequences(leaf) + if Preview.hex_codes_in_unicode_sequences in self.mode: + # Preview style only + normalize_unicode_escape_sequences(leaf) if is_docstring(leaf) and "\\\n" not in leaf.value: # We're ignoring docstrings with backslash newline escapes because changing diff --git a/src/black/mode.py b/src/black/mode.py index 455ed36e27e..59715db5d07 100644 --- a/src/black/mode.py +++ b/src/black/mode.py @@ -128,6 +128,7 @@ class Preview(Enum): string_processing = auto() hug_simple_powers = auto() + hex_codes_in_unicode_sequences = auto() class 
Deprecated(UserWarning): diff --git a/tests/test_format.py b/tests/test_format.py index 799c599c9ba..3fe77eb7e15 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -44,7 +44,6 @@ "fmtskip4", "fmtskip5", "fmtskip6", - "format_unicode_escape_seq", "fstring", "function", "function2", @@ -76,6 +75,7 @@ # string processing "cantfit", "comments7", + "format_unicode_escape_seq", "long_strings", "long_strings__edge_case", "long_strings__regression", From 2ada012cae1e7a465506c5f725851eb739cc985c Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Sun, 13 Mar 2022 16:23:23 +0530 Subject: [PATCH 06/18] Fix typo --- tests/data/format_unicode_escape_seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/format_unicode_escape_seq.py index 25a7994f197..2be38d2402d 100644 --- a/tests/data/format_unicode_escape_seq.py +++ b/tests/data/format_unicode_escape_seq.py @@ -9,7 +9,7 @@ x = "\u2717" x = "\N{ox}\N{OX}" -# Output +# output x = "\x1f" x = "\\x1B" From 125ebec401bcbd10b944271032552365ab558683 Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Wed, 16 Mar 2022 06:44:15 +0530 Subject: [PATCH 07/18] Change Match[AnyStr] to Match[str] --- src/black/linegen.py | 1 - src/black/strings.py | 25 +++++++++++++------------ test.py | 10 ++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) create mode 100644 test.py diff --git a/src/black/linegen.py b/src/black/linegen.py index 96dea20dea3..9df5122549e 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -261,7 +261,6 @@ def visit_factor(self, node: Node) -> Iterator[Line]: def visit_STRING(self, leaf: Leaf) -> Iterator[Line]: if Preview.hex_codes_in_unicode_sequences in self.mode: - # Preview style only normalize_unicode_escape_sequences(leaf) if is_docstring(leaf) and "\\\n" not in leaf.value: diff --git a/src/black/strings.py b/src/black/strings.py index 1302a40f42e..e578393f457 100644 --- a/src/black/strings.py +++ 
b/src/black/strings.py @@ -5,7 +5,7 @@ import re import sys from functools import lru_cache -from typing import List, Pattern, AnyStr, Match +from typing import List, Pattern, Match from blib2to3.pytree import Leaf @@ -22,10 +22,10 @@ FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") UNICODE_RE = re.compile( r"(\\+)(" - r"(u([a-zA-Z0-9]{4}))" - r"|(U([a-zA-Z0-9]{0,8}))" - r"|(x([a-zA-Z0-9]{2}))" - r"|(N\{([a-zA-Z0-9]{2})\})" + r"(u([a-zA-Z0-9]{4}))" # Formatting 16-bit unicodes i.e. \uxxxx + r"|(U([a-zA-Z0-9]{0,8}))" # Formatting 32-bit unicodes i.e. \Uxxxxxxxx + r"|(x([a-zA-Z0-9]{2}))" # Formatting unicodes in format of \xhh + r"|(N\{([a-zA-Z0-9]{2})\})" # Formatting named unicodes in format of \N{name} r")" ) @@ -253,23 +253,24 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None: text = leaf.value prefix = get_string_prefix(text) - def replace(m: Match[AnyStr]) -> AnyStr: + def replace(m: Match[str]) -> str: groups = m.groups() + back_slashes = groups[0] - if len(groups[0]) % 2 == 0 or prefix == "r": - return groups[0] + groups[1] + if len(back_slashes) % 2 == 0 or prefix == "r": + return back_slashes + groups[1] if groups[2]: # \u - return groups[0] + "u" + groups[3].lower() + return back_slashes + "u" + groups[3].lower() elif groups[4]: # \U - return groups[0] + "U" + groups[5].lower() + return back_slashes + "U" + groups[5].lower() elif groups[6]: # \x - return groups[0] + "x" + groups[7].lower() + return back_slashes + "x" + groups[7].lower() else: # \N{} - return groups[0] + "N{" + groups[9].upper() + "}" + return back_slashes + "N{" + groups[9].upper() + "}" leaf.value = re.sub(UNICODE_RE, replace, text) diff --git a/test.py b/test.py new file mode 100644 index 00000000000..7e715cfd8c8 --- /dev/null +++ b/test.py @@ -0,0 +1,10 @@ +x = "\x1f" +x = "\\x1B" +x = "\\\x1b" +x = "\U0001f60e" +x = "\u0001F60E" +x = r"\u0001F60E" +x = "don't format me" +x = "\xa3" +x = "\u2717" +x = "\N{OX}\N{OX}" From af86102e2507b8fb35c158fa9255606567353045 
Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Wed, 16 Mar 2022 09:27:18 +0530 Subject: [PATCH 08/18] Make UNICODE_RE Final and accept multiline strings --- src/black/strings.py | 5 +++-- test.py | 10 ---------- 2 files changed, 3 insertions(+), 12 deletions(-) delete mode 100644 test.py diff --git a/src/black/strings.py b/src/black/strings.py index e578393f457..93c54b69f1b 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -20,13 +20,14 @@ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") -UNICODE_RE = re.compile( +UNICODE_RE: Final = re.compile( r"(\\+)(" r"(u([a-zA-Z0-9]{4}))" # Formatting 16-bit unicodes i.e. \uxxxx r"|(U([a-zA-Z0-9]{0,8}))" # Formatting 32-bit unicodes i.e. \Uxxxxxxxx r"|(x([a-zA-Z0-9]{2}))" # Formatting unicodes in format of \xhh r"|(N\{([a-zA-Z0-9]{2})\})" # Formatting named unicodes in format of \N{name} - r")" + r")", + re.MULTILINE, ) diff --git a/test.py b/test.py deleted file mode 100644 index 7e715cfd8c8..00000000000 --- a/test.py +++ /dev/null @@ -1,10 +0,0 @@ -x = "\x1f" -x = "\\x1B" -x = "\\\x1b" -x = "\U0001f60e" -x = "\u0001F60E" -x = r"\u0001F60E" -x = "don't format me" -x = "\xa3" -x = "\u2717" -x = "\N{OX}\N{OX}" From 69c9664f1cd0d1c6a4722d88fbba308e8655a34a Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Wed, 16 Mar 2022 09:28:13 +0530 Subject: [PATCH 09/18] Reword regex comments to use 'character' --- src/black/strings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/black/strings.py b/src/black/strings.py index 93c54b69f1b..4cfca8173d3 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -22,10 +22,10 @@ FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") UNICODE_RE: Final = re.compile( r"(\\+)(" - r"(u([a-zA-Z0-9]{4}))" # Formatting 16-bit unicodes i.e. \uxxxx - r"|(U([a-zA-Z0-9]{0,8}))" # Formatting 32-bit unicodes i.e. 
\Uxxxxxxxx - r"|(x([a-zA-Z0-9]{2}))" # Formatting unicodes in format of \xhh - r"|(N\{([a-zA-Z0-9]{2})\})" # Formatting named unicodes in format of \N{name} + r"(u([a-zA-Z0-9]{4}))" # Character with 16-bit hex value xxxx + r"|(U([a-zA-Z0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx + r"|(x([a-zA-Z0-9]{2}))" # Character with hex value hh + r"|(N\{([a-zA-Z0-9]{2})\})" # Character named name in the Unicode database r")", re.MULTILINE, ) From 52bd904d44a9832d83042d3fbfb1c69b1e9c5b67 Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Wed, 16 Mar 2022 10:04:27 +0530 Subject: [PATCH 10/18] ITS RE.VERBOSE NOT RE.MULTILINE?! --- src/black/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/black/strings.py b/src/black/strings.py index 4cfca8173d3..6d0b6add08a 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -27,7 +27,7 @@ r"|(x([a-zA-Z0-9]{2}))" # Character with hex value hh r"|(N\{([a-zA-Z0-9]{2})\})" # Character named name in the Unicode database r")", - re.MULTILINE, + re.VERBOSE, ) From 221995eb18b40ba9df2735b5c85272d19813075a Mon Sep 17 00:00:00 2001 From: Shivansh-007 Date: Thu, 24 Mar 2022 15:59:18 +0530 Subject: [PATCH 11/18] Update CHANGES.md Co-authored-by: Jelle Zijlstra --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 853ba3fc86f..b9fc6e5cb6e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -12,6 +12,8 @@ ### Preview style + + - Format hex code in unicode escape sequences in string literals (#2916) - Code cell separators `#%%` are now standardised to `# %%` (#2919) From 77a48e67e382f7bdba7fb50e0b2410805d23058c Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 07:59:01 -0800 Subject: [PATCH 12/18] CR improvements Also, use named groups --- src/black/strings.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/black/strings.py b/src/black/strings.py index 6d0b6add08a..77cbbdcf259 100644 --- 
a/src/black/strings.py +++ b/src/black/strings.py @@ -20,12 +20,12 @@ r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") -UNICODE_RE: Final = re.compile( - r"(\\+)(" - r"(u([a-zA-Z0-9]{4}))" # Character with 16-bit hex value xxxx - r"|(U([a-zA-Z0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx - r"|(x([a-zA-Z0-9]{2}))" # Character with hex value hh - r"|(N\{([a-zA-Z0-9]{2})\})" # Character named name in the Unicode database +UNICODE_ESCAPE_RE: Final = re.compile( + r"(?P<backslashes>\\+)(?P<body>" + r"(?P<u>u([a-zA-Z0-9]{4}))" # Character with 16-bit hex value xxxx + r"|(?P<U>U([a-zA-Z0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx + r"|(?P<x>x([a-zA-Z0-9]{2}))" # Character with hex value hh + r"|(?P<N>N\{([a-zA-Z0-9]{2})\})" # Character named name in the Unicode database r")", re.VERBOSE, ) @@ -253,25 +253,27 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None: """Replace hex codes in Unicode escape sequences with lowercase representation.""" text = leaf.value prefix = get_string_prefix(text) + if "r" in prefix.lower(): + return def replace(m: Match[str]) -> str: groups = m.groups() - back_slashes = groups[0] + back_slashes = groups["backslashes"] - if len(back_slashes) % 2 == 0 or prefix == "r": - return back_slashes + groups[1] + if len(back_slashes) % 2 == 0: + return back_slashes + groups["body"] - if groups[2]: + if groups["u"]: # \u - return back_slashes + "u" + groups[3].lower() - elif groups[4]: + return back_slashes + "u" + groups["u"].lower() + elif groups["U"]: # \U - return back_slashes + "U" + groups[5].lower() - elif groups[6]: + return back_slashes + "U" + groups["U"].lower() + elif groups["x"]: # \x - return back_slashes + "x" + groups[7].lower() - else: + return back_slashes + "x" + groups["x"].lower() + elif groups["N"]: # \N{} - return back_slashes + "N{" + groups[9].upper() + "}" + return back_slashes + "N{" + groups["N"].upper() + "}" leaf.value = re.sub(UNICODE_RE, replace, text) 
From 1b9d5fda4eb7cfef83d8242becabb063583b592e Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:00:20 -0800 Subject: [PATCH 13/18] fix lint --- src/black/strings.py | 2 +- tests/test_format.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/black/strings.py b/src/black/strings.py index 77cbbdcf259..e5cc736480b 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -5,7 +5,7 @@ import re import sys from functools import lru_cache -from typing import List, Pattern, Match +from typing import List, Match, Pattern from blib2to3.pytree import Leaf diff --git a/tests/test_format.py b/tests/test_format.py index ae9740727ff..01cd61eef63 100644 --- a/tests/test_format.py +++ b/tests/test_format.py @@ -14,6 +14,7 @@ read_data, ) + @pytest.fixture(autouse=True) def patch_dump_to_file(request: Any) -> Iterator[None]: with patch("black.dump_to_file", dump_to_stderr): From 3c24427b6c54d7ab8a884772ccb90720e55355f2 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:09:07 -0800 Subject: [PATCH 14/18] fix my sloppy code --- src/black/strings.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/black/strings.py b/src/black/strings.py index e5cc736480b..16e17852aef 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -22,10 +22,10 @@ FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") UNICODE_ESCAPE_RE: Final = re.compile( r"(?P<backslashes>\\+)(?P<body>" - r"(?P<u>u([a-zA-Z0-9]{4}))" # Character with 16-bit hex value xxxx - r"|(?P<U>U([a-zA-Z0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx - r"|(?P<x>x([a-zA-Z0-9]{2}))" # Character with hex value hh - r"|(?P<N>N\{([a-zA-Z0-9]{2})\})" # Character named name in the Unicode database + r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx + r"|(U(?P<U>[a-fA-F0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx + r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh + r"|(N\{(?P<N>[a-fA-F0-9]{2})\})" # Character named name 
in the Unicode database r")", re.VERBOSE, ) @@ -257,7 +257,7 @@ def normalize_unicode_escape_sequences(leaf: Leaf) -> None: return def replace(m: Match[str]) -> str: - groups = m.groups() + groups = m.groupdict() back_slashes = groups["backslashes"] if len(back_slashes) % 2 == 0: @@ -272,8 +272,9 @@ def replace(m: Match[str]) -> str: elif groups["x"]: # \x return back_slashes + "x" + groups["x"].lower() - elif groups["N"]: + else: + assert groups["N"], f"Unexpected match: {m}" # \N{} return back_slashes + "N{" + groups["N"].upper() + "}" - leaf.value = re.sub(UNICODE_RE, replace, text) + leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) From 9f35b61bdff21671576eafc952b8cfd76c8a9e9e Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:13:08 -0800 Subject: [PATCH 15/18] fix the new test; \U requires exactly 8 digits --- src/black/strings.py | 4 ++-- tests/data/{ => preview}/format_unicode_escape_seq.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename tests/data/{ => preview}/format_unicode_escape_seq.py (100%) diff --git a/src/black/strings.py b/src/black/strings.py index 16e17852aef..52f8b8e786e 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -23,9 +23,9 @@ UNICODE_ESCAPE_RE: Final = re.compile( r"(?P<backslashes>\\+)(?P<body>" r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx - r"|(U(?P<U>[a-fA-F0-9]{0,8}))" # Character with 32-bit hex value xxxxxxxx + r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh - r"|(N\{(?P<N>[a-fA-F0-9]{2})\})" # Character named name in the Unicode database + r"|(N\{(?P<N>[a-zA-Z0-9]{2})\})" # Character named name in the Unicode database r")", re.VERBOSE, ) diff --git a/tests/data/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py similarity index 100% rename from tests/data/format_unicode_escape_seq.py rename to tests/data/preview/format_unicode_escape_seq.py From 
27d2d865fef6a6c9325f2a774717eb7b507e8313 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:21:51 -0800 Subject: [PATCH 16/18] fix \N escapes --- src/black/strings.py | 2 +- tests/data/preview/format_unicode_escape_seq.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/black/strings.py b/src/black/strings.py index 52f8b8e786e..3e3bc12fe72 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -25,7 +25,7 @@ r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh - r"|(N\{(?P<N>[a-zA-Z0-9]{2})\})" # Character named name in the Unicode database + r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database r")", re.VERBOSE, ) diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py index 2be38d2402d..3e0d8c132a8 100644 --- a/tests/data/preview/format_unicode_escape_seq.py +++ b/tests/data/preview/format_unicode_escape_seq.py @@ -8,6 +8,8 @@ x = "\xA3" x = "\u2717" x = "\N{ox}\N{OX}" +x = "\N{lAtIn smaLL letteR x}" +x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" # output @@ -21,3 +23,5 @@ x = "\xa3" x = "\u2717" x = "\N{OX}\N{OX}" +x = "\N{LATIN SMALL LETTER X}" +x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" From 420a8f9ee1c61797b3dee7b0d92f1df286bf7039 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:27:08 -0800 Subject: [PATCH 17/18] add a test --- tests/data/preview/format_unicode_escape_seq.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py index 3e0d8c132a8..be5651f981c 100644 --- a/tests/data/preview/format_unicode_escape_seq.py +++ b/tests/data/preview/format_unicode_escape_seq.py @@ -7,6 +7,7 @@ x = "don't format me" x = "\xA3" x = "\u2717" +x = "\uFaCe" x = 
"\N{ox}\N{OX}" x = "\N{lAtIn smaLL letteR x}" x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" @@ -22,6 +23,7 @@ x = "don't format me" x = "\xa3" x = "\u2717" +x = "\uface" x = "\N{OX}\N{OX}" x = "\N{LATIN SMALL LETTER X}" x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" From 296cdb93a22efc3ac8f3bbc931fb522441249ee5 Mon Sep 17 00:00:00 2001 From: Jelle Zijlstra Date: Sun, 18 Dec 2022 08:34:57 -0800 Subject: [PATCH 18/18] bytes tests --- tests/data/preview/format_unicode_escape_seq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/data/preview/format_unicode_escape_seq.py b/tests/data/preview/format_unicode_escape_seq.py index be5651f981c..3440696c303 100644 --- a/tests/data/preview/format_unicode_escape_seq.py +++ b/tests/data/preview/format_unicode_escape_seq.py @@ -11,6 +11,8 @@ x = "\N{ox}\N{OX}" x = "\N{lAtIn smaLL letteR x}" x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}" +x = b"\x1Fdon't byte" +x = rb"\x1Fdon't format" # output @@ -27,3 +29,5 @@ x = "\N{OX}\N{OX}" x = "\N{LATIN SMALL LETTER X}" x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" +x = b"\x1fdon't byte" +x = rb"\x1Fdon't format"