Skip to content

Commit

Permalink
Format hex code in unicode escape sequences in string literals (#2916)
Browse files Browse the repository at this point in the history
Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
  • Loading branch information
Shivansh-007 and JelleZijlstra committed Jan 22, 2023
1 parent 1557f7d commit eabff67
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGES.md
Expand Up @@ -16,6 +16,7 @@

<!-- Changes that affect Black's preview style -->

- Format hex codes in Unicode escape sequences in string literals: hex digits are
  lowercased and `\N{...}` character names are uppercased (#2916)
- Add parentheses around `if`-`else` expressions (#2278)
- Improve the performance on large expressions that contain many strings (#3467)
- Fix a crash in preview style with assert + parenthesized string (#3415)
Expand Down
4 changes: 4 additions & 0 deletions src/black/linegen.py
Expand Up @@ -59,6 +59,7 @@
get_string_prefix,
normalize_string_prefix,
normalize_string_quotes,
normalize_unicode_escape_sequences,
)
from black.trans import (
CannotTransform,
Expand Down Expand Up @@ -368,6 +369,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
yield from self.visit_default(node)

def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
if Preview.hex_codes_in_unicode_sequences in self.mode:
normalize_unicode_escape_sequences(leaf)

if is_docstring(leaf) and "\\\n" not in leaf.value:
# We're ignoring docstrings with backslash newline escapes because changing
# indentation of those changes the AST representation of the code.
Expand Down
1 change: 1 addition & 0 deletions src/black/mode.py
Expand Up @@ -153,6 +153,7 @@ def supports_feature(target_versions: Set[TargetVersion], feature: Feature) -> b
class Preview(Enum):
"""Individual preview style features."""

hex_codes_in_unicode_sequences = auto()
annotation_parens = auto()
empty_lines_before_class_or_def_with_leading_comments = auto()
handle_trailing_commas_in_head = auto()
Expand Down
44 changes: 43 additions & 1 deletion src/black/strings.py
Expand Up @@ -5,7 +5,9 @@
import re
import sys
from functools import lru_cache
from typing import List, Pattern
from typing import List, Match, Pattern

from blib2to3.pytree import Leaf

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand All @@ -18,6 +20,15 @@
r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches a run of one or more backslashes followed by the body of a \u, \U, \x
# or \N{...} escape. The backslash run is captured on its own ("backslashes") so
# that a caller can inspect its length; the escape letter plus payload is
# captured as "body", and the payload alone under the group named after the
# escape letter ("u", "U", "x" or "N").
# Note: re.VERBOSE is set, but the pattern contains no whitespace outside a
# character class, and whitespace inside a character class is kept literally,
# so the space in the \N name class still matches a space.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)


def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
Expand Down Expand Up @@ -236,3 +247,34 @@ def normalize_string_quotes(s: str) -> str:
return s # Prefer double quotes

return f"{prefix}{new_quote}{new_body}{new_quote}"


def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Normalize the letter case of escape sequences in a string literal leaf.

    ``leaf.value`` is rewritten in place:

    * the hex digits of ``\\xhh``, ``\\uxxxx`` and ``\\Uxxxxxxxx`` are lowercased;
    * the character name in ``\\N{name}`` is uppercased.

    Raw literals are left untouched because they contain no escape sequences.
    In bytes literals only ``\\x`` is normalized: ``\\u``, ``\\U`` and ``\\N{...}``
    are not escape sequences there, so re-casing them would change the value
    of the literal.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        return  # There is no escaping in raw strings.

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even run means every backslash is itself escaped, so the
            # "body" is ordinary text rather than an escape sequence.
            return back_slashes + groups["body"]

        if groups["x"]:
            # \x is an escape in both str and bytes literals.
            return back_slashes + "x" + groups["x"].lower()

        if is_bytes:
            # \u, \U and \N{...} are plain characters in a bytes literal;
            # changing their case would alter the bytes it denotes.
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)
33 changes: 33 additions & 0 deletions tests/data/preview/format_unicode_escape_seq.py
@@ -0,0 +1,33 @@
x = "\x1F"
x = "\\x1B"
x = "\\\x1B"
x = "\U0001F60E"
x = "\u0001F60E"
x = r"\u0001F60E"
x = "don't format me"
x = "\xA3"
x = "\u2717"
x = "\uFaCe"
x = "\N{ox}\N{OX}"
x = "\N{lAtIn smaLL letteR x}"
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
x = b"\x1Fdon't byte"
x = rb"\x1Fdon't format"

# output

x = "\x1f"
x = "\\x1B"
x = "\\\x1b"
x = "\U0001f60e"
x = "\u0001F60E"
x = r"\u0001F60E"
x = "don't format me"
x = "\xa3"
x = "\u2717"
x = "\uface"
x = "\N{OX}\N{OX}"
x = "\N{LATIN SMALL LETTER X}"
x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
x = b"\x1fdon't byte"
x = rb"\x1Fdon't format"

0 comments on commit eabff67

Please sign in to comment.