Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Format hex code in unicode escape sequences in string literals #2916

Merged
merged 24 commits into from Jan 22, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
add30b8
Format hex code in unicode escape sequences in string literals
Shivansh-007 Jan 30, 2022
483fc15
Format \N character name escapes with uppercased literals
Shivansh-007 Jan 31, 2022
cc48d2d
Fix formatting with correct length for each format
Shivansh-007 Mar 13, 2022
f1dbc96
Add changelog
Shivansh-007 Mar 13, 2022
ef442a6
Move feature to preview styling only
Shivansh-007 Mar 13, 2022
2ada012
Fix typo
Shivansh-007 Mar 13, 2022
125ebec
Change Match[AnyStr] to Match[str]
Shivansh-007 Mar 16, 2022
af86102
Make UNICODE_RE Final and accept multiline strings
Shivansh-007 Mar 16, 2022
69c9664
Reword regex comments to use 'character'
Shivansh-007 Mar 16, 2022
7d0e548
Merge remote-tracking branch 'upstream/main' into format/hex-code-lit…
Shivansh-007 Mar 16, 2022
52bd904
ITS RE.VERBOSE NOT RE.MULTILINE?!
Shivansh-007 Mar 16, 2022
a5c4e62
Merge branch 'main' into format/hex-code-literals
JelleZijlstra Mar 24, 2022
221995e
Update CHANGES.md
Shivansh-007 Mar 24, 2022
d4dde2e
Merge branch 'main' into format/hex-code-literals
JelleZijlstra Apr 2, 2022
3557faf
Merge branch 'main' into format/hex-code-literals
JelleZijlstra Dec 18, 2022
77a48e6
CR improvements
JelleZijlstra Dec 18, 2022
1b9d5fd
fix lint
JelleZijlstra Dec 18, 2022
3c24427
fix my sloppy code
JelleZijlstra Dec 18, 2022
9f35b61
fix the new test; \U requires exactly 8 digits
JelleZijlstra Dec 18, 2022
27d2d86
fix \N escapes
JelleZijlstra Dec 18, 2022
420a8f9
add a test
JelleZijlstra Dec 18, 2022
296cdb9
bytes tests
JelleZijlstra Dec 18, 2022
625c085
Merge branch 'main' into format/hex-code-literals
JelleZijlstra Dec 29, 2022
1511959
Merge branch 'main' into format/hex-code-literals
JelleZijlstra Dec 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGES.md
Expand Up @@ -12,7 +12,7 @@

### Preview style

<!-- Changes that affect Black's preview style -->
- Format hex code in unicode escape sequences in string literals (#2916)
Shivansh-007 marked this conversation as resolved.
Show resolved Hide resolved

- Code cell separators `#%%` are now standardised to `# %%` (#2919)
- Avoid magic-trailing-comma in single-element subscripts (#2942)
Expand Down
12 changes: 10 additions & 2 deletions src/black/linegen.py
Expand Up @@ -24,8 +24,13 @@
from black.lines import can_omit_invisible_parens, can_be_split, append_leaves
from black.comments import generate_comments, list_comments, FMT_OFF
from black.numerics import normalize_numeric_literal
from black.strings import get_string_prefix, fix_docstring
from black.strings import normalize_string_prefix, normalize_string_quotes
from black.strings import (
get_string_prefix,
fix_docstring,
normalize_string_prefix,
normalize_string_quotes,
normalize_unicode_escape_sequences,
)
from black.trans import Transformer, CannotTransform, StringMerger, StringSplitter
from black.trans import StringParenWrapper, StringParenStripper, hug_power_op
from black.mode import Mode, Feature, Preview
Expand Down Expand Up @@ -260,6 +265,9 @@ def visit_factor(self, node: Node) -> Iterator[Line]:
yield from self.visit_default(node)

def visit_STRING(self, leaf: Leaf) -> Iterator[Line]:
if Preview.hex_codes_in_unicode_sequences in self.mode:
normalize_unicode_escape_sequences(leaf)

if is_docstring(leaf) and "\\\n" not in leaf.value:
# We're ignoring docstrings with backslash newline escapes because changing
# indentation of those changes the AST representation of the code.
Expand Down
1 change: 1 addition & 0 deletions src/black/mode.py
Expand Up @@ -127,6 +127,7 @@ class Preview(Enum):
"""Individual preview style features."""

string_processing = auto()
hex_codes_in_unicode_sequences = auto()
one_element_subscript = auto()


Expand Down
41 changes: 40 additions & 1 deletion src/black/strings.py
Expand Up @@ -5,7 +5,9 @@
import re
import sys
from functools import lru_cache
from typing import List, Pattern
from typing import List, Pattern, Match

from blib2to3.pytree import Leaf

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand All @@ -18,6 +20,15 @@
r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches one backslash-introduced escape sequence that carries either a hex
# code or a Unicode character name.
#
# Group layout (1-based, as seen through ``Match.group``):
#   1  the run of backslashes preceding the escape letter
#   2  the whole escape body (letter plus payload)
#   3/4   ``uXXXX``     / its 4 hex digits
#   5/6   ``UXXXXXXXX`` / its 8 hex digits
#   7/8   ``xHH``       / its 2 hex digits
#   9/10  ``N{name}``   / the character name
#
# Only real hex digits are accepted, ``\U`` requires exactly eight of them,
# and a ``\N{...}`` name may be any non-empty run of characters up to the
# closing brace (names contain spaces and hyphens).
UNICODE_RE: Final = re.compile(
    r"(\\+)("
    r"(u([a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U([a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x([a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{([^}]+)\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)


def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
Expand Down Expand Up @@ -236,3 +247,31 @@ def normalize_string_quotes(s: str) -> str:
return s # Prefer double quotes

return f"{prefix}{new_quote}{new_body}{new_quote}"


def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
"""Replace hex codes in Unicode escape sequences with lowercase representation."""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will have to be thought out still, as this comment points out. My two cents: I prefer upper case, and since Black formats hex numbers to upper already I think it would be consistent. The Python repr argument is solid too, but we should think about changing hex literals as well then.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather not change hex numbers, we already changed our mind there a few times.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if we're not changing numbers (which I agree with), do y'all share the concern for consistency?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My comments read a bit ambiguously. So to be clear, I'm proposing that we switch the formatting to be upper case to be consistent with hex numbers. Y'all in?

text = leaf.value
prefix = get_string_prefix(text)
JelleZijlstra marked this conversation as resolved.
Show resolved Hide resolved

def replace(m: Match[str]) -> str:
    """Return the normalized replacement text for one escape-sequence match.

    ``m`` is a match of the escape-sequence pattern: group 1 is the run of
    backslashes, group 2 the whole escape body, followed by
    (whole, payload) pairs for ``\\u``, ``\\U``, ``\\x`` and ``\\N{}``.
    Reads ``prefix`` (the string literal's prefix) from the enclosing scope.

    The match is returned unchanged when it is not a real escape: an even
    number of backslashes, any raw literal, or a ``\\u``/``\\U``/``\\N``
    sequence inside a bytes literal. Otherwise hex digits are lowercased and
    character names are uppercased.
    """
    groups = m.groups()
    back_slashes = groups[0]
    # Compare case-insensitively so prefixes such as ``R``, ``Rb`` or ``bR``
    # are recognized whether or not the prefix has been normalized yet.
    lowered_prefix = prefix.lower()

    # An even run of backslashes escapes itself, and raw literals have no
    # escape processing at all; rewriting either would change the value.
    if len(back_slashes) % 2 == 0 or "r" in lowered_prefix:
        return back_slashes + groups[1]

    if groups[6]:
        # \x -- valid in both str and bytes literals
        return back_slashes + "x" + groups[7].lower()

    if "b" in lowered_prefix:
        # \u, \U and \N{} are not escapes in bytes literals; they are literal
        # bytes, so changing their case would change the value.
        return back_slashes + groups[1]

    if groups[2]:
        # \u
        return back_slashes + "u" + groups[3].lower()
    elif groups[4]:
        # \U
        return back_slashes + "U" + groups[5].lower()
    else:
        # \N{}
        return back_slashes + "N{" + groups[9].upper() + "}"

leaf.value = re.sub(UNICODE_RE, replace, text)
23 changes: 23 additions & 0 deletions tests/data/format_unicode_escape_seq.py
@@ -0,0 +1,23 @@
x = "\x1F"
x = "\\x1B"
x = "\\\x1B"
x = "\U0001F60E"
x = "\u0001F60E"
x = r"\u0001F60E"
x = "don't format me"
x = "\xA3"
x = "\u2717"
x = "\N{ox}\N{OX}"

# output

x = "\x1f"
x = "\\x1B"
x = "\\\x1b"
x = "\U0001f60e"
x = "\u0001F60E"
x = r"\u0001F60E"
x = "don't format me"
x = "\xa3"
x = "\u2717"
x = "\N{OX}\N{OX}"
1 change: 1 addition & 0 deletions tests/test_format.py
Expand Up @@ -76,6 +76,7 @@
"cantfit",
"comments7",
"comments8",
"format_unicode_escape_seq",
"long_strings",
"long_strings__edge_case",
"long_strings__regression",
Expand Down