Skip to content

Commit

Permalink
Fix determination of f-string expression spans (#2654)
Browse files Browse the repository at this point in the history
Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
  • Loading branch information
hauntsaninja and JelleZijlstra committed Dec 1, 2021
1 parent 0f7cf91 commit f1813e3
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 17 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Expand Up @@ -15,6 +15,7 @@
times, like `match re.match()` (#2661)
- Fix assignment to environment variables in Jupyter Notebooks (#2642)
- Add `flake8-simplify` and `flake8-comprehensions` plugins (#2653)
- Fix determination of f-string expression spans (#2654)
- Fix parser error location on invalid syntax in a `match` statement (#2649)

## 21.11b1
Expand Down
72 changes: 55 additions & 17 deletions src/black/trans.py
Expand Up @@ -942,6 +942,57 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int:
return max_string_length


def iter_fexpr_spans(s: str) -> Iterator[Tuple[int, int]]:
    """
    Yield (start, end) spans for each top-level f-expression in *s*.

    Spans are half-open: ``s[start:end]`` includes the opening "{" and the
    closing "}".  The scanner tolerates malformed input (unbalanced braces,
    unterminated strings) and simply stops yielding rather than raising.
    """
    open_braces: List[int] = []  # positions of unmatched "{" characters
    length = len(s)
    pos = 0
    while pos < length:
        ch = s[pos]
        if ch == "{":
            # Outside any expression, "{{" is an escaped brace, not an opener.
            if not open_braces and pos + 1 < length and s[pos + 1] == "{":
                pos += 2
            else:
                open_braces.append(pos)
                pos += 1
        elif ch == "}":
            if open_braces:
                start = open_braces.pop()
                # Popped the outermost brace: a whole expression just closed.
                if not open_braces:
                    yield (start, pos + 1)
            pos += 1
        elif open_braces:
            # Inside an expression: skip string literals wholesale, since
            # braces within them do not open/close f-expressions.  Backslashes
            # are not legal there, so no escape handling is needed.
            quote = ""
            if s[pos : pos + 3] in ("'''", '"""'):
                quote = s[pos : pos + 3]
            elif ch in ("'", '"'):
                quote = ch
            if quote:
                pos += len(quote)
                while pos < length and s[pos : pos + len(quote)] != quote:
                    pos += 1
                pos += len(quote)
            else:
                pos += 1
        else:
            pos += 1


def fstring_contains_expr(s: str) -> bool:
    """Return True if the f-string body *s* contains at least one {...} expression."""
    for _span in iter_fexpr_spans(s):
        return True
    return False


class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
"""
StringTransformer that splits "atom" strings (i.e. strings which exist on
Expand Down Expand Up @@ -981,17 +1032,6 @@ class StringSplitter(BaseStringSplitter, CustomSplitMapMixin):
"""

MIN_SUBSTR_SIZE: Final = 6
# Matches an "f-expression" (e.g. {var}) that might be found in an f-string.
RE_FEXPR: Final = r"""
(?<!\{) (?:\{\{)* \{ (?!\{)
(?:
[^\{\}]
| \{\{
| \}\}
| (?R)
)+
\}
"""

def do_splitter_match(self, line: Line) -> TMatchResult:
LL = line.leaves
Expand Down Expand Up @@ -1058,8 +1098,8 @@ def do_transform(self, line: Line, string_idx: int) -> Iterator[TResult[Line]]:
# contain any f-expressions, but ONLY if the original f-string
# contains at least one f-expression. Otherwise, we will alter the AST
# of the program.
drop_pointless_f_prefix = ("f" in prefix) and re.search(
self.RE_FEXPR, LL[string_idx].value, re.VERBOSE
drop_pointless_f_prefix = ("f" in prefix) and fstring_contains_expr(
LL[string_idx].value
)

first_string_line = True
Expand Down Expand Up @@ -1299,9 +1339,7 @@ def _iter_fexpr_slices(self, string: str) -> Iterator[Tuple[Index, Index]]:
"""
if "f" not in get_string_prefix(string).lower():
return

for match in re.finditer(self.RE_FEXPR, string, re.VERBOSE):
yield match.span()
yield from iter_fexpr_spans(string)

def _get_illegal_split_indices(self, string: str) -> Set[Index]:
illegal_indices: Set[Index] = set()
Expand Down Expand Up @@ -1417,7 +1455,7 @@ def _normalize_f_string(self, string: str, prefix: str) -> str:
"""
assert_is_leaf_string(string)

if "f" in prefix and not re.search(self.RE_FEXPR, string, re.VERBOSE):
if "f" in prefix and not fstring_contains_expr(string):
new_prefix = prefix.replace("f", "")

temp = string[len(prefix) :]
Expand Down
50 changes: 50 additions & 0 deletions tests/test_trans.py
@@ -0,0 +1,50 @@
from typing import List, Tuple
from black.trans import iter_fexpr_spans


def test_fexpr_spans() -> None:
    """Exercise iter_fexpr_spans against hand-verified span/slice pairs."""

    def check(
        string: str, expected_spans: List[Tuple[int, int]], expected_slices: List[str]
    ) -> None:
        spans = list(iter_fexpr_spans(string))

        # Checking slices isn't strictly necessary, but it's easier to verify at
        # a glance than only spans
        assert len(spans) == len(expected_slices)
        # `segment` (not `slice`) to avoid shadowing the builtin
        for (i, j), segment in zip(spans, expected_slices):
            assert len(string[i:j]) == j - i
            assert string[i:j] == segment

        assert spans == expected_spans

    # Most of these test cases omit the leading 'f' and leading / closing quotes
    # for convenience
    # Some additional property-based tests can be found in
    # https://github.com/psf/black/pull/2654#issuecomment-981411748
    check("""{var}""", [(0, 5)], ["{var}"])
    check("""f'{var}'""", [(2, 7)], ["{var}"])
    check("""f'{1 + f() + 2 + "asdf"}'""", [(2, 24)], ["""{1 + f() + 2 + "asdf"}"""])
    check("""text {var} text""", [(5, 10)], ["{var}"])
    check("""text {{ {var} }} text""", [(8, 13)], ["{var}"])
    check("""{a} {b} {c}""", [(0, 3), (4, 7), (8, 11)], ["{a}", "{b}", "{c}"])
    check("""f'{a} {b} {c}'""", [(2, 5), (6, 9), (10, 13)], ["{a}", "{b}", "{c}"])
    check("""{ {} }""", [(0, 6)], ["{ {} }"])
    check("""{ {{}} }""", [(0, 8)], ["{ {{}} }"])
    check("""{ {{{}}} }""", [(0, 10)], ["{ {{{}}} }"])
    check("""{{ {{{}}} }}""", [(5, 7)], ["{}"])
    check("""{{ {{{var}}} }}""", [(5, 10)], ["{var}"])
    check("""{f"{0}"}""", [(0, 8)], ["""{f"{0}"}"""])
    check("""{"'"}""", [(0, 5)], ["""{"'"}"""])
    check("""{"{"}""", [(0, 5)], ["""{"{"}"""])
    check("""{"}"}""", [(0, 5)], ["""{"}"}"""])
    check("""{"{{"}""", [(0, 6)], ["""{"{{"}"""])
    check("""{''' '''}""", [(0, 9)], ["""{''' '''}"""])
    check("""{'''{'''}""", [(0, 9)], ["""{'''{'''}"""])
    check("""{''' {'{ '''}""", [(0, 13)], ["""{''' {'{ '''}"""])
    check(
        '''f\'\'\'-{f"""*{f"+{f'.{x}.'}+"}*"""}-'y\\'\'\'\'''',
        [(5, 33)],
        ['''{f"""*{f"+{f'.{x}.'}+"}*"""}'''],
    )
    check(r"""{}{""", [(0, 2)], ["{}"])
    check("""f"{'{'''''''''}\"""", [(2, 15)], ["{'{'''''''''}"])

0 comments on commit f1813e3

Please sign in to comment.