Improve database keys

HypothesisWorks · Dec 4, 2022 · 1059489 · 1059489
1 parent c3101d6
commit 1059489
Show file tree

Hide file tree

Showing 3 changed files with 97 additions and 16 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,7 @@
+RELEASE_TYPE: minor
+
+This release improves our treatment of database keys, which are based on (among other things)
+the source code of your test function.  We now post-process this source to ignore
+decorators, comments, trailing whitespace, and blank lines - so that you can add
+:obj:`@example() <hypothesis.example>`\ s or make some small no-op edits to your code
+without preventing replay of any known failing or covering examples.
diff --git a/hypothesis-python/src/hypothesis/internal/reflection.py b/hypothesis-python/src/hypothesis/internal/reflection.py
@@ -20,10 +20,11 @@
 import textwrap
 import types
 from functools import wraps
+from io import StringIO
 from keyword import iskeyword
-from tokenize import detect_encoding
+from tokenize import COMMENT, detect_encoding, generate_tokens, untokenize
 from types import ModuleType
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 from unittest.mock import _patch as PatchType
 
 from hypothesis.internal.compat import is_typed_named_tuple, update_code_location
@@ -48,6 +49,33 @@ def is_mock(obj):
     return hasattr(obj, "hypothesis_internal_is_this_a_mock_check")
 
 
+def _clean_source(src: str) -> bytes:
+    """Return the source code as bytes, without decorators or comments.
+
+    Because this is part of our database key, we reduce the cache invalidation
+    rate by ignoring decorators, comments, trailing whitespace, and empty lines.
+    We can't just use the (dumped) AST directly because it changes between Python
+    versions (e.g. ast.Constant)
+    """
+    # Get the (one-indexed) line number of the function definition, and drop preceding
+    # lines - i.e. any decorators, so that adding `@example()`s keeps the same key.
+    try:
+        func_lineno = ast.parse(src).body[0].lineno - 1
+        src = "".join(src.splitlines(keepends=True)[func_lineno:])
+    except Exception:
+        pass
+    # Remove blank lines and use the tokenize module to strip out comments,
+    # so that those can be changed without changing the database key.
+    try:
+        src = untokenize(
+            t for t in generate_tokens(StringIO(src).readline) if t.type != COMMENT
+        )
+    except Exception:
+        pass
+    # Finally, remove any trailing whitespace and empty lines as a last cleanup.
+    return "\n".join(x.rstrip() for x in src.splitlines() if x.rstrip()).encode()
+
+
 def function_digest(function):
     """Returns a string that is stable across multiple invocations across
     multiple processes and is prone to changing significantly in response to
@@ -57,24 +85,23 @@ def function_digest(function):
     """
     hasher = hashlib.sha384()
     try:
-        hasher.update(inspect.getsource(function).encode())
+        src = inspect.getsource(function)
     except (OSError, TypeError):
-        pass
-    try:
-        hasher.update(function.__name__.encode())
-    except AttributeError:
-        pass
+        # If we can't actually get the source code, try for the name as a fallback.
+        try:
+            hasher.update(function.__name__.encode())
+        except AttributeError:
+            pass
+    else:
+        hasher.update(_clean_source(src))
     try:
-        # We prefer to use the modern signature API, but left this for compatibility.
-        # While we don't promise stability of the database, there's no advantage to
-        # using signature here, so we might as well keep the existing keys for now.
-        spec = inspect.getfullargspec(function)
-        if inspect.ismethod(function):
-            del spec.args[0]
-        hasher.update(repr(spec).encode())
-    except TypeError:
+        # This is additional to the source code because it can include the effects
+        # of decorators, or of post-hoc assignment to the .__signature__ attribute.
+        hasher.update(repr(get_signature(function)).encode())
+    except Exception:
         pass
     try:
+        # We set this in order to distinguish e.g. @pytest.mark.parametrize cases.
         hasher.update(function._hypothesis_internal_add_digest)
     except AttributeError:
         pass

diff --git a/hypothesis-python/tests/cover/test_reflection.py b/hypothesis-python/tests/cover/test_reflection.py
@@ -13,6 +13,7 @@
 from datetime import time
 from functools import partial, wraps
 from inspect import Parameter, Signature, signature
+from textwrap import dedent
 from unittest.mock import MagicMock, Mock, NonCallableMagicMock, NonCallableMock
 
 import pytest
@@ -651,3 +652,49 @@ def test_param_called_within_defaults_on_error():
     # Create a function object for which we cannot retrieve the source.
     f = compile("lambda: ...", "_.py", "eval")
     assert is_first_param_referenced_in_function(f)
+
+
+def _prep_source(*pairs):
+    return [
+        pytest.param(dedent(x).strip(), dedent(y).strip().encode(), id=f"case-{i}")
+        for i, (x, y) in enumerate(pairs)
+    ]
+
+
+@pytest.mark.parametrize(
+    "src, clean",
+    _prep_source(
+        ("", ""),
+        ("def test(): pass", "def test(): pass"),
+        ("def invalid syntax", "def invalid syntax"),
+        (
+            """
+            @example(1)
+            @given(st.integers())
+            def test(x):
+                # line comment
+                assert x  # end-of-line comment
+
+
+                "Had some blank lines above"
+            """,
+            """
+            def test(x):
+                assert x
+                "Had some blank lines above"
+            """,
+        ),
+        (
+            """
+            def      \\
+                f(): pass
+            """,
+            """
+            def\\
+                f(): pass
+            """,
+        ),
+    ),
+)
+def test_clean_source(src, clean):
+    assert reflection._clean_source(src).splitlines() == clean.splitlines()