Merge pull request #3730 from Zac-HD/regex-alphabet

HypothesisWorks · Sep 4, 2023 · 09d9cd5 · 09d9cd5
2 parents a173366 + 0cd8ca9
commit 09d9cd5
Show file tree

Hide file tree

Showing 16 changed files with 404 additions and 233 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,13 @@
+RELEASE_TYPE: minor
+
+The :func:`~hypothesis.strategies.from_regex` strategy now takes an optional
+``alphabet=characters(codec="utf-8")`` argument for unicode strings, like
+:func:`~hypothesis.strategies.text`.
+
+This offers more and more-consistent control over the generated strings,
+removing previously-hard-coded limitations.  With ``fullmatch=False`` and
+``alphabet=characters()``, surrogate characters are now possible in leading
+and trailing text as well as the body of the match.  Negated character classes
+such as ``[^A-Z]`` or ``\S`` had a hard-coded exclusion of control characters
+and surrogate characters; now they permit anything in ``alphabet=`` consistent
+with the class, and control characters are permitted by default.
diff --git a/hypothesis-python/docs/changes.rst b/hypothesis-python/docs/changes.rst
@@ -144,7 +144,7 @@ help narrow down any particularly weird bugs in complex environments.
 -------------------
 
 Fixes some lingering issues with inference of recursive types
-in `~hypothesis.strategies.from_type`. Closes :issue:`3525`.
+in :func:`~hypothesis.strategies.from_type`. Closes :issue:`3525`.
 
 .. _v6.81.0:
 
@@ -335,8 +335,8 @@ is strongly recommended.  You can ensure you have the dependencies with
 -------------------
 
 This patch continues the work started in :pull:`3651` by adding
-:pypi:`ruff` linter rules for pyflakes, flake8-comprehensions, and
-flake8-implicit-str-concat.
+:pypi:`ruff` linter rules for :pypi:`pyflakes`, :pypi:`flake8-comprehensions`,
+and :pypi:`flake8-implicit-str-concat`.
 
 .. _v6.75.5:
 
@@ -1184,7 +1184,7 @@ is really annoying.  See :issue:`2701` for details.
 6.48.0 - 2022-06-27
 -------------------
 
-This release raises :class:`~unittest.SkipTest` for which never executed any
+This release raises :class:`~unittest.SkipTest` for tests which never executed any
 examples, for example because the :obj:`~hypothesis.settings.phases` setting
 excluded the :obj:`~hypothesis.Phase.explicit`, :obj:`~hypothesis.Phase.reuse`,
 and :obj:`~hypothesis.Phase.generate` phases.  This helps to avoid cases where

diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py
@@ -70,6 +70,7 @@
 from hypothesis.internal.compat import (
     PYPY,
     BaseExceptionGroup,
+    add_note,
     bad_django_TestCase,
     get_type_hints,
     int_from_bytes,
@@ -1008,15 +1009,6 @@ def run_engine(self):
         _raise_to_user(errors_to_report, self.settings, report_lines)
 
 
-def add_note(exc, note):
-    try:
-        exc.add_note(note)
-    except AttributeError:
-        if not hasattr(exc, "__notes__"):
-            exc.__notes__ = []
-        exc.__notes__.append(note)
-
-
 def _raise_to_user(errors_to_report, settings, target_lines, trailer=""):
     """Helper function for attaching notes and grouping multiple errors."""
     failing_prefix = "Falsifying example: "

diff --git a/hypothesis-python/src/hypothesis/internal/charmap.py b/hypothesis-python/src/hypothesis/internal/charmap.py
@@ -18,6 +18,7 @@
 
 from hypothesis.configuration import mkdir_p, storage_directory
 from hypothesis.errors import InvalidArgument
+from hypothesis.internal.intervalsets import IntervalSet
 
 intervals = Tuple[Tuple[int, int], ...]
 cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
@@ -146,126 +147,6 @@ def as_general_categories(cats, name="cats"):
     return tuple(c for c in cs if c in out)
 
 
-def _union_intervals(x, y):
-    """Merge two sequences of intervals into a single tuple of intervals.
-
-    Any integer bounded by `x` or `y` is also bounded by the result.
-
-    >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
-    ((1, 17),)
-    """
-    if not x:
-        return tuple((u, v) for u, v in y)
-    if not y:
-        return tuple((u, v) for u, v in x)
-    intervals = sorted(x + y, reverse=True)
-    result = [intervals.pop()]
-    while intervals:
-        # 1. intervals is in descending order
-        # 2. pop() takes from the RHS.
-        # 3. (a, b) was popped 1st, then (u, v) was popped 2nd
-        # 4. Therefore: a <= u
-        # 5. We assume that u <= v and a <= b
-        # 6. So we need to handle 2 cases of overlap, and one disjoint case
-        #    |   u--v     |   u----v   |       u--v  |
-        #    |   a----b   |   a--b     |  a--b       |
-        u, v = intervals.pop()
-        a, b = result[-1]
-        if u <= b + 1:
-            # Overlap cases
-            result[-1] = (a, max(v, b))
-        else:
-            # Disjoint case
-            result.append((u, v))
-    return tuple(result)
-
-
-def _subtract_intervals(x, y):
-    """Set difference for lists of intervals. That is, returns a list of
-    intervals that bounds all values bounded by x that are not also bounded by
-    y. x and y are expected to be in sorted order.
-
-    For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would
-    return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the
-    interval.
-    """
-    if not y:
-        return tuple(x)
-    x = list(map(list, x))
-    i = 0
-    j = 0
-    result = []
-    while i < len(x) and j < len(y):
-        # Iterate in parallel over x and y. j stays pointing at the smallest
-        # interval in the left hand side that could still overlap with some
-        # element of x at index >= i.
-        # Similarly, i is not incremented until we know that it does not
-        # overlap with any element of y at index >= j.
-
-        xl, xr = x[i]
-        assert xl <= xr
-        yl, yr = y[j]
-        assert yl <= yr
-
-        if yr < xl:
-            # The interval at y[j] is strictly to the left of the interval at
-            # x[i], so will not overlap with it or any later interval of x.
-            j += 1
-        elif yl > xr:
-            # The interval at y[j] is strictly to the right of the interval at
-            # x[i], so all of x[i] goes into the result as no further intervals
-            # in y will intersect it.
-            result.append(x[i])
-            i += 1
-        elif yl <= xl:
-            if yr >= xr:
-                # x[i] is contained entirely in y[j], so we just skip over it
-                # without adding it to the result.
-                i += 1
-            else:
-                # The beginning of x[i] is contained in y[j], so we update the
-                # left endpoint of x[i] to remove this, and increment j as we
-                # now have moved past it. Note that this is not added to the
-                # result as is, as more intervals from y may intersect it so it
-                # may need updating further.
-                x[i][0] = yr + 1
-                j += 1
-        else:
-            # yl > xl, so the left hand part of x[i] is not contained in y[j],
-            # so there are some values we should add to the result.
-            result.append((xl, yl - 1))
-
-            if yr + 1 <= xr:
-                # If y[j] finishes before x[i] does, there may be some values
-                # in x[i] left that should go in the result (or they may be
-                # removed by a later interval in y), so we update x[i] to
-                # reflect that and increment j because it no longer overlaps
-                # with any remaining element of x.
-                x[i][0] = yr + 1
-                j += 1
-            else:
-                # Every element of x[i] other than the initial part we have
-                # already added is contained in y[j], so we move to the next
-                # interval.
-                i += 1
-    # Any remaining intervals in x do not overlap with any of y, as if they did
-    # we would not have incremented j to the end, so can be added to the result
-    # as they are.
-    result.extend(x[i:])
-    return tuple(map(tuple, result))
-
-
-def _intervals(s):
-    """Return a tuple of intervals, covering the codepoints of characters in
-    `s`.
-
-    >>> _intervals('abcdef0123456789')
-    ((48, 57), (97, 102))
-    """
-    intervals = tuple((ord(c), ord(c)) for c in sorted(s))
-    return _union_intervals(intervals, intervals)
-
-
 category_index_cache = {(): ()}
 
 
@@ -306,11 +187,14 @@ def _query_for_key(key):
         pass
     assert key
     if set(key) == set(categories()):
-        result = ((0, sys.maxunicode),)
+        result = IntervalSet([(0, sys.maxunicode)])
     else:
-        result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]])
-    category_index_cache[key] = result
-    return result
+        result = IntervalSet(_query_for_key(key[:-1])).union(
+            IntervalSet(charmap()[key[-1]])
+        )
+    assert isinstance(result, IntervalSet)
+    category_index_cache[key] = result.intervals
+    return result.intervals
 
 
 limited_category_index_cache: cache_type = {}
@@ -344,14 +228,14 @@ def query(
     if max_codepoint is None:
         max_codepoint = sys.maxunicode
     catkey = _category_key(exclude_categories, include_categories)
-    character_intervals = _intervals(include_characters or "")
-    exclude_intervals = _intervals(exclude_characters or "")
+    character_intervals = IntervalSet.from_string(include_characters or "")
+    exclude_intervals = IntervalSet.from_string(exclude_characters or "")
     qkey = (
         catkey,
         min_codepoint,
         max_codepoint,
-        character_intervals,
-        exclude_intervals,
+        character_intervals.intervals,
+        exclude_intervals.intervals,
     )
     try:
         return limited_category_index_cache[qkey]
@@ -362,8 +246,6 @@ def query(
     for u, v in base:
         if v >= min_codepoint and u <= max_codepoint:
             result.append((max(u, min_codepoint), min(v, max_codepoint)))
-    result = tuple(result)
-    result = _union_intervals(result, character_intervals)
-    result = _subtract_intervals(result, exclude_intervals)
+    result = (IntervalSet(result) | character_intervals) - exclude_intervals
     limited_category_index_cache[qkey] = result
     return result
diff --git a/hypothesis-python/src/hypothesis/internal/compat.py b/hypothesis-python/src/hypothesis/internal/compat.py
@@ -43,6 +43,15 @@
 WINDOWS = platform.system() == "Windows"
 
 
+def add_note(exc, note):
+    try:
+        exc.add_note(note)
+    except AttributeError:
+        if not hasattr(exc, "__notes__"):
+            exc.__notes__ = []
+        exc.__notes__.append(note)
+
+
 def escape_unicode_characters(s: str) -> str:
     return codecs.encode(s, "unicode_escape").decode("ascii")