Skip to content

Commit

Permalink
Merge pull request #3730 from Zac-HD/regex-alphabet
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD committed Sep 4, 2023
2 parents a173366 + 0cd8ca9 commit 09d9cd5
Show file tree
Hide file tree
Showing 16 changed files with 404 additions and 233 deletions.
13 changes: 13 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
RELEASE_TYPE: minor

The :func:`~hypothesis.strategies.from_regex` strategy now takes an optional
``alphabet=characters(codec="utf-8")`` argument for unicode strings, like
:func:`~hypothesis.strategies.text`.

This offers more and more-consistent control over the generated strings,
removing previously-hard-coded limitations. With ``fullmatch=False`` and
``alphabet=characters()``, surrogate characters are now possible in leading
and trailing text as well as the body of the match. Negated character classes
such as ``[^A-Z]`` or ``\S`` had a hard-coded exclusion of control characters
and surrogate characters; now they permit anything in ``alphabet=`` consistent
with the class, and control characters are permitted by default.
8 changes: 4 additions & 4 deletions hypothesis-python/docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ help narrow down any particularly weird bugs in complex environments.
-------------------

Fixes some lingering issues with inference of recursive types
in `~hypothesis.strategies.from_type`. Closes :issue:`3525`.
in :func:`~hypothesis.strategies.from_type`. Closes :issue:`3525`.

.. _v6.81.0:

Expand Down Expand Up @@ -335,8 +335,8 @@ is strongly recommended. You can ensure you have the dependencies with
-------------------

This patch continues the work started in :pull:`3651` by adding
:pypi:`ruff` linter rules for pyflakes, flake8-comprehensions, and
flake8-implicit-str-concat.
:pypi:`ruff` linter rules for :pypi:`pyflakes`, :pypi:`flake8-comprehensions`,
and :pypi:`flake8-implicit-str-concat`.

.. _v6.75.5:

Expand Down Expand Up @@ -1184,7 +1184,7 @@ is really annoying. See :issue:`2701` for details.
6.48.0 - 2022-06-27
-------------------

This release raises :class:`~unittest.SkipTest` for which never executed any
This release raises :class:`~unittest.SkipTest` for tests which never executed any
examples, for example because the :obj:`~hypothesis.settings.phases` setting
excluded the :obj:`~hypothesis.Phase.explicit`, :obj:`~hypothesis.Phase.reuse`,
and :obj:`~hypothesis.Phase.generate` phases. This helps to avoid cases where
Expand Down
10 changes: 1 addition & 9 deletions hypothesis-python/src/hypothesis/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@
from hypothesis.internal.compat import (
PYPY,
BaseExceptionGroup,
add_note,
bad_django_TestCase,
get_type_hints,
int_from_bytes,
Expand Down Expand Up @@ -1008,15 +1009,6 @@ def run_engine(self):
_raise_to_user(errors_to_report, self.settings, report_lines)


def add_note(exc, note):
    """Attach the string `note` to the exception `exc`.

    Delegates to ``exc.add_note(note)``.  If that raises ``AttributeError``
    (for example because the object has no ``add_note`` method), the note is
    instead stored on ``exc.__notes__``, the attribute used for exception
    notes by PEP 678, creating the list when it does not exist yet.
    """
    try:
        exc.add_note(note)
    except AttributeError:
        if hasattr(exc, "__notes__"):
            exc.__notes__.append(note)
        else:
            # First note on this object: start the list with it.
            exc.__notes__ = [note]


def _raise_to_user(errors_to_report, settings, target_lines, trailer=""):
"""Helper function for attaching notes and grouping multiple errors."""
failing_prefix = "Falsifying example: "
Expand Down
144 changes: 13 additions & 131 deletions hypothesis-python/src/hypothesis/internal/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from hypothesis.configuration import mkdir_p, storage_directory
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet

intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
Expand Down Expand Up @@ -146,126 +147,6 @@ def as_general_categories(cats, name="cats"):
return tuple(c for c in cs if c in out)


def _union_intervals(x, y):
"""Merge two sequences of intervals into a single tuple of intervals.
Any integer bounded by `x` or `y` is also bounded by the result.
>>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
((1, 17),)
"""
if not x:
return tuple((u, v) for u, v in y)
if not y:
return tuple((u, v) for u, v in x)
intervals = sorted(x + y, reverse=True)
result = [intervals.pop()]
while intervals:
# 1. intervals is in descending order
# 2. pop() takes from the RHS.
# 3. (a, b) was popped 1st, then (u, v) was popped 2nd
# 4. Therefore: a <= u
# 5. We assume that u <= v and a <= b
# 6. So we need to handle 2 cases of overlap, and one disjoint case
# | u--v | u----v | u--v |
# | a----b | a--b | a--b |
u, v = intervals.pop()
a, b = result[-1]
if u <= b + 1:
# Overlap cases
result[-1] = (a, max(v, b))
else:
# Disjoint case
result.append((u, v))
return tuple(result)


def _subtract_intervals(x, y):
"""Set difference for lists of intervals. That is, returns a list of
intervals that bounds all values bounded by x that are not also bounded by
y. x and y are expected to be in sorted order.
For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would
return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the
interval.
"""
if not y:
return tuple(x)
x = list(map(list, x))
i = 0
j = 0
result = []
while i < len(x) and j < len(y):
# Iterate in parallel over x and y. j stays pointing at the smallest
# interval in the left hand side that could still overlap with some
# element of x at index >= i.
# Similarly, i is not incremented until we know that it does not
# overlap with any element of y at index >= j.

xl, xr = x[i]
assert xl <= xr
yl, yr = y[j]
assert yl <= yr

if yr < xl:
# The interval at y[j] is strictly to the left of the interval at
# x[i], so will not overlap with it or any later interval of x.
j += 1
elif yl > xr:
# The interval at y[j] is strictly to the right of the interval at
# x[i], so all of x[i] goes into the result as no further intervals
# in y will intersect it.
result.append(x[i])
i += 1
elif yl <= xl:
if yr >= xr:
# x[i] is contained entirely in y[j], so we just skip over it
# without adding it to the result.
i += 1
else:
# The beginning of x[i] is contained in y[j], so we update the
# left endpoint of x[i] to remove this, and increment j as we
# now have moved past it. Note that this is not added to the
# result as is, as more intervals from y may intersect it so it
# may need updating further.
x[i][0] = yr + 1
j += 1
else:
# yl > xl, so the left hand part of x[i] is not contained in y[j],
# so there are some values we should add to the result.
result.append((xl, yl - 1))

if yr + 1 <= xr:
# If y[j] finishes before x[i] does, there may be some values
# in x[i] left that should go in the result (or they may be
# removed by a later interval in y), so we update x[i] to
# reflect that and increment j because it no longer overlaps
# with any remaining element of x.
x[i][0] = yr + 1
j += 1
else:
# Every element of x[i] other than the initial part we have
# already added is contained in y[j], so we move to the next
# interval.
i += 1
# Any remaining intervals in x do not overlap with any of y, as if they did
# we would not have incremented j to the end, so can be added to the result
# as they are.
result.extend(x[i:])
return tuple(map(tuple, result))


def _intervals(s):
    """Return a tuple of intervals, covering the codepoints of characters in
    `s`.

    >>> _intervals('abcdef0123456789')
    ((48, 57), (97, 102))
    """
    # One single-codepoint interval per character, in ascending order.
    singletons = tuple((codepoint, codepoint) for codepoint in map(ord, sorted(s)))
    # Taking the union of the sequence with itself is what collapses runs of
    # consecutive (and duplicate) codepoints into ranges.
    return _union_intervals(singletons, singletons)


category_index_cache = {(): ()}


Expand Down Expand Up @@ -306,11 +187,14 @@ def _query_for_key(key):
pass
assert key
if set(key) == set(categories()):
result = ((0, sys.maxunicode),)
result = IntervalSet([(0, sys.maxunicode)])
else:
result = _union_intervals(_query_for_key(key[:-1]), charmap()[key[-1]])
category_index_cache[key] = result
return result
result = IntervalSet(_query_for_key(key[:-1])).union(
IntervalSet(charmap()[key[-1]])
)
assert isinstance(result, IntervalSet)
category_index_cache[key] = result.intervals
return result.intervals


limited_category_index_cache: cache_type = {}
Expand Down Expand Up @@ -344,14 +228,14 @@ def query(
if max_codepoint is None:
max_codepoint = sys.maxunicode
catkey = _category_key(exclude_categories, include_categories)
character_intervals = _intervals(include_characters or "")
exclude_intervals = _intervals(exclude_characters or "")
character_intervals = IntervalSet.from_string(include_characters or "")
exclude_intervals = IntervalSet.from_string(exclude_characters or "")
qkey = (
catkey,
min_codepoint,
max_codepoint,
character_intervals,
exclude_intervals,
character_intervals.intervals,
exclude_intervals.intervals,
)
try:
return limited_category_index_cache[qkey]
Expand All @@ -362,8 +246,6 @@ def query(
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
result = tuple(result)
result = _union_intervals(result, character_intervals)
result = _subtract_intervals(result, exclude_intervals)
result = (IntervalSet(result) | character_intervals) - exclude_intervals
limited_category_index_cache[qkey] = result
return result
9 changes: 9 additions & 0 deletions hypothesis-python/src/hypothesis/internal/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@
WINDOWS = platform.system() == "Windows"


def add_note(exc, note):
    """Record `note` on the exception `exc`.

    The native ``exc.add_note(note)`` is tried first.  If it raises
    ``AttributeError`` (e.g. the method is missing), the note is appended to
    ``exc.__notes__`` instead, and that list is created on demand.
    """
    try:
        exc.add_note(note)
    except AttributeError:
        if hasattr(exc, "__notes__"):
            exc.__notes__.append(note)
        else:
            # No notes yet: create the list holding this first note.
            exc.__notes__ = [note]


def escape_unicode_characters(s: str) -> str:
    """Return `s` encoded with the ``unicode_escape`` codec, as an ASCII str."""
    escaped_bytes = codecs.encode(s, "unicode_escape")
    return escaped_bytes.decode("ascii")

Expand Down

0 comments on commit 09d9cd5

Please sign in to comment.