Skip to content

Commit

Permalink
Merge pull request #2245 from Zac-HD/numpy-strings
Browse files Browse the repository at this point in the history
Generate longer strings for unsized dtypes
  • Loading branch information
Zac-HD committed Dec 1, 2019
2 parents 30cc2ea + 22ef32e commit 96289c7
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 3 deletions.
5 changes: 5 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
RELEASE_TYPE: patch

This patch fixes :issue:`2229`, where Numpy arrays of unsized strings would
only ever have strings of size one due to an interaction between our generation
logic and Numpy's allocation strategy.
32 changes: 29 additions & 3 deletions hypothesis-python/src/hypothesis/extra/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,14 +177,23 @@ def do_draw(self, data):
if 0 in self.shape:
return np.zeros(dtype=self.dtype, shape=self.shape)

# Because Numpy allocates memory for strings at array creation, if we have
# an unsized string dtype we'll fill an object array and then cast it back.
unsized_string_dtype = (
self.dtype.kind in (u"S", u"a", u"U") and self.dtype.itemsize == 0
)

# Reset this flag for each test case to emit warnings from set_element
# Skip the check for object or void (multi-element) dtypes, and for unsized
# string dtypes, which are filled via an object array and cast back afterwards
self._report_overflow = self.dtype.kind not in ("O", "V")
self._report_overflow = self.dtype.kind not in "OV" and not unsized_string_dtype

# This could legitimately be a np.empty, but the performance gains for
# that would be so marginal that there's really not much point risking
# undefined behaviour shenanigans.
result = np.zeros(shape=self.array_size, dtype=self.dtype)
result = np.zeros(
shape=self.array_size, dtype=object if unsized_string_dtype else self.dtype
)
print(self.dtype, result.dtype)

if self.fill.is_empty:
# We have no fill value (either because the user explicitly
Expand Down Expand Up @@ -261,8 +270,12 @@ def do_draw(self, data):
# single element, we both get an array with the right value in
# it and putmask will do the right thing by repeating the
# values of the array across the mask.
one_element = np.zeros(shape=1, dtype=self.dtype)
one_element = np.zeros(
shape=1, dtype=object if unsized_string_dtype else self.dtype
)
self.set_element(data, one_element, 0, self.fill)
if unsized_string_dtype:
one_element = one_element.astype(self.dtype)
fill_value = one_element[0]
if self.unique:
try:
Expand All @@ -278,6 +291,19 @@ def do_draw(self, data):

np.putmask(result, needs_fill, one_element)

if unsized_string_dtype:
out = result.astype(self.dtype)
mismatch = out != result
if mismatch.any():
note_deprecation(
"Array elements %r cannot be represented as dtype %r - instead "
"they becomes %r. Use a more precise strategy, e.g. without "
"trailing null bytes, as this will be an error future versions."
% (result[mismatch], self.dtype, out[mismatch]),
since="2019-07-28",
)
result = out

return result.reshape(self.shape)


Expand Down
6 changes: 6 additions & 0 deletions hypothesis-python/tests/numpy/test_argument_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,12 @@ def test_unicode_string_dtype_len_0(data):
assert data.draw(s).itemsize == 4


@checks_deprecated_behaviour
@given(nps.arrays(dtype="U", shape=1, elements=st.just("abc\0\0")))
def test_unicode_string_dtype_not_trimmed(arr):
    """Check the value stored for an element with trailing null characters.

    Every element is drawn as ``"abc\\0\\0"`` into a one-element array of
    unsized unicode dtype; the stored element must compare equal to ``"abc"``.
    The test is wrapped in ``checks_deprecated_behaviour`` - presumably because
    generating such elements triggers a deprecation warning.
    """
    # The array has shape 1, so unpacking yields its only element.
    (only_element,) = arr
    assert only_element == u"abc"


def test_test_basic_indices_kwonly_emulation():
with pytest.raises(TypeError):
nps.basic_indices((), 0, 1).validate()
Expand Down
6 changes: 6 additions & 0 deletions hypothesis-python/tests/numpy/test_gen_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,12 @@ def test_byte_string_dtypes_generate_unicode_strings(data):
assert isinstance(result, binary_type)


@pytest.mark.parametrize("dtype", ["U", "S", "a"])
def test_unsized_strings_length_gt_one(dtype):
    """Regression test: unsized string dtypes can yield strings of length >= 2.

    For each unsized string dtype code, asks ``find_any`` for a one-element
    array whose element has at least two characters/bytes.
    """
    # See https://github.com/HypothesisWorks/hypothesis/issues/2229
    one_element_arrays = nps.arrays(dtype=dtype, shape=1)

    def holds_long_string(arr):
        return len(arr[0]) >= 2

    find_any(one_element_arrays, holds_long_string)


@given(nps.arrays(dtype="int8", shape=st.integers(0, 20), unique=True))
def test_array_values_are_unique(arr):
assert len(set(arr)) == len(arr)
Expand Down

0 comments on commit 96289c7

Please sign in to comment.