HypothesisWorks · Zac-HD · Dec 1, 2019 · Nov 30, 2019 · Stranger6667 · Dec 1, 2019
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,5 @@
+RELEASE_TYPE: patch
+
+This patch fixes :issue:`2229`, where NumPy arrays of unsized strings would
+only ever have strings of size one due to an interaction between our generation
+logic and NumPy's allocation strategy.
diff --git a/hypothesis-python/src/hypothesis/extra/numpy.py b/hypothesis-python/src/hypothesis/extra/numpy.py
@@ -177,14 +177,22 @@ def do_draw(self, data):
         if 0 in self.shape:
             return np.zeros(dtype=self.dtype, shape=self.shape)
 
+        # Because Numpy allocates memory for strings at array creation, if we have
+        # an unsized string dtype we'll fill an object array and then cast it back.
+        unsized_string_dtype = (
+            self.dtype.kind in (u"S", u"a", u"U") and self.dtype.itemsize == 0
+        )
+
         # Reset this flag for each test case to emit warnings from set_element
         # Skip the check for object or void (multi-element) dtypes
-        self._report_overflow = self.dtype.kind not in ("O", "V")
+        self._report_overflow = self.dtype.kind not in "OV" and not unsized_string_dtype
 
         # This could legitimately be a np.empty, but the performance gains for
         # that would be so marginal that there's really not much point risking
         # undefined behaviour shenanigans.
-        result = np.zeros(shape=self.array_size, dtype=self.dtype)
+        result = np.zeros(
+            shape=self.array_size, dtype=object if unsized_string_dtype else self.dtype
+        )
 
         if self.fill.is_empty:
             # We have no fill value (either because the user explicitly
@@ -261,8 +270,12 @@ def do_draw(self, data):
                 # single element, we both get an array with the right value in
                 # it and putmask will do the right thing by repeating the
                 # values of the array across the mask.
-                one_element = np.zeros(shape=1, dtype=self.dtype)
+                one_element = np.zeros(
+                    shape=1, dtype=object if unsized_string_dtype else self.dtype
+                )
                 self.set_element(data, one_element, 0, self.fill)
+                if unsized_string_dtype:  # cast fill back to the string dtype
+                    one_element = one_element.astype(self.dtype)
                 fill_value = one_element[0]
                 if self.unique:
                     try:
@@ -278,6 +291,19 @@ def do_draw(self, data):
 
                 np.putmask(result, needs_fill, one_element)
 
+        if unsized_string_dtype:
+            out = result.astype(self.dtype)
+            mismatch = out != result  # elements changed by the cast
+            if mismatch.any():
+                note_deprecation(
+                    "Array elements %r cannot be represented as dtype %r - instead "
+                    "they become %r.  Use a more precise strategy, e.g. without "
+                    "trailing null bytes, as this will be an error in future versions."
+                    % (result[mismatch], self.dtype, out[mismatch]),
+                    since="2019-07-28",
+                )
+            result = out
+
         return result.reshape(self.shape)
 
 

diff --git a/hypothesis-python/tests/numpy/test_argument_validation.py b/hypothesis-python/tests/numpy/test_argument_validation.py
@@ -306,6 +306,12 @@ def test_unicode_string_dtype_len_0(data):
     assert data.draw(s).itemsize == 4
 
 
+@checks_deprecated_behaviour
+@given(nps.arrays(dtype="U", shape=1, elements=st.just("abc\0\0")))
+def test_unicode_string_dtype_not_trimmed(arr):
+    assert arr[0] == u"abc"  # trailing nulls are lost in the cast to dtype "U"
+
+
 def test_test_basic_indices_kwonly_emulation():
     with pytest.raises(TypeError):
         nps.basic_indices((), 0, 1).validate()

diff --git a/hypothesis-python/tests/numpy/test_gen_data.py b/hypothesis-python/tests/numpy/test_gen_data.py
@@ -338,6 +338,12 @@ def test_byte_string_dtypes_generate_unicode_strings(data):
     assert isinstance(result, binary_type)
 
 
+@pytest.mark.parametrize("dtype", ["U", "S", "a"])
+def test_unsized_strings_length_gt_one(dtype):
+    # Regression test for https://github.com/HypothesisWorks/hypothesis/issues/2229
+    find_any(nps.arrays(dtype=dtype, shape=1), lambda arr: len(arr[0]) >= 2)
+
+
 @given(nps.arrays(dtype="int8", shape=st.integers(0, 20), unique=True))
 def test_array_values_are_unique(arr):
     assert len(set(arr)) == len(arr)