Adjust tests in array folder for new string option (#56188)

* Adjust tests in array directory for new string option
* BUG: value_counts not preserving object dtype
* Adjust tests in array folder for new string option
* Fixup
* Fix
* Fix
* Revert "BUG: value_counts not preserving object dtype"

This reverts commit f570a4f
pandas-dev · Dec 9, 2023 · 8aa7a96 · 8aa7a96
1 parent f0b61c5
commit 8aa7a96
Show file tree

Hide file tree

Showing 12 changed files with 129 additions and 47 deletions.
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -932,7 +932,10 @@ def value_counts_internal(
             idx = Index(keys)
             if idx.dtype == bool and keys.dtype == object:
                 idx = idx.astype(object)
-            elif idx.dtype != keys.dtype:
+            elif (
+                idx.dtype != keys.dtype  # noqa: PLR1714  # # pylint: disable=R1714
+                and idx.dtype != "string[pyarrow_numpy]"
+            ):
                 warnings.warn(
                     # GH#56161
                     "The behavior of value_counts with object-dtype is deprecated. "

diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     # invalid ops
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        err = TypeError
+
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
@@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         [
             r"unsupported operand type\(s\) for",
             "Concatenation operation is not implemented for NumPy arrays",
+            "has no kernel",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(err, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
@@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
                 r"unsupported operand type\(s\) for",
                 "can only concatenate str",
                 "not all arguments converted during string formatting",
+                "has no kernel",
+                "not implemented",
             ]
         )
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(err, match=msg):
             ops(pd.Series("foo", index=s.index))
diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py
@@ -89,7 +89,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
 
-        msg = r"Cannot cast object dtype to float64"
+        msg = r"Cannot cast object|string dtype to float64"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
 

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
     def test_constructor_np_strs(self):
         # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
         cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])

diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py
@@ -92,7 +92,7 @@ def test_comparisons(self, factor):
             cat > cat_unordered
 
         # comparison (in both directions) with Series will raise
-        s = Series(["b", "b", "b"])
+        s = Series(["b", "b", "b"], dtype=object)
         msg = (
             "Cannot compare a Categorical for op __gt__ with type "
             r"<class 'numpy\.ndarray'>"
@@ -108,7 +108,7 @@ def test_comparisons(self, factor):
 
         # comparison with numpy.array will raise in both direction, but only on
         # newer numpy versions
-        a = np.array(["b", "b", "b"])
+        a = np.array(["b", "b", "b"], dtype=object)
         with pytest.raises(TypeError, match=msg):
             cat > a
         with pytest.raises(TypeError, match=msg):
@@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
         cat_base = Series(
             Categorical(base, categories=cat.cat.categories, ordered=True)
         )
-        s = Series(base)
+        s = Series(base, dtype=object if base == list("bbb") else None)
         a = np.array(base)
 
         # comparisons need to take categories ordering into account

diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
@@ -1,9 +1,13 @@
 import numpy as np
+import pytest
+
+from pandas._config import using_pyarrow_string_dtype
 
 from pandas import (
     Categorical,
     CategoricalDtype,
     CategoricalIndex,
+    Index,
     Series,
     date_range,
     option_context,
@@ -13,11 +17,17 @@
 
 
 class TestCategoricalReprWithFactor:
-    def test_print(self, factor):
-        expected = [
-            "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-            "Categories (3, object): ['a' < 'b' < 'c']",
-        ]
+    def test_print(self, factor, using_infer_string):
+        if using_infer_string:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, string): [a < b < c]",
+            ]
+        else:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, object): ['a' < 'b' < 'c']",
+            ]
         expected = "\n".join(expected)
         actual = repr(factor)
         assert actual == expected
@@ -26,7 +36,7 @@ def test_print(self, factor):
 class TestCategoricalRepr:
     def test_big_print(self):
         codes = np.array([0, 1, 2, 0, 1, 2] * 100)
-        dtype = CategoricalDtype(categories=["a", "b", "c"])
+        dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
         factor = Categorical.from_codes(codes, dtype=dtype)
         expected = [
             "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
@@ -40,13 +50,13 @@ def test_big_print(self):
         assert actual == expected
 
     def test_empty_print(self):
-        factor = Categorical([], ["a", "b", "c"])
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object))
         expected = "[], Categories (3, object): ['a', 'b', 'c']"
         actual = repr(factor)
         assert actual == expected
 
         assert expected == actual
-        factor = Categorical([], ["a", "b", "c"], ordered=True)
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
         expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
         actual = repr(factor)
         assert expected == actual
@@ -66,6 +76,10 @@ def test_print_none_width(self):
         with option_context("display.width", None):
             assert exp == repr(a)
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Change once infer_string is set to True by default",
+    )
     def test_unicode_print(self):
         c = Categorical(["aaaaa", "bb", "cccc"] * 20)
         expected = """\

diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py
@@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Concatenation operation is not implemented for NumPy arrays",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series("foo", index=s.index))
 
     msg = "|".join(
@@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             ),
             r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
 
 

diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Addition/subtraction of integers and integer-arrays with Timestamp",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
     str_ser = pd.Series("foo", index=s.index)
     # with pytest.raises(TypeError, match=msg):
-    if all_arithmetic_operators in [
-        "__mul__",
-        "__rmul__",
-    ]:  # (data[~data.isna()] >= 0).all():
+    if (
+        all_arithmetic_operators
+        in [
+            "__mul__",
+            "__rmul__",
+        ]
+        and not using_infer_string
+    ):  # (data[~data.isna()] >= 0).all():
         res = ops(str_ser)
         expected = pd.Series(["foo" * x for x in data], index=s.index)
         expected = expected.fillna(np.nan)
@@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         #  more-correct than np.nan here.
         tm.assert_series_equal(res, expected)
     else:
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(errs, match=msg):
             ops(str_ser)
 
     msg = "|".join(
@@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             r"can only concatenate str \(not \"int\"\) to str",
             "not all arguments converted during string",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
 
 

diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
@@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
         ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
     ],
 )
-def test_mixed_reductions(op, expected):
+def test_mixed_reductions(op, expected, using_infer_string):
+    if op in ["any", "all"] and using_infer_string:
+        expected = expected.astype("bool")
     df = DataFrame(
         {
             "A": ["a", "b", "b"],

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -191,7 +191,7 @@ def test_mul(dtype):
 @pytest.mark.xfail(reason="GH-28527")
 def test_add_strings(dtype):
     arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
-    df = pd.DataFrame([["t", "y", "v", "w"]])
+    df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
     assert arr.__add__(df) is NotImplemented
 
     result = arr + df
@@ -498,10 +498,17 @@ def test_arrow_array(dtype):
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_roundtrip(dtype, string_storage2):
+def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
     # roundtrip possible from arrow 1.0.0
     pa = pytest.importorskip("pyarrow")
 
+    if using_infer_string and string_storage2 != "pyarrow_numpy":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="infer_string takes precedence over string storage"
+            )
+        )
+
     data = pd.array(["a", "b", None], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)
@@ -516,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2):
 
 
 @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
-def test_arrow_load_from_zero_chunks(dtype, string_storage2):
+def test_arrow_load_from_zero_chunks(
+    dtype, string_storage2, request, using_infer_string
+):
     # GH-41040
     pa = pytest.importorskip("pyarrow")
 
+    if using_infer_string and string_storage2 != "pyarrow_numpy":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="infer_string takes precedence over string storage"
+            )
+        )
+
     data = pd.array([], dtype=dtype)
     df = pd.DataFrame({"a": data})
     table = pa.table(df)