Skip to content

Commit

Permalink
Adjust tests in array folder for new string option (#56188)
Browse files (browse the repository at this point in the history)
* Adjust tests in array directory for new string option

* BUG: value_counts not preserving object dtype

* Adjust tests in array folder for new string option

* Fixup

* Fix

* Fix

* Revert "BUG: value_counts not preserving object dtype"

This reverts commit f570a4f
  • Loading branch information
phofl committed Dec 9, 2023
1 parent f0b61c5 commit 8aa7a96
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 47 deletions.
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,7 +932,10 @@ def value_counts_internal(
idx = Index(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)
elif idx.dtype != keys.dtype:
elif (
idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714
and idx.dtype != "string[pyarrow_numpy]"
):
warnings.warn(
# GH#56161
"The behavior of value_counts with object-dtype is deprecated. "
Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/arrays/boolean/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
# invalid ops

if using_infer_string:
import pyarrow as pa

err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
err = TypeError

op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
Expand All @@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
[
r"unsupported operand type\(s\) for",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(err, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
Expand All @@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
r"unsupported operand type\(s\) for",
"can only concatenate str",
"not all arguments converted during string formatting",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(err, match=msg):
ops(pd.Series("foo", index=s.index))
2 changes: 1 addition & 1 deletion pandas/tests/arrays/categorical/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def test_astype(self, ordered):
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)

msg = r"Cannot cast object dtype to float64"
msg = r"Cannot cast object|string dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)

Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
Expand Down Expand Up @@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")

@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/arrays/categorical/test_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def test_comparisons(self, factor):
cat > cat_unordered

# comparison (in both directions) with Series will raise
s = Series(["b", "b", "b"])
s = Series(["b", "b", "b"], dtype=object)
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
Expand All @@ -108,7 +108,7 @@ def test_comparisons(self, factor):

# comparison with numpy.array will raise in both direction, but only on
# newer numpy versions
a = np.array(["b", "b", "b"])
a = np.array(["b", "b", "b"], dtype=object)
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
Expand Down Expand Up @@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
cat_base = Series(
Categorical(base, categories=cat.cat.categories, ordered=True)
)
s = Series(base)
s = Series(base, dtype=object if base == list("bbb") else None)
a = np.array(base)

# comparisons need to take categories ordering into account
Expand Down
30 changes: 22 additions & 8 deletions pandas/tests/arrays/categorical/test_repr.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
Index,
Series,
date_range,
option_context,
Expand All @@ -13,11 +17,17 @@


class TestCategoricalReprWithFactor:
def test_print(self, factor):
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
def test_print(self, factor, using_infer_string):
if using_infer_string:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, string): [a < b < c]",
]
else:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
Expand All @@ -26,7 +36,7 @@ def test_print(self, factor):
class TestCategoricalRepr:
def test_big_print(self):
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
dtype = CategoricalDtype(categories=["a", "b", "c"])
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
factor = Categorical.from_codes(codes, dtype=dtype)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
Expand All @@ -40,13 +50,13 @@ def test_big_print(self):
assert actual == expected

def test_empty_print(self):
factor = Categorical([], ["a", "b", "c"])
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected

assert expected == actual
factor = Categorical([], ["a", "b", "c"], ordered=True)
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
Expand All @@ -66,6 +76,10 @@ def test_print_none_width(self):
with option_context("display.width", None):
assert exp == repr(a)

@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Change once infer_string is set to True by default",
)
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
Expand Down
21 changes: 16 additions & 5 deletions pandas/tests/arrays/floating/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series("foo", index=s.index))

msg = "|".join(
Expand All @@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
),
r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
33 changes: 24 additions & 9 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
# -----------------------------------------------------------------------------


def test_error_invalid_values(data, all_arithmetic_operators):
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)

if using_infer_string:
import pyarrow as pa

errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError

# invalid scalars
msg = "|".join(
[
Expand All @@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Addition/subtraction of integers and integer-arrays with Timestamp",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops("foo")
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Timestamp("20180101"))

# invalid array-likes
str_ser = pd.Series("foo", index=s.index)
# with pytest.raises(TypeError, match=msg):
if all_arithmetic_operators in [
"__mul__",
"__rmul__",
]: # (data[~data.isna()] >= 0).all():
if (
all_arithmetic_operators
in [
"__mul__",
"__rmul__",
]
and not using_infer_string
): # (data[~data.isna()] >= 0).all():
res = ops(str_ser)
expected = pd.Series(["foo" * x for x in data], index=s.index)
expected = expected.fillna(np.nan)
Expand All @@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(str_ser)

msg = "|".join(
Expand All @@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
r"can only concatenate str \(not \"int\"\) to str",
"not all arguments converted during string",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
]
)
with pytest.raises(TypeError, match=msg):
with pytest.raises(errs, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/arrays/integer/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
],
)
def test_mixed_reductions(op, expected):
def test_mixed_reductions(op, expected, using_infer_string):
if op in ["any", "all"] and using_infer_string:
expected = expected.astype("bool")
df = DataFrame(
{
"A": ["a", "b", "b"],
Expand Down
22 changes: 19 additions & 3 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def test_mul(dtype):
@pytest.mark.xfail(reason="GH-28527")
def test_add_strings(dtype):
arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
df = pd.DataFrame([["t", "y", "v", "w"]])
df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
assert arr.__add__(df) is NotImplemented

result = arr + df
Expand Down Expand Up @@ -498,10 +498,17 @@ def test_arrow_array(dtype):


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_roundtrip(dtype, string_storage2):
def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string):
# roundtrip possible from arrow 1.0.0
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand All @@ -516,10 +523,19 @@ def test_arrow_roundtrip(dtype, string_storage2):


@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
def test_arrow_load_from_zero_chunks(dtype, string_storage2):
def test_arrow_load_from_zero_chunks(
dtype, string_storage2, request, using_infer_string
):
# GH-41040
pa = pytest.importorskip("pyarrow")

if using_infer_string and string_storage2 != "pyarrow_numpy":
request.applymarker(
pytest.mark.xfail(
reason="infer_string takes precedence over string storage"
)
)

data = pd.array([], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
Expand Down

0 comments on commit 8aa7a96

Please sign in to comment.