From 74cd050f6c1c6db1131283c1905f6478b136a8b3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 10 Nov 2022 11:39:19 -0800 Subject: [PATCH] CLN: test_nanops.py (#49423) --- pandas/tests/test_nanops.py | 487 +++++++++++++++++++++++++----------- 1 file changed, 334 insertions(+), 153 deletions(-) diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 0e64181bd46a7..ae8791a774ed5 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -29,6 +29,175 @@ def skipna(request): return request.param +@pytest.fixture +def disable_bottleneck(monkeypatch): + with monkeypatch.context() as m: + m.setattr(nanops, "_USE_BOTTLENECK", False) + yield + + +@pytest.fixture +def arr_shape(): + return 11, 7 + + +@pytest.fixture +def arr_float(arr_shape): + np.random.seed(11235) + return np.random.randn(*arr_shape) + + +@pytest.fixture +def arr_complex(arr_float): + return arr_float + arr_float * 1j + + +@pytest.fixture +def arr_int(arr_shape): + np.random.seed(11235) + return np.random.randint(-10, 10, arr_shape) + + +@pytest.fixture +def arr_bool(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 2, arr_shape) == 0 + + +@pytest.fixture +def arr_str(arr_float): + return np.abs(arr_float).astype("S") + + +@pytest.fixture +def arr_utf(arr_float): + return np.abs(arr_float).astype("U") + + +@pytest.fixture +def arr_date(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 20000, arr_shape).astype("M8[ns]") + + +@pytest.fixture +def arr_tdelta(arr_shape): + np.random.seed(11235) + return np.random.randint(0, 20000, arr_shape).astype("m8[ns]") + + +@pytest.fixture +def arr_nan(arr_shape): + return np.tile(np.nan, arr_shape) + + +@pytest.fixture +def arr_float_nan(arr_float, arr_nan): + return np.vstack([arr_float, arr_nan]) + + +@pytest.fixture +def arr_nan_float1(arr_nan, arr_float): + return np.vstack([arr_nan, arr_float]) + + +@pytest.fixture +def arr_nan_nan(arr_nan): + return np.vstack([arr_nan, arr_nan]) + + +@pytest.fixture +def arr_inf(arr_float): + return arr_float * np.inf + + +@pytest.fixture +def arr_float_inf(arr_float, arr_inf): + return np.vstack([arr_float, arr_inf]) + + +@pytest.fixture +def arr_nan_inf(arr_nan, arr_inf): + return np.vstack([arr_nan, arr_inf]) + + +@pytest.fixture +def arr_float_nan_inf(arr_float, arr_nan, arr_inf): + return np.vstack([arr_float, arr_nan, arr_inf]) + + +@pytest.fixture +def arr_nan_nan_inf(arr_nan, arr_inf): + return np.vstack([arr_nan, arr_nan, arr_inf]) + + +@pytest.fixture +def arr_obj( + arr_float, arr_int, arr_bool, arr_complex, arr_str, arr_utf, arr_date, arr_tdelta +): + return np.vstack( + [ + arr_float.astype("O"), + arr_int.astype("O"), + arr_bool.astype("O"), + arr_complex.astype("O"), + arr_str.astype("O"), + arr_utf.astype("O"), + arr_date.astype("O"), + arr_tdelta.astype("O"), + ] + ) + + +@pytest.fixture +def arr_nan_nanj(arr_nan): + with np.errstate(invalid="ignore"): + return arr_nan + arr_nan * 1j + + +@pytest.fixture +def arr_complex_nan(arr_complex, arr_nan_nanj): + with np.errstate(invalid="ignore"): + return np.vstack([arr_complex, arr_nan_nanj]) + + +@pytest.fixture +def arr_nan_infj(arr_inf): + with np.errstate(invalid="ignore"): + return arr_inf * 1j + + +@pytest.fixture +def arr_complex_nan_infj(arr_complex, arr_nan_infj): + with np.errstate(invalid="ignore"): + return np.vstack([arr_complex, arr_nan_infj]) + + +@pytest.fixture +def arr_float_1d(arr_float): + return arr_float[:, 0] + + +@pytest.fixture +def arr_nan_1d(arr_nan): + return arr_nan[:, 0] + + +@pytest.fixture +def arr_float_nan_1d(arr_float_nan): + return arr_float_nan[:, 0] + + +@pytest.fixture +def arr_float1_nan_1d(arr_float1_nan): + return arr_float1_nan[:, 0] + + +@pytest.fixture +def arr_nan_float1_1d(arr_nan_float1): + return arr_nan_float1[:, 0] + + class TestnanopsDataFrame: def setup_method(self): np.random.seed(11235) @@ -299,45 +468,6 @@ def test_nanmean(self, skipna): nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False ) - @pytest.mark.parametrize("val", [2**55, -(2**55), 20150515061816532]) - def test_nanmean_overflow(self, val): - # GH 10155 - # In the previous implementation mean can overflow for int dtypes, it - # is now consistent with numpy - - ser = Series(val, index=range(500), dtype=np.int64) - result = ser.mean() - np_result = ser.values.mean() - assert result == val - assert result == np_result - assert result.dtype == np.float64 - - @pytest.mark.parametrize( - "dtype", - [ - np.int16, - np.int32, - np.int64, - np.float32, - np.float64, - getattr(np, "float128", None), - ], - ) - def test_returned_dtype(self, dtype): - if dtype is None: - # no float128 available - return - - ser = Series(range(10), dtype=dtype) - group_a = ["mean", "std", "var", "skew", "kurt"] - group_b = ["min", "max"] - for method in group_a + group_b: - result = getattr(ser, method)() - if is_integer_dtype(dtype) and method in group_a: - assert result.dtype == np.float64 - else: - assert result.dtype == dtype - def test_nanmedian(self, skipna): with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) @@ -623,124 +753,137 @@ def test_nancov(self): targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1) - @pytest.mark.parametrize( - "op,nanop", - [ - (operator.eq, nanops.naneq), - (operator.ne, nanops.nanne), - (operator.gt, nanops.nangt), - (operator.ge, nanops.nange), - (operator.lt, nanops.nanlt), - (operator.le, nanops.nanle), - ], - ) - def test_nan_comparison(self, op, nanop): - targ0 = op(self.arr_float, self.arr_float1) - arr_float = self.arr_float - arr_float1 = self.arr_float1 - arr_nan = self.arr_nan - arr_nan_nan = self.arr_nan_nan - arr_float_nan = self.arr_float_nan - arr_float1_nan = self.arr_float1_nan - arr_nan_float1 = self.arr_nan_float1 - - while targ0.ndim: - res0 = nanop(arr_float, arr_float1) - tm.assert_almost_equal(targ0, res0) - - if targ0.ndim > 1: - targ1 = np.vstack([targ0, arr_nan]) - else: - targ1 = np.hstack([targ0, arr_nan]) - res1 = nanop(arr_float_nan, arr_float1_nan) - tm.assert_numpy_array_equal(targ1, res1, check_dtype=False) - - targ2 = arr_nan_nan - res2 = nanop(arr_float_nan, arr_nan_float1) - tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) - - # Lower dimension for next step in the loop - arr_float = np.take(arr_float, 0, axis=-1) - arr_float1 = np.take(arr_float1, 0, axis=-1) - arr_nan = np.take(arr_nan, 0, axis=-1) - arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) - arr_float_nan = np.take(arr_float_nan, 0, axis=-1) - arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1) - arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) - targ0 = np.take(targ0, 0, axis=-1) - @pytest.mark.parametrize( - "arr, correct", - [ - ("arr_complex", False), - ("arr_int", False), - ("arr_bool", False), - ("arr_str", False), - ("arr_utf", False), - ("arr_complex", False), - ("arr_complex_nan", False), - ("arr_nan_nanj", False), - ("arr_nan_infj", True), - ("arr_complex_nan_infj", True), - ], - ) - def test__has_infs_non_float(self, arr, correct): - val = getattr(self, arr) - while getattr(val, "ndim", True): - res0 = nanops._has_infs(val) - if correct: - assert res0 - else: - assert not res0 +@pytest.mark.parametrize( + "op,nanop", + [ + (operator.eq, nanops.naneq), + (operator.ne, nanops.nanne), + (operator.gt, nanops.nangt), + (operator.ge, nanops.nange), + (operator.lt, nanops.nanlt), + (operator.le, nanops.nanle), + ], +) +def test_nan_comparison(request, op, nanop, disable_bottleneck): + arr_float = request.getfixturevalue("arr_float") + arr_float1 = request.getfixturevalue("arr_float") + targ0 = op(arr_float, arr_float1) + arr_nan = request.getfixturevalue("arr_nan") + arr_nan_nan = request.getfixturevalue("arr_nan_nan") + arr_float_nan = request.getfixturevalue("arr_float_nan") + arr_float1_nan = request.getfixturevalue("arr_float_nan") + arr_nan_float1 = request.getfixturevalue("arr_nan_float1") + + while targ0.ndim: + res0 = nanop(arr_float, arr_float1) + tm.assert_almost_equal(targ0, res0) + + if targ0.ndim > 1: + targ1 = np.vstack([targ0, arr_nan]) + else: + targ1 = np.hstack([targ0, arr_nan]) + res1 = nanop(arr_float_nan, arr_float1_nan) + tm.assert_numpy_array_equal(targ1, res1, check_dtype=False) + + targ2 = arr_nan_nan + res2 = nanop(arr_float_nan, arr_nan_float1) + tm.assert_numpy_array_equal(targ2, res2, check_dtype=False) + + # Lower dimension for next step in the loop + arr_float = np.take(arr_float, 0, axis=-1) + arr_float1 = np.take(arr_float1, 0, axis=-1) + arr_nan = np.take(arr_nan, 0, axis=-1) + arr_nan_nan = np.take(arr_nan_nan, 0, axis=-1) + arr_float_nan = np.take(arr_float_nan, 0, axis=-1) + arr_float1_nan = np.take(arr_float1_nan, 0, axis=-1) + arr_nan_float1 = np.take(arr_nan_float1, 0, axis=-1) + targ0 = np.take(targ0, 0, axis=-1) - if not hasattr(val, "ndim"): - break - # Reduce dimension for next step in the loop - val = np.take(val, 0, axis=-1) +@pytest.mark.parametrize( + "arr, correct", + [ + ("arr_complex", False), + ("arr_int", False), + ("arr_bool", False), + ("arr_str", False), + ("arr_utf", False), + ("arr_complex", False), + ("arr_complex_nan", False), + ("arr_nan_nanj", False), + ("arr_nan_infj", True), + ("arr_complex_nan_infj", True), + ], +) +def test_has_infs_non_float(request, arr, correct, disable_bottleneck): + val = request.getfixturevalue(arr) + while getattr(val, "ndim", True): + res0 = nanops._has_infs(val) + if correct: + assert res0 + else: + assert not res0 + + if not hasattr(val, "ndim"): + break + + # Reduce dimension for next step in the loop + val = np.take(val, 0, axis=-1) - @pytest.mark.parametrize( - "arr, correct", - [ - ("arr_float", False), - ("arr_nan", False), - ("arr_float_nan", False), - ("arr_nan_nan", False), - ("arr_float_inf", True), - ("arr_inf", True), - ("arr_nan_inf", True), - ("arr_float_nan_inf", True), - ("arr_nan_nan_inf", True), - ], - ) - @pytest.mark.parametrize("astype", [None, "f4", "f2"]) - def test__has_infs_floats(self, arr, correct, astype): - val = getattr(self, arr) - if astype is not None: - val = val.astype(astype) - while getattr(val, "ndim", True): - res0 = nanops._has_infs(val) - if correct: - assert res0 - else: - assert not res0 - if not hasattr(val, "ndim"): - break +@pytest.mark.parametrize( + "arr, correct", + [ + ("arr_float", False), + ("arr_nan", False), + ("arr_float_nan", False), + ("arr_nan_nan", False), + ("arr_float_inf", True), + ("arr_inf", True), + ("arr_nan_inf", True), + ("arr_float_nan_inf", True), + ("arr_nan_nan_inf", True), + ], +) +@pytest.mark.parametrize("astype", [None, "f4", "f2"]) +def test_has_infs_floats(request, arr, correct, astype, disable_bottleneck): + val = request.getfixturevalue(arr) + if astype is not None: + val = val.astype(astype) + while getattr(val, "ndim", True): + res0 = nanops._has_infs(val) + if correct: + assert res0 + else: + assert not res0 - # Reduce dimension for next step in the loop - val = np.take(val, 0, axis=-1) + if not hasattr(val, "ndim"): + break - def test__bn_ok_dtype(self): - assert nanops._bn_ok_dtype(self.arr_float.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_complex.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_int.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_bool.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_str.dtype, "test") - assert nanops._bn_ok_dtype(self.arr_utf.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_date.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_tdelta.dtype, "test") - assert not nanops._bn_ok_dtype(self.arr_obj.dtype, "test") + # Reduce dimension for next step in the loop + val = np.take(val, 0, axis=-1) + + +@pytest.mark.parametrize( + "fixture", ["arr_float", "arr_complex", "arr_int", "arr_bool", "arr_str", "arr_utf"] +) +def test_bn_ok_dtype(fixture, request, disable_bottleneck): + obj = request.getfixturevalue(fixture) + assert nanops._bn_ok_dtype(obj.dtype, "test") + + +@pytest.mark.parametrize( + "fixture", + [ + "arr_date", + "arr_tdelta", + "arr_obj", + ], +) +def test_bn_not_ok_dtype(fixture, request, disable_bottleneck): + obj = request.getfixturevalue(fixture) + assert not nanops._bn_ok_dtype(obj.dtype, "test") class TestEnsureNumeric: @@ -1111,7 +1254,7 @@ def test_nanops_independent_of_mask_param(operation): @pytest.mark.parametrize("min_count", [-1, 0]) -def test_check_below_min_count__negative_or_zero_min_count(min_count): +def test_check_below_min_count_negative_or_zero_min_count(min_count): # GH35227 result = nanops.check_below_min_count((21, 37), None, min_count) expected_result = False @@ -1122,7 +1265,7 @@ def test_check_below_min_count__negative_or_zero_min_count(min_count): "mask", [None, np.array([False, False, True]), np.array([True] + 9 * [False])] ) @pytest.mark.parametrize("min_count, expected_result", [(1, False), (101, True)]) -def test_check_below_min_count__positive_min_count(mask, min_count, expected_result): +def test_check_below_min_count_positive_min_count(mask, min_count, expected_result): # GH35227 shape = (10, 10) result = nanops.check_below_min_count(shape, mask, min_count) @@ -1132,7 +1275,7 @@ def test_check_below_min_count__positive_min_count(mask, min_count, expected_res @td.skip_if_windows @td.skip_if_32bit @pytest.mark.parametrize("min_count, expected_result", [(1, False), (2812191852, True)]) -def test_check_below_min_count__large_shape(min_count, expected_result): +def test_check_below_min_count_large_shape(min_count, expected_result): # GH35227 large shape used to show that the issue is fixed shape = (2244367, 1253) result = nanops.check_below_min_count(shape, mask=None, min_count=min_count) @@ -1143,3 +1286,41 @@ def test_check_below_min_count__large_shape(min_count, expected_result): def test_check_bottleneck_disallow(any_real_numpy_dtype, func): # GH 42878 bottleneck sometimes produces unreliable results for mean and sum assert not nanops._bn_ok_dtype(np.dtype(any_real_numpy_dtype).type, func) + + +@pytest.mark.parametrize("val", [2**55, -(2**55), 20150515061816532]) +def test_nanmean_overflow(disable_bottleneck, val): + # GH 10155 + # In the previous implementation mean can overflow for int dtypes, it + # is now consistent with numpy + + ser = Series(val, index=range(500), dtype=np.int64) + result = ser.mean() + np_result = ser.values.mean() + assert result == val + assert result == np_result + assert result.dtype == np.float64 + + +@pytest.mark.parametrize( + "dtype", + [ + np.int16, + np.int32, + np.int64, + np.float32, + np.float64, + getattr(np, "float128", None), + ], +) +@pytest.mark.parametrize("method", ["mean", "std", "var", "skew", "kurt", "min", "max"]) +def test_returned_dtype(disable_bottleneck, dtype, method): + if dtype is None: + pytest.skip("np.float128 not available") + + ser = Series(range(10), dtype=dtype) + result = getattr(ser, method)() + if is_integer_dtype(dtype) and method not in ["min", "max"]: + assert result.dtype == np.float64 + else: + assert result.dtype == dtype