diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst index 76b2a5d6ffd47..66068f9faa923 100644 --- a/doc/source/whatsnew/v1.4.2.rst +++ b/doc/source/whatsnew/v1.4.2.rst @@ -32,6 +32,8 @@ Bug fixes - Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`) - Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`) - Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`) +- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`) +- Fixed :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 72ebbcbc65e5e..6d74c6db1f7ed 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -837,12 +837,6 @@ def _gotitem(self, key, ndim, subset=None): subset = self.obj.set_index(self._on) return super()._gotitem(key, ndim, subset=subset) - def _validate_monotonic(self): - """ - Validate that "on" is monotonic; already validated at a higher level. - """ - pass - class Window(BaseWindow): """ @@ -1687,7 +1681,7 @@ def _validate(self): or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex)) ) and isinstance(self.window, (str, BaseOffset, timedelta)): - self._validate_monotonic() + self._validate_datetimelike_monotonic() # this will raise ValueError on non-fixed freqs try: @@ -1712,18 +1706,24 @@ def _validate(self): elif not is_integer(self.window) or self.window < 0: raise ValueError("window must be an integer 0 or greater") - def _validate_monotonic(self): + def _validate_datetimelike_monotonic(self): """ - Validate monotonic (increasing or decreasing). + Validate self._on is monotonic (increasing or decreasing) and has + no NaT values for frequency windows. """ + if self._on.hasnans: + self._raise_monotonic_error("values must not have NaT") if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): - self._raise_monotonic_error() + self._raise_monotonic_error("values must be monotonic") - def _raise_monotonic_error(self): - formatted = self.on - if self.on is None: - formatted = "index" - raise ValueError(f"{formatted} must be monotonic") + def _raise_monotonic_error(self, msg: str): + on = self.on + if on is None: + if self.axis == 0: + on = "index" + else: + on = "column" + raise ValueError(f"{on} {msg}") @doc( _shared_docs["aggregate"], @@ -2631,12 +2631,20 @@ def _get_window_indexer(self) -> GroupbyIndexer: ) return window_indexer - def _validate_monotonic(self): + def _validate_datetimelike_monotonic(self): """ - Validate that on is monotonic; + Validate that each group in self._on is monotonic """ - if ( - not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing) - or self._on.hasnans - ): - self._raise_monotonic_error() + # GH 46061 + if self._on.hasnans: + self._raise_monotonic_error("values must not have NaT") + for group_indices in self._grouper.indices.values(): + group_on = self._on.take(group_indices) + if not ( + group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing + ): + on = "index" if self.on is None else self.on + raise ValueError( + f"Each group within {on} must be monotonic. " + f"Sort the values in {on} first." + ) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 6ec19e4899d53..f1d84f7ae1cbf 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -651,7 +651,7 @@ def test_groupby_rolling_nans_in_index(self, rollings, key): ) if key == "index": df = df.set_index("a") - with pytest.raises(ValueError, match=f"{key} must be monotonic"): + with pytest.raises(ValueError, match=f"{key} values must not have NaT"): df.groupby("c").rolling("60min", **rollings) @pytest.mark.parametrize("group_keys", [True, False]) @@ -895,6 +895,83 @@ def test_nan_and_zero_endpoints(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_non_monotonic(self): + # GH 43909 + + shuffled = [3, 0, 1, 2] + sec = 1_000 + df = DataFrame( + [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled] + ) + with pytest.raises(ValueError, match=r".* must be monotonic"): + df.groupby("c").rolling(on="t", window="3s") + + def test_groupby_monotonic(self): + + # GH 15130 + # we don't need to validate monotonicity when grouping + + # GH 43909 we should raise an error here to match + # behaviour of non-groupby rolling. + + data = [ + ["David", "1/1/2015", 100], + ["David", "1/5/2015", 500], + ["David", "5/30/2015", 50], + ["David", "7/25/2015", 50], + ["Ryan", "1/4/2014", 100], + ["Ryan", "1/19/2015", 500], + ["Ryan", "3/31/2016", 50], + ["Joe", "7/1/2015", 100], + ["Joe", "9/9/2015", 500], + ["Joe", "10/15/2015", 50], + ] + + df = DataFrame(data=data, columns=["name", "date", "amount"]) + df["date"] = to_datetime(df["date"]) + df = df.sort_values("date") + + expected = ( + df.set_index("date") + .groupby("name") + .apply(lambda x: x.rolling("180D")["amount"].sum()) + ) + result = df.groupby("name").rolling("180D", on="date")["amount"].sum() + tm.assert_series_equal(result, expected) + + def test_datelike_on_monotonic_within_each_group(self): + # GH 13966 (similar to #15130, closed by #15175) + + # superseded by 43909 + # GH 46061: OK if the on is monotonic relative to each each group + + dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") + df = DataFrame( + { + "A": [1] * 20 + [2] * 12 + [3] * 8, + "B": np.concatenate((dates, dates)), + "C": np.arange(40), + } + ) + + expected = ( + df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) + ) + result = df.groupby("A").rolling("4s", on="B").C.mean() + tm.assert_series_equal(result, expected) + + def test_datelike_on_not_monotonic_within_each_group(self): + # GH 46061 + df = DataFrame( + { + "A": [1] * 3 + [2] * 3, + "B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2, + "C": range(6), + } + ) + with pytest.raises(ValueError, match="Each group within B must be monotonic."): + df.groupby("A").rolling("365D", on="B") + class TestExpanding: def setup_method(self): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 814bd6b998182..b60f2e60e1035 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1419,18 +1419,6 @@ def test_groupby_rolling_nan_included(): tm.assert_frame_equal(result, expected) -def test_groupby_rolling_non_monotonic(): - # GH 43909 - - shuffled = [3, 0, 1, 2] - sec = 1_000 - df = DataFrame( - [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled] - ) - with pytest.raises(ValueError, match=r".* must be monotonic"): - df.groupby("c").rolling(on="t", window="3s") - - @pytest.mark.parametrize("method", ["skew", "kurt"]) def test_rolling_skew_kurt_numerical_stability(method): # GH#6929 diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index f2cf7bd47e15b..274c986ac28f5 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -5,10 +5,10 @@ DataFrame, Index, MultiIndex, + NaT, Series, Timestamp, date_range, - to_datetime, ) import pandas._testing as tm @@ -134,7 +134,7 @@ def test_non_monotonic_on(self): assert not df.index.is_monotonic - msg = "index must be monotonic" + msg = "index values must be monotonic" with pytest.raises(ValueError, match=msg): df.rolling("2s").sum() @@ -643,65 +643,6 @@ def agg_by_day(x): tm.assert_frame_equal(result, expected) - def test_groupby_monotonic(self): - - # GH 15130 - # we don't need to validate monotonicity when grouping - - # GH 43909 we should raise an error here to match - # behaviour of non-groupby rolling. - - data = [ - ["David", "1/1/2015", 100], - ["David", "1/5/2015", 500], - ["David", "5/30/2015", 50], - ["David", "7/25/2015", 50], - ["Ryan", "1/4/2014", 100], - ["Ryan", "1/19/2015", 500], - ["Ryan", "3/31/2016", 50], - ["Joe", "7/1/2015", 100], - ["Joe", "9/9/2015", 500], - ["Joe", "10/15/2015", 50], - ] - - df = DataFrame(data=data, columns=["name", "date", "amount"]) - df["date"] = to_datetime(df["date"]) - df = df.sort_values("date") - - expected = ( - df.set_index("date") - .groupby("name") - .apply(lambda x: x.rolling("180D")["amount"].sum()) - ) - result = df.groupby("name").rolling("180D", on="date")["amount"].sum() - tm.assert_series_equal(result, expected) - - def test_non_monotonic_raises(self): - # GH 13966 (similar to #15130, closed by #15175) - - # superseded by 43909 - - dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s") - df = DataFrame( - { - "A": [1] * 20 + [2] * 12 + [3] * 8, - "B": np.concatenate((dates, dates)), - "C": np.arange(40), - } - ) - - expected = ( - df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean()) - ) - with pytest.raises(ValueError, match=r".* must be monotonic"): - df.groupby("A").rolling( - "4s", on="B" - ).C.mean() # should raise for non-monotonic t series - - df2 = df.sort_values("B") - result = df2.groupby("A").rolling("4s", on="B").C.mean() - tm.assert_series_equal(result, expected) - def test_rolling_cov_offset(self): # GH16058 @@ -757,3 +698,12 @@ def test_rolling_on_multi_index_level(self): {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("msg, axis", [["column", 1], ["index", 0]]) +def test_nat_axis_error(msg, axis): + idx = [Timestamp("2020"), NaT] + kwargs = {"columns" if axis == 1 else "index": idx} + df = DataFrame(np.eye(2), **kwargs) + with pytest.raises(ValueError, match=f"{msg} values must not have NaT"): + df.rolling("D", axis=axis).mean()