Skip to content

Commit

Permalink
Backport PR #46567 on branch 1.4.x (BUG: groupby().rolling(freq) with…
Browse files Browse the repository at this point in the history
… monotonic dates within groups #46065 ) (#46592)

* ENH: Improve error message when rolling over NaT value (#46087)

* BUG: groupby().rolling(freq) with monotonic dates within groups #46065  (#46567)

(cherry picked from commit d2aa44f)

Co-authored-by: Matthew Roeschke <emailformattr@gmail.com>
  • Loading branch information
simonjayhawkins and mroeschke committed Apr 1, 2022
1 parent 3ff438e commit 9c6d84c
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 96 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.2.rst
Expand Up @@ -32,6 +32,8 @@ Bug fixes
- Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`)
- Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`)
- Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`)
- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
- Fixed :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`)

.. ---------------------------------------------------------------------------
Expand Down
52 changes: 30 additions & 22 deletions pandas/core/window/rolling.py
Expand Up @@ -837,12 +837,6 @@ def _gotitem(self, key, ndim, subset=None):
subset = self.obj.set_index(self._on)
return super()._gotitem(key, ndim, subset=subset)

def _validate_monotonic(self):
"""
Validate that "on" is monotonic; already validated at a higher level.
"""
pass


class Window(BaseWindow):
"""
Expand Down Expand Up @@ -1687,7 +1681,7 @@ def _validate(self):
or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
) and isinstance(self.window, (str, BaseOffset, timedelta)):

self._validate_monotonic()
self._validate_datetimelike_monotonic()

# this will raise ValueError on non-fixed freqs
try:
Expand All @@ -1712,18 +1706,24 @@ def _validate(self):
elif not is_integer(self.window) or self.window < 0:
raise ValueError("window must be an integer 0 or greater")

def _validate_monotonic(self):
def _validate_datetimelike_monotonic(self):
"""
Validate monotonic (increasing or decreasing).
Validate self._on is monotonic (increasing or decreasing) and has
no NaT values for frequency windows.
"""
if self._on.hasnans:
self._raise_monotonic_error("values must not have NaT")
if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
self._raise_monotonic_error()
self._raise_monotonic_error("values must be monotonic")

def _raise_monotonic_error(self):
formatted = self.on
if self.on is None:
formatted = "index"
raise ValueError(f"{formatted} must be monotonic")
def _raise_monotonic_error(self, msg: str):
on = self.on
if on is None:
if self.axis == 0:
on = "index"
else:
on = "column"
raise ValueError(f"{on} {msg}")

@doc(
_shared_docs["aggregate"],
Expand Down Expand Up @@ -2631,12 +2631,20 @@ def _get_window_indexer(self) -> GroupbyIndexer:
)
return window_indexer

def _validate_monotonic(self):
def _validate_datetimelike_monotonic(self):
"""
Validate that on is monotonic;
Validate that each group in self._on is monotonic
"""
if (
not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing)
or self._on.hasnans
):
self._raise_monotonic_error()
# GH 46061
if self._on.hasnans:
self._raise_monotonic_error("values must not have NaT")
for group_indices in self._grouper.indices.values():
group_on = self._on.take(group_indices)
if not (
group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
):
on = "index" if self.on is None else self.on
raise ValueError(
f"Each group within {on} must be monotonic. "
f"Sort the values in {on} first."
)
79 changes: 78 additions & 1 deletion pandas/tests/window/test_groupby.py
Expand Up @@ -651,7 +651,7 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
)
if key == "index":
df = df.set_index("a")
with pytest.raises(ValueError, match=f"{key} must be monotonic"):
with pytest.raises(ValueError, match=f"{key} values must not have NaT"):
df.groupby("c").rolling("60min", **rollings)

@pytest.mark.parametrize("group_keys", [True, False])
Expand Down Expand Up @@ -895,6 +895,83 @@ def test_nan_and_zero_endpoints(self):
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_non_monotonic(self):
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_datelike_on_monotonic_within_each_group(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909
# GH 46061: OK if the on is monotonic relative to each each group

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
result = df.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_datelike_on_not_monotonic_within_each_group(self):
# GH 46061
df = DataFrame(
{
"A": [1] * 3 + [2] * 3,
"B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
"C": range(6),
}
)
with pytest.raises(ValueError, match="Each group within B must be monotonic."):
df.groupby("A").rolling("365D", on="B")


class TestExpanding:
def setup_method(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/window/test_rolling.py
Expand Up @@ -1419,18 +1419,6 @@ def test_groupby_rolling_nan_included():
tm.assert_frame_equal(result, expected)


def test_groupby_rolling_non_monotonic():
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
# GH#6929
Expand Down
72 changes: 11 additions & 61 deletions pandas/tests/window/test_timeseries_window.py
Expand Up @@ -5,10 +5,10 @@
DataFrame,
Index,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm

Expand Down Expand Up @@ -134,7 +134,7 @@ def test_non_monotonic_on(self):

assert not df.index.is_monotonic

msg = "index must be monotonic"
msg = "index values must be monotonic"
with pytest.raises(ValueError, match=msg):
df.rolling("2s").sum()

Expand Down Expand Up @@ -643,65 +643,6 @@ def agg_by_day(x):

tm.assert_frame_equal(result, expected)

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_non_monotonic_raises(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("A").rolling(
"4s", on="B"
).C.mean() # should raise for non-monotonic t series

df2 = df.sort_values("B")
result = df2.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_rolling_cov_offset(self):
# GH16058

Expand Down Expand Up @@ -757,3 +698,12 @@ def test_rolling_on_multi_index_level(self):
{"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("msg, axis", [["column", 1], ["index", 0]])
def test_nat_axis_error(msg, axis):
idx = [Timestamp("2020"), NaT]
kwargs = {"columns" if axis == 1 else "index": idx}
df = DataFrame(np.eye(2), **kwargs)
with pytest.raises(ValueError, match=f"{msg} values must not have NaT"):
df.rolling("D", axis=axis).mean()

0 comments on commit 9c6d84c

Please sign in to comment.