Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: groupby().rolling(freq) with monotonic dates within groups #46065 #46567

Merged
merged 6 commits into from Mar 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.2.rst
Expand Up @@ -32,6 +32,7 @@ Bug fixes
- Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`)
- Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`)
- Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`)
- Fixed :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`)

.. ---------------------------------------------------------------------------

Expand Down
18 changes: 18 additions & 0 deletions pandas/core/window/rolling.py
Expand Up @@ -2680,3 +2680,21 @@ def _get_window_indexer(self) -> GroupbyIndexer:
indexer_kwargs=indexer_kwargs,
)
return window_indexer

def _validate_datetimelike_monotonic(self):
"""
Validate that each group in self._on is monotonic
"""
# GH 46061
if self._on.hasnans:
self._raise_monotonic_error("values must not have NaT")
for group_indices in self._grouper.indices.values():
group_on = self._on.take(group_indices)
if not (
group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
):
on = "index" if self.on is None else self.on
raise ValueError(
f"Each group within {on} must be monotonic. "
f"Sort the values in {on} first."
)
77 changes: 77 additions & 0 deletions pandas/tests/window/test_groupby.py
Expand Up @@ -927,6 +927,83 @@ def test_nan_and_zero_endpoints(self):
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_non_monotonic(self):
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_datelike_on_monotonic_within_each_group(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909
# GH 46061: OK if the on is monotonic relative to each each group

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
result = df.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_datelike_on_not_monotonic_within_each_group(self):
# GH 46061
df = DataFrame(
{
"A": [1] * 3 + [2] * 3,
"B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
"C": range(6),
}
)
with pytest.raises(ValueError, match="Each group within B must be monotonic."):
df.groupby("A").rolling("365D", on="B")


class TestExpanding:
def setup_method(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/window/test_rolling.py
Expand Up @@ -1456,18 +1456,6 @@ def test_groupby_rolling_nan_included():
tm.assert_frame_equal(result, expected)


def test_groupby_rolling_non_monotonic():
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
# GH#6929
Expand Down
60 changes: 0 additions & 60 deletions pandas/tests/window/test_timeseries_window.py
Expand Up @@ -9,7 +9,6 @@
Series,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm

Expand Down Expand Up @@ -649,65 +648,6 @@ def agg_by_day(x):

tm.assert_frame_equal(result, expected)

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_non_monotonic_raises(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("A").rolling(
"4s", on="B"
).C.mean() # should raise for non-monotonic t series

df2 = df.sort_values("B")
result = df2.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_rolling_cov_offset(self):
# GH16058

Expand Down