Skip to content

Commit

Permalink
Backport PR pandas-dev#38057: PERF: fix regression in creation of resulting index in RollingGroupby (pandas-dev#38211)
Browse files Browse the repository at this point in the history

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
simonjayhawkins and jorisvandenbossche committed Dec 1, 2020
1 parent b376fb9 commit 993557b
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 20 deletions.
14 changes: 14 additions & 0 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,4 +216,18 @@ def time_rolling_offset(self, method):
getattr(self.groupby_roll_offset, method)()


class GroupbyLargeGroups:
    """Benchmark ``groupby(...).rolling(...)`` on a frame with few, large groups."""

    # https://github.com/pandas-dev/pandas/issues/38038
    # specific example where the rolling operation on a larger dataframe
    # is relatively cheap (few but large groups), but creation of
    # MultiIndex of result can be expensive

    def setup(self):
        """Build a 100,000-row frame whose key column "A" alternates 1, 2."""
        N = 100000
        # Floor division keeps the repeat count an int without a float round-trip.
        self.df = pd.DataFrame({"A": [1, 2] * (N // 2), "B": np.random.randn(N)})

    def time_rolling_multiindex_creation(self):
        """Time a window-3 rolling mean per group, including result index creation."""
        self.df.groupby("A").rolling(3).mean()


from .pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
- Fixed regression in :meth:`DataFrame.fillna` not filling ``NaN`` after other operations such as :meth:`DataFrame.pivot` (:issue:`36495`).
- Fixed performance regression in ``df.groupby(..).rolling(..)`` when creating the resulting :class:`MultiIndex` (:issue:`38038`)
- Fixed regression in :meth:`MultiIndex.intersection` returning duplicates when at least one of the indexes had duplicates (:issue:`36915`)

.. ---------------------------------------------------------------------------
Expand Down
37 changes: 22 additions & 15 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2216,22 +2216,29 @@ def _apply(
# Our result will have still kept the column in the result
result = result.drop(columns=column_keys, errors="ignore")

result_index_data = []
for key, values in self._groupby.grouper.indices.items():
for value in values:
data = [
*com.maybe_make_list(key),
*com.maybe_make_list(
grouped_object_index[value]
if grouped_object_index is not None
else []
),
]
result_index_data.append(tuple(data))

result_index = MultiIndex.from_tuples(
result_index_data, names=result_index_names
codes = self._groupby.grouper.codes
levels = self._groupby.grouper.levels

group_indices = self._groupby.grouper.indices.values()
if group_indices:
indexer = np.concatenate(list(group_indices))
else:
indexer = np.array([], dtype=np.intp)
codes = [c.take(indexer) for c in codes]

# if the index of the original dataframe needs to be preserved, append
# this index (but reordered) to the codes/levels from the groupby
if grouped_object_index is not None:
idx = grouped_object_index.take(indexer)
if not isinstance(idx, MultiIndex):
idx = MultiIndex.from_arrays([idx])
codes.extend(list(idx.codes))
levels.extend(list(idx.levels))

result_index = MultiIndex(
levels, codes, names=result_index_names, verify_integrity=False
)

result.index = result_index
return result

Expand Down
75 changes: 70 additions & 5 deletions pandas/tests/window/test_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

import pandas as pd
from pandas import DataFrame, MultiIndex, Series
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
from pandas.core.groupby.groupby import get_groupby

Expand Down Expand Up @@ -396,14 +396,25 @@ def test_groupby_rolling_index_changed(self, func):

def test_groupby_rolling_empty_frame(self):
# GH 36197
expected = pd.DataFrame({"s1": []})
expected = DataFrame({"s1": []})
result = expected.groupby("s1").rolling(window=1).sum()
expected.index = pd.MultiIndex.from_tuples([], names=["s1", None])
# GH-38057 from_tuples gives empty object dtype, we now get float/int levels
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
expected.index = MultiIndex.from_product(
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
)
tm.assert_frame_equal(result, expected)

expected = pd.DataFrame({"s1": [], "s2": []})
expected = DataFrame({"s1": [], "s2": []})
result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None])
expected.index = MultiIndex.from_product(
[
Index([], dtype="float64"),
Index([], dtype="float64"),
Index([], dtype="int64"),
],
names=["s1", "s2", None],
)
tm.assert_frame_equal(result, expected)

def test_groupby_rolling_string_index(self):
Expand Down Expand Up @@ -479,3 +490,57 @@ def test_groupby_rolling_index_level_and_column_label(self):
),
)
tm.assert_frame_equal(result, expected)

def test_groupby_rolling_resulting_multiindex(self):
# a few different cases checking the created MultiIndex of the result
# https://github.com/pandas-dev/pandas/pull/38057

# grouping by 1 columns -> 2-level MI as result
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
result = df.groupby("b").rolling(3).mean()
expected_index = MultiIndex.from_tuples(
[(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
names=["b", None],
)
tm.assert_index_equal(result.index, expected_index)

# grouping by 2 columns -> 3-level MI as result
df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
result = df.groupby(["b", "c"]).rolling(2).sum()
expected_index = MultiIndex.from_tuples(
[
(1, 1, 0),
(1, 1, 4),
(1, 1, 8),
(1, 3, 2),
(1, 3, 6),
(1, 3, 10),
(2, 2, 1),
(2, 2, 5),
(2, 2, 9),
(2, 4, 3),
(2, 4, 7),
(2, 4, 11),
],
names=["b", "c", None],
)
tm.assert_index_equal(result.index, expected_index)

# grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
df = df.set_index("c", append=True)
result = df.groupby("b").rolling(3).mean()
expected_index = MultiIndex.from_tuples(
[
(1, 0, 1),
(1, 2, 3),
(1, 4, 1),
(1, 6, 3),
(2, 1, 2),
(2, 3, 4),
(2, 5, 2),
(2, 7, 4),
],
names=["b", None, "c"],
)
tm.assert_index_equal(result.index, expected_index)

0 comments on commit 993557b

Please sign in to comment.