From 9a0bbdf4ea38d70f222f297cb3c89032953ce227 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 29 Nov 2022 14:49:03 -0800 Subject: [PATCH 1/4] fix _mul_cols for empty index --- dask/dataframe/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask/dataframe/groupby.py b/dask/dataframe/groupby.py index 7fb4fa637a6..242555762f3 100644 --- a/dask/dataframe/groupby.py +++ b/dask/dataframe/groupby.py @@ -531,7 +531,7 @@ def _mul_cols(df, cols): # Fix index in a groupby().apply() context # https://github.com/dask/dask/issues/8137 # https://github.com/pandas-dev/pandas/issues/43568 - _df.index = [0] * len(_df) + _df.index = np.repeat(np.array([0], dtype="int64"), len(_df)) return _df From 706603268a530cec2ea5eaa4d30c92cf94129645 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Tue, 29 Nov 2022 15:41:14 -0800 Subject: [PATCH 2/4] add note about int64 dtype --- dask/dataframe/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dask/dataframe/groupby.py b/dask/dataframe/groupby.py index 242555762f3..b00ff2abdb1 100644 --- a/dask/dataframe/groupby.py +++ b/dask/dataframe/groupby.py @@ -531,6 +531,8 @@ def _mul_cols(df, cols): # Fix index in a groupby().apply() context # https://github.com/dask/dask/issues/8137 # https://github.com/pandas-dev/pandas/issues/43568 + # Make sure index dtype is "int64" (even if _df is empty) + # https://github.com/dask/dask/pull/9701 _df.index = np.repeat(np.array([0], dtype="int64"), len(_df)) return _df From 17ba39ae318c0fc4d9cad468fd22608fedd716c7 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 30 Nov 2022 16:15:56 -0600 Subject: [PATCH 3/4] Update dask/dataframe/groupby.py Co-authored-by: James Bourbeau --- dask/dataframe/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask/dataframe/groupby.py b/dask/dataframe/groupby.py index b00ff2abdb1..a5212e3b0c3 100644 --- a/dask/dataframe/groupby.py +++ b/dask/dataframe/groupby.py @@ -533,7 +533,7 @@ def _mul_cols(df, cols): # https://github.com/pandas-dev/pandas/issues/43568 # Make sure index dtype is "int64" (even if _df is empty) # https://github.com/dask/dask/pull/9701 - _df.index = np.repeat(np.array([0], dtype="int64"), len(_df)) + _df.index = np.zeros(len(_df), dtype=int) return _df From 30c20221272e7f0d0a0b12680e447ef41e1f3a46 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 30 Nov 2022 17:07:00 -0600 Subject: [PATCH 4/4] Update _drop_duplicates_reindex --- dask/dataframe/groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask/dataframe/groupby.py b/dask/dataframe/groupby.py index a5212e3b0c3..3240cb1fc5f 100644 --- a/dask/dataframe/groupby.py +++ b/dask/dataframe/groupby.py @@ -531,7 +531,7 @@ def _mul_cols(df, cols): # Fix index in a groupby().apply() context # https://github.com/dask/dask/issues/8137 # https://github.com/pandas-dev/pandas/issues/43568 - # Make sure index dtype is "int64" (even if _df is empty) + # Make sure index dtype is int (even if _df is empty) # https://github.com/dask/dask/pull/9701 _df.index = np.zeros(len(_df), dtype=int) return _df @@ -642,8 +642,10 @@ def _drop_duplicates_reindex(df): # Fix index in a groupby().apply() context # https://github.com/dask/dask/issues/8137 # https://github.com/pandas-dev/pandas/issues/43568 + # Make sure index dtype is int (even if result is empty) + # https://github.com/dask/dask/pull/9701 result = df.drop_duplicates() - result.index = [0] * len(result) + result.index = np.zeros(len(result), dtype=int) return result