From 25f944340662d381283d8a700a306a0e10bb2de0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Fri, 19 Aug 2022 11:41:28 -0700
Subject: [PATCH 1/2] BUG/REGR: Fix subset for DataFrameGroupBy.value_counts

---
 doc/source/whatsnew/v1.4.4.rst               |  1 +
 pandas/core/groupby/generic.py               | 21 +++++++++++--------
 .../tests/groupby/test_frame_value_counts.py | 17 +++++++++++++++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst
index ee94c21a29cc9..2999741c460b7 100644
--- a/doc/source/whatsnew/v1.4.4.rst
+++ b/doc/source/whatsnew/v1.4.4.rst
@@ -32,6 +32,7 @@ Bug fixes
 ~~~~~~~~~
 - The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
 - Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
+- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`44267`)
 - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`)
 - Bug in the :meth:`Series.dt.strftime` accessor return a float instead of object dtype Series for all-NaT input, which also causes a spurious deprecation warning (:issue:`45858`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 8a261f09e7118..cb815601524eb 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1805,21 +1805,24 @@ def value_counts(
             name = self._selected_obj.name
             keys = [] if name in in_axis_names else [self._selected_obj]
         else:
+            if subset is not None:
+                subsetted = set(subset)
+                clashing = subsetted & set(in_axis_names)
+                if clashing:
+                    raise ValueError(
+                        f"Keys {clashing} in subset cannot be in "
+                        "the groupby column keys"
+                    )
+            else:
+                subsetted = set(self._selected_obj.columns)
+
             keys = [
                 # Can't use .values because the column label needs to be preserved
                 self._selected_obj.iloc[:, idx]
                 for idx, name in enumerate(self._selected_obj.columns)
-                if name not in in_axis_names
+                if name not in in_axis_names and name in subsetted
             ]
 
-        if subset is not None:
-            clashing = set(subset) & set(in_axis_names)
-            if clashing:
-                raise ValueError(
-                    f"Keys {clashing} in subset cannot be in "
-                    "the groupby column keys"
-                )
-
         groupings = list(self.grouper.groupings)
         for key in keys:
             grouper, _, _ = get_grouper(
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 1e679ad4e7aad..686f3e11384ef 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -738,3 +738,20 @@ def test_ambiguous_grouping():
     result = gb.value_counts()
     expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
     tm.assert_series_equal(result, expected)
+
+
+def test_subset_overlaps_gb_key_raises():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    with pytest.raises(ValueError, match="Keys {'c1'}"):
+        df.groupby("c1").value_counts(subset=["c1"])
+
+
+def test_subset():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    result = df.groupby(level=0).value_counts(subset=["c2"])
+    expected = Series(
+        [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"])
+    )
+    tm.assert_series_equal(result, expected)
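As a quick illustration of the behavior the new test_subset above exercises, here is a minimal sketch (separate from the patch itself, assuming a pandas build that includes this fix); the commented result is taken from the expected Series in that test:

    import pandas as pd

    df = pd.DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])

    # Before this fix, ``subset`` was silently ignored and every non-grouping
    # column was counted; with it, only the requested column is counted
    # alongside the group key.
    result = df.groupby(level=0).value_counts(subset=["c2"])
    # -> counts the (group key, c2) pairs only:
    # (0, "x") -> 1, (1, "y") -> 2
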
"c2"]) + ) + tm.assert_series_equal(result, expected) From 5879e4e6de591ca3e21419326e1b153b95fb4676 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Aug 2022 15:00:56 -0700 Subject: [PATCH 2/2] Test subset not in columns; duplicate columns --- pandas/core/groupby/generic.py | 11 ++++++-- .../tests/groupby/test_frame_value_counts.py | 28 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a83fed6dd42c9..cd91e89554b67 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1805,16 +1805,23 @@ def value_counts( name = self._selected_obj.name keys = [] if name in in_axis_names else [self._selected_obj] else: + unique_cols = set(self._selected_obj.columns) if subset is not None: subsetted = set(subset) clashing = subsetted & set(in_axis_names) if clashing: raise ValueError( f"Keys {clashing} in subset cannot be in " - "the groupby column keys" + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." ) else: - subsetted = set(self._selected_obj.columns) + subsetted = unique_cols keys = [ # Can't use .values because the column label needs to be preserved diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 686f3e11384ef..8255fbab40dce 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -743,10 +743,19 @@ def test_ambiguous_grouping(): def test_subset_overlaps_gb_key_raises(): # GH 46383 df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) - with pytest.raises(ValueError, match="Keys {'c1'}"): + msg = "Keys {'c1'} in subset cannot be in the groupby column keys." + with pytest.raises(ValueError, match=msg): df.groupby("c1").value_counts(subset=["c1"]) +def test_subset_doesnt_exist_in_frame(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c3'} in subset do not exist in the DataFrame." + with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c3"]) + + def test_subset(): # GH 46383 df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) @@ -755,3 +764,20 @@ def test_subset(): [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]) ) tm.assert_series_equal(result, expected) + + +def test_subset_duplicate_columns(): + # GH 46383 + df = DataFrame( + [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]], + index=[0, 1, 1], + columns=["c1", "c2", "c2"], + ) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays( + [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] + ), + ) + tm.assert_series_equal(result, expected)