From 25f944340662d381283d8a700a306a0e10bb2de0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Fri, 19 Aug 2022 11:41:28 -0700
Subject: [PATCH 1/2] BUG/REGR: Fix subset for DataFrameGroupBy.value_counts

---
 doc/source/whatsnew/v1.4.4.rst               |  1 +
 pandas/core/groupby/generic.py               | 21 +++++++++++--------
 .../tests/groupby/test_frame_value_counts.py | 17 +++++++++++++++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst
index ee94c21a29cc9..2999741c460b7 100644
--- a/doc/source/whatsnew/v1.4.4.rst
+++ b/doc/source/whatsnew/v1.4.4.rst
@@ -32,6 +32,7 @@ Bug fixes
 ~~~~~~~~~
 - The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
 - Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
+- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`44267`)
 - Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` vs ``frame[y].loc[x]`` (:issue:`22372`)
 - Bug in the :meth:`Series.dt.strftime` accessor return a float instead of object dtype Series for all-NaT input, which also causes a spurious deprecation warning (:issue:`45858`)
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 8a261f09e7118..cb815601524eb 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1805,21 +1805,24 @@ def value_counts(
             name = self._selected_obj.name
             keys = [] if name in in_axis_names else [self._selected_obj]
         else:
+            if subset is not None:
+                subsetted = set(subset)
+                clashing = subsetted & set(in_axis_names)
+                if clashing:
+                    raise ValueError(
+                        f"Keys {clashing} in subset cannot be in "
+                        "the groupby column keys"
+                    )
+            else:
+                subsetted = set(self._selected_obj.columns)
+
             keys = [
                 # Can't use .values because the column label needs to be preserved
                 self._selected_obj.iloc[:, idx]
                 for idx, name in enumerate(self._selected_obj.columns)
-                if name not in in_axis_names
+                if name not in in_axis_names and name in subsetted
             ]
 
-        if subset is not None:
-            clashing = set(subset) & set(in_axis_names)
-            if clashing:
-                raise ValueError(
-                    f"Keys {clashing} in subset cannot be in "
-                    "the groupby column keys"
-                )
-
         groupings = list(self.grouper.groupings)
         for key in keys:
             grouper, _, _ = get_grouper(
diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py
index 1e679ad4e7aad..686f3e11384ef 100644
--- a/pandas/tests/groupby/test_frame_value_counts.py
+++ b/pandas/tests/groupby/test_frame_value_counts.py
@@ -738,3 +738,20 @@ def test_ambiguous_grouping():
     result = gb.value_counts()
     expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
     tm.assert_series_equal(result, expected)
+
+
+def test_subset_overlaps_gb_key_raises():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    with pytest.raises(ValueError, match="Keys {'c1'}"):
+        df.groupby("c1").value_counts(subset=["c1"])
+
+
+def test_subset():
+    # GH 46383
+    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
+    result = df.groupby(level=0).value_counts(subset=["c2"])
+    expected = Series(
+        [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"])
+    )
+    tm.assert_series_equal(result, expected)
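As a quick illustration of the behavior the new test_subset above exercises, here is a minimal sketch (separate from the patch itself, assuming a pandas build that includes this fix); the commented result is taken from the expected Series in that test:

    import pandas as pd

    df = pd.DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])

    # Before this fix, ``subset`` was silently ignored and every non-grouping
    # column was counted; with it, only the requested column is counted
    # alongside the group key.
    result = df.groupby(level=0).value_counts(subset=["c2"])
    # -> counts the (group key, c2) pairs only:
    # (0, "x") -> 1, (1, "y") -> 2
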
"c2"]) + ) + tm.assert_series_equal(result, expected) From 5879e4e6de591ca3e21419326e1b153b95fb4676 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 22 Aug 2022 15:00:56 -0700 Subject: [PATCH 2/2] Test subset not in columns; duplicate columns --- pandas/core/groupby/generic.py | 11 ++++++-- .../tests/groupby/test_frame_value_counts.py | 28 ++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a83fed6dd42c9..cd91e89554b67 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1805,16 +1805,23 @@ def value_counts( name = self._selected_obj.name keys = [] if name in in_axis_names else [self._selected_obj] else: + unique_cols = set(self._selected_obj.columns) if subset is not None: subsetted = set(subset) clashing = subsetted & set(in_axis_names) if clashing: raise ValueError( f"Keys {clashing} in subset cannot be in " - "the groupby column keys" + "the groupby column keys." + ) + doesnt_exist = subsetted - unique_cols + if doesnt_exist: + raise ValueError( + f"Keys {doesnt_exist} in subset do not " + f"exist in the DataFrame." ) else: - subsetted = set(self._selected_obj.columns) + subsetted = unique_cols keys = [ # Can't use .values because the column label needs to be preserved diff --git a/pandas/tests/groupby/test_frame_value_counts.py b/pandas/tests/groupby/test_frame_value_counts.py index 686f3e11384ef..8255fbab40dce 100644 --- a/pandas/tests/groupby/test_frame_value_counts.py +++ b/pandas/tests/groupby/test_frame_value_counts.py @@ -743,10 +743,19 @@ def test_ambiguous_grouping(): def test_subset_overlaps_gb_key_raises(): # GH 46383 df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) - with pytest.raises(ValueError, match="Keys {'c1'}"): + msg = "Keys {'c1'} in subset cannot be in the groupby column keys." + with pytest.raises(ValueError, match=msg): df.groupby("c1").value_counts(subset=["c1"]) +def test_subset_doesnt_exist_in_frame(): + # GH 46383 + df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) + msg = "Keys {'c3'} in subset do not exist in the DataFrame." + with pytest.raises(ValueError, match=msg): + df.groupby("c1").value_counts(subset=["c3"]) + + def test_subset(): # GH 46383 df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1]) @@ -755,3 +764,20 @@ def test_subset(): [1, 2], index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]) ) tm.assert_series_equal(result, expected) + + +def test_subset_duplicate_columns(): + # GH 46383 + df = DataFrame( + [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]], + index=[0, 1, 1], + columns=["c1", "c2", "c2"], + ) + result = df.groupby(level=0).value_counts(subset=["c2"]) + expected = Series( + [1, 2], + index=MultiIndex.from_arrays( + [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"] + ), + ) + tm.assert_series_equal(result, expected)