REGR: groupby.transform producing segfault (#46585)

pandas-dev · Mar 31, 2022 · 382aefc
1 parent 2555468
commit 382aefc
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 13 deletions.
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1106,7 +1106,7 @@ def _set_result_index_ordered(
         # set the result index on the passed values object and
         # return the new object, xref 8046
 
-        if self.grouper.is_monotonic:
+        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
             # shortcut if we have an already ordered grouper
             result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
             return result

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -818,7 +818,10 @@ def result_ilocs(self) -> npt.NDArray[np.intp]:
         # Original indices are where group_index would go via sorting.
         # But when dropna is true, we need to remove null values while accounting for
         # any gaps that then occur because of them.
-        group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)
+        group_index = get_group_index(
+            self.codes, self.shape, sort=self._sort, xnull=True
+        )
+        group_index, _ = compress_group_index(group_index, sort=self._sort)
 
         if self.has_dropped_na:
             mask = np.where(group_index >= 0)

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -1303,23 +1303,34 @@ def test_transform_cumcount():
     tm.assert_series_equal(result, expected)
 
 
-def test_null_group_lambda_self(sort, dropna):
+@pytest.mark.parametrize("keys", [["A1"], ["A1", "A2"]])
+def test_null_group_lambda_self(request, sort, dropna, keys):
     # GH 17093
-    np.random.seed(0)
-    keys = np.random.randint(0, 5, size=50).astype(float)
-    nulls = np.random.choice([0, 1], keys.shape).astype(bool)
-    keys[nulls] = np.nan
-    values = np.random.randint(0, 5, size=keys.shape)
-    df = DataFrame({"A": keys, "B": values})
+    if not sort and not dropna:
+        msg = "GH#46584: null values get sorted when sort=False"
+        request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))
+
+    size = 50
+    nulls1 = np.random.choice([False, True], size)
+    nulls2 = np.random.choice([False, True], size)
+    # Per-row mask: True where the row's group key is null in any key column
+    nulls_grouper = nulls1 if len(keys) == 1 else nulls1 | nulls2
+
+    a1 = np.random.randint(0, 5, size=size).astype(float)
+    a1[nulls1] = np.nan
+    a2 = np.random.randint(0, 5, size=size).astype(float)
+    a2[nulls2] = np.nan
+    values = np.random.randint(0, 5, size=a1.shape)
+    df = DataFrame({"A1": a1, "A2": a2, "B": values})
 
     expected_values = values
-    if dropna and nulls.any():
+    if dropna and nulls_grouper.any():
         expected_values = expected_values.astype(float)
-        expected_values[nulls] = np.nan
+        expected_values[nulls_grouper] = np.nan
     expected = DataFrame(expected_values, columns=["B"])
 
-    gb = df.groupby("A", dropna=dropna, sort=sort)
-    result = gb.transform(lambda x: x)
+    gb = df.groupby(keys, dropna=dropna, sort=sort)
+    result = gb[["B"]].transform(lambda x: x)
     tm.assert_frame_equal(result, expected)