Avoid passing groupby key list of length one #9495

Merged
merged 2 commits into dask:main from groupby-key-len-deprecation on Sep 14, 2022

Conversation

@jrbourbeau (Member):

This helps avoid the following deprecation warning being emitted from pandas:

_____________________________________________________________________________________________________________ test_append_with_partition[pyarrow] _____________________________________________________________________________________________________________

tmpdir = local('/private/var/folders/k_/lx1rdvqn253gd1wrcx__5frm0000gn/T/pytest-of-james/pytest-43/test_append_with_partition_pya0'), engine = 'pyarrow'

    def test_append_with_partition(tmpdir, engine):
        tmp = str(tmpdir)
        df0 = pd.DataFrame(
            {
                "lat": np.arange(0, 10, dtype="int64"),
                "lon": np.arange(10, 20, dtype="int64"),
                "value": np.arange(100, 110, dtype="int64"),
            }
        )
        df0.index.name = "index"
        df1 = pd.DataFrame(
            {
                "lat": np.arange(10, 20, dtype="int64"),
                "lon": np.arange(10, 20, dtype="int64"),
                "value": np.arange(120, 130, dtype="int64"),
            }
        )
        df1.index.name = "index"

        # Check that nullable dtypes work
        # (see: https://github.com/dask/dask/issues/8373)
        df0["lat"] = df0["lat"].astype("Int64")
        df1["lat"].iloc[0] = np.nan
        df1["lat"] = df1["lat"].astype("Int64")

        dd_df0 = dd.from_pandas(df0, npartitions=1)
        dd_df1 = dd.from_pandas(df1, npartitions=1)
>       dd.to_parquet(dd_df0, tmp, partition_on=["lon"], engine=engine)

dask/dataframe/io/tests/test_parquet.py:710:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
dask/dataframe/io/parquet/core.py:963: in to_parquet
    out = out.compute(**compute_kwargs)
dask/base.py:315: in compute
    (result,) = compute(self, traverse=False, **kwargs)
dask/base.py:600: in compute
    results = schedule(dsk, keys, **kwargs)
dask/threaded.py:89: in get
    results = get_async(
dask/local.py:511: in get_async
    raise_exception(exc, tb)
dask/local.py:319: in reraise
    raise exc
dask/local.py:224: in execute_task
    result = _execute_task(task, data)
dask/core.py:119: in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
dask/optimization.py:990: in __call__
    return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
dask/core.py:149: in get
    result = _execute_task(task, cache)
dask/core.py:119: in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
dask/dataframe/io/parquet/core.py:163: in __call__
    return self.engine.write_partition(
dask/dataframe/io/parquet/arrow.py:694: in write_partition
    md_list = _write_partitioned(
dask/dataframe/io/parquet/arrow.py:115: in _write_partitioned
    for keys, subgroup in data_df.groupby(partition_keys):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pandas.core.groupby.generic.DataFrameGroupBy object at 0x166376b60>

    @final
    def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
        """
        Groupby iterator.

        Returns
        -------
        Generator yielding sequence of (name, subsetted object)
        for each group
        """
        keys = self.keys
        if isinstance(keys, list) and len(keys) == 1:
>           warnings.warn(
                (
                    "In a future version of pandas, a length 1 "
                    "tuple will be returned when iterating over a "
                    "groupby with a grouper equal to a list of "
                    "length 1. Don't supply a list with a single grouper "
                    "to avoid this warning."
                ),
                FutureWarning,
                stacklevel=find_stack_level(inspect.currentframe()),
            )
E           FutureWarning: In a future version of pandas, a length 1 tuple will be returned when iterating over a groupby with a grouper equal to a list of length 1. Don't supply a list with a single grouper to avoid this warning.

../../../mambaforge/envs/dask/lib/python3.10/site-packages/pandas/core/groupby/groupby.py:826: FutureWarning

This was already included in #9472, but I'm factoring it out as a separate PR here.

@ncclementi (Member) left a comment:

It looks good to me. Thank you @jrbourbeau

@jrbourbeau (Member, Author):

Thanks for reviewing @ncclementi!

@jrbourbeau jrbourbeau merged commit 6c15102 into dask:main Sep 14, 2022
@jrbourbeau jrbourbeau deleted the groupby-key-len-deprecation branch September 14, 2022 18:12