Avoid pandas constructors in dask.dataframe.core #9570

Merged: 14 commits, Nov 10, 2022
45 changes: 26 additions & 19 deletions dask/dataframe/core.py
@@ -62,6 +62,8 @@
     is_index_like,
     is_series_like,
     make_meta,
+    meta_frame_constructor,
+    meta_series_constructor,
     raise_on_meta_error,
     valid_divisions,
 )
@@ -2482,8 +2484,8 @@ def _convert_time_cols_to_numeric(self, time_cols, axis, meta, skipna):
             # since each standard deviation will just be NaN
             needs_time_conversion = False
             numeric_dd = from_pandas(
-                pd.DataFrame(
-                    {"_": pd.Series([np.nan])},
+                meta_frame_constructor(self)(
+                    {"_": meta_series_constructor(self)([np.nan])},
                     index=self.index,
                 ),
                 npartitions=self.npartitions,
@@ -2813,7 +2815,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, method="default"):
             graph = HighLevelGraph.from_collections(
                 keyname, layer, dependencies=quantiles
             )
-            return DataFrame(graph, keyname, meta, quantiles[0].divisions)
+            return new_dd_object(graph, keyname, meta, quantiles[0].divisions)

     @derived_from(pd.DataFrame)
     def describe(
@@ -3062,7 +3064,7 @@ def _cum_agg(
                 _take_last,
                 cumpart,
                 skipna,
-                meta=pd.Series([], dtype="float"),
+                meta=meta_series_constructor(self)([], dtype="float"),
                 token=name2,
             )

@@ -3260,7 +3262,7 @@ def dot(self, other, meta=no_default):
         def _dot_series(*args, **kwargs):
             # .sum() is invoked on each partition before being applied to all
             # partitions. The return type is expected to be a series, not a numpy object
-            return pd.Series(M.dot(*args, **kwargs))
+            return meta_series_constructor(self)(M.dot(*args, **kwargs))

         return self.map_partitions(_dot_series, other, token="dot", meta=meta).sum(
             skipna=False
@@ -3532,7 +3534,7 @@ def __array_wrap__(self, array, context=None):
                 f"{method_name} is not implemented for `dask.dataframe.Series`."
             )

-        return pd.Series(array, index=index, name=self.name)
+        return meta_series_constructor(self)(array, index=index, name=self.name)

     @property
     def axes(self):
@@ -4513,7 +4515,7 @@ def __array_wrap__(self, array, context=None):
                 f"{method_name} is not implemented for `dask.dataframe.DataFrame`."
             )

-        return pd.DataFrame(array, index=index, columns=self.columns)
+        return meta_frame_constructor(self)(array, index=index, columns=self.columns)

     @property
     def axes(self):
@@ -6323,7 +6325,7 @@ def split_evenly(df, k):
 def split_out_on_index(df):
     h = df.index
     if isinstance(h, pd.MultiIndex):
-        h = pd.DataFrame([], index=h).reset_index()
+        h = meta_frame_constructor(df)([], index=h).reset_index()
     return h


@@ -7048,14 +7050,17 @@ def cov_corr(df, min_periods=None, corr=False, scalar=False, split_every=False):
         min_periods,
         corr,
         scalar,
+        df._meta,
     )
     graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
     if scalar:
         return Scalar(graph, name, "f8")
     meta = make_meta(
-        [(c, "f8") for c in df.columns], index=df.columns, parent_meta=df._meta
+        [(c, "f8") for c in df.columns],
+        index=meta_series_constructor(df)(df.columns),
+        parent_meta=df._meta,
     )
-    return DataFrame(graph, name, meta, (df.columns.min(), df.columns.max()))
+    return new_dd_object(graph, name, meta, (df.columns.min(), df.columns.max()))


 def cov_corr_chunk(df, corr=False):

@@ -7128,7 +7133,7 @@ def cov_corr_combine(data_in, corr=False):
     return out


-def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
+def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False, like_df=None):
Member: What is data here? Can we get the DataFrame type from that?

Member (Author): data is list[dict[str, np/cupy.ndarray]] here. Therefore, we would need to extend serial_frame_constructor to handle array-like data if we want to avoid the like_df argument.

Member (Author): 1ba0353 adds a serial_constructor_from_array dispatch (but doesn't actually use it yet) to illustrate what it would probably look like.
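To make that concrete, here is a rough sketch of what such a dispatch could look like, built on dask's existing `dask.utils.Dispatch` utility. The registrations below are illustrative assumptions, not the actual contents of 1ba0353:

```python
import numpy as np

from dask.utils import Dispatch

# Illustrative sketch: map an array type to a serial DataFrame constructor,
# so cov_corr_agg could infer its output frame type from the ndarrays in
# `data` instead of carrying a like_df argument.
serial_constructor_from_array = Dispatch("serial_constructor_from_array")


@serial_constructor_from_array.register(np.ndarray)
def _serial_constructor_numpy(arr):
    import pandas as pd

    return pd.DataFrame


@serial_constructor_from_array.register_lazy("cupy")
def _register_cupy():
    # Runs only the first time a cupy.ndarray is dispatched on.
    import cupy

    @serial_constructor_from_array.register(cupy.ndarray)
    def _serial_constructor_cupy(arr):
        import cudf

        return cudf.DataFrame
```

With something like this in place, the final line of cov_corr_agg could become `serial_constructor_from_array(mat)(mat, columns=cols, index=cols)` and the like_df argument could be dropped.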

     out = cov_corr_combine(data, corr)
     counts = out["count"]
     C = out["cov"]
@@ -7142,7 +7147,9 @@ def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
     mat = C / den
     if scalar:
         return float(mat[0, 1])
-    return pd.DataFrame(mat, columns=cols, index=cols)
+    return (pd.DataFrame if like_df is None else meta_frame_constructor(like_df))(
+        mat, columns=cols, index=cols
+    )


 def pd_split(df, p, random_state=None, shuffle=False):

@@ -7657,10 +7664,10 @@ def idxmaxmin_chunk(x, fn=None, skipna=True):
         idx = getattr(x, fn)(skipna=skipna)
         value = getattr(x, minmax)(skipna=skipna)
     else:
-        idx = value = pd.Series([], dtype="i8")
+        idx = value = meta_series_constructor(x)([], dtype="i8")
     if is_series_like(idx):
-        return pd.DataFrame({"idx": idx, "value": value})
-    return pd.DataFrame({"idx": [idx], "value": [value]})
+        return meta_frame_constructor(x)({"idx": idx, "value": value})
+    return meta_frame_constructor(x)({"idx": [idx], "value": [value]})


 def idxmaxmin_row(x, fn=None, skipna=True):

@@ -7670,8 +7677,8 @@ def idxmaxmin_row(x, fn=None, skipna=True):
         idx = [getattr(x.value, fn)(skipna=skipna)]
         value = [getattr(x.value, minmax)(skipna=skipna)]
     else:
-        idx = value = pd.Series([], dtype="i8")
-    return pd.DataFrame({"idx": idx, "value": value})
+        idx = value = meta_series_constructor(x)([], dtype="i8")
+    return meta_frame_constructor(x)({"idx": idx, "value": value})


 def idxmaxmin_combine(x, fn=None, skipna=True):

@@ -7765,7 +7772,7 @@ def to_datetime(arg, meta=None, **kwargs):
             "non-index-able arguments (like scalars)"
         )
     else:
-        meta = pd.Series([pd.Timestamp("2000", **tz_kwarg)])
+        meta = meta_series_constructor(arg)([pd.Timestamp("2000", **tz_kwarg)])
         meta.index = meta.index.astype(arg.index.dtype)
         meta.index.name = arg.index.name
     return map_partitions(pd.to_datetime, arg, meta=meta, **kwargs)
@@ -7775,7 +7782,7 @@
 def to_timedelta(arg, unit=None, errors="raise"):
     if not PANDAS_GT_110 and unit is None:
         unit = "ns"
-    meta = pd.Series([pd.Timedelta(1, unit=unit)])
+    meta = meta_series_constructor(arg)([pd.Timedelta(1, unit=unit)])
Member: (Not a blocking comment, just meant for other reviewers) Noting that this will fail for some inputs (e.g. numpy.ndarrays) that are supported by pandas.to_timedelta. However, it looks like things already fail with dd.to_timedelta for such inputs on main, and we arguably get a better error message with the changes in this PR.

On main:

In [1]: import numpy as np

In [2]: import dask.dataframe as dd

In [3]: dd.to_timedelta(np.arange(5), unit='s')
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 dd.to_timedelta(np.arange(5), unit='s')

File ~/projects/dask/dask/dask/dataframe/core.py:7779, in to_timedelta(arg, unit, errors)
   7777     unit = "ns"
   7778 meta = pd.Series([pd.Timedelta(1, unit=unit)])
-> 7779 return map_partitions(pd.to_timedelta, arg, unit=unit, errors=errors, meta=meta)

File ~/projects/dask/dask/dask/dataframe/core.py:6668, in map_partitions(func, meta, enforce_metadata, transform_divisions, align_dataframes, *args, **kwargs)
   6665     if collections:
   6666         simple = False
-> 6668 divisions = _get_divisions_map_partitions(
   6669     align_dataframes, transform_divisions, dfs, func, args, kwargs
   6670 )
   6672 if has_keyword(func, "partition_info"):
   6673     partition_info = {
   6674         (i,): {"number": i, "division": division}
   6675         for i, division in enumerate(divisions[:-1])
   6676     }

File ~/projects/dask/dask/dask/dataframe/core.py:6711, in _get_divisions_map_partitions(align_dataframes, transform_divisions, dfs, func, args, kwargs)
   6707 """
   6708 Helper to get divisions for map_partitions and map_overlap output.
   6709 """
   6710 if align_dataframes:
-> 6711     divisions = dfs[0].divisions
   6712 else:
   6713     # Unaligned, dfs is a mix of 1 partition and 1+ partition dataframes,
   6714     # use longest divisions found
   6715     divisions = max((d.divisions for d in dfs), key=len)

IndexError: list index out of range

With this PR:

In [4]: dd.to_timedelta(np.arange(5), unit='s')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 dd.to_timedelta(np.arange(5), unit='s')

File ~/projects/dask/dask/dask/dataframe/core.py:7785, in to_timedelta(arg, unit, errors)
   7783 if not PANDAS_GT_110 and unit is None:
   7784     unit = "ns"
-> 7785 meta = meta_series_constructor(arg)([pd.Timedelta(1, unit=unit)])
   7786 return map_partitions(pd.to_timedelta, arg, unit=unit, errors=errors, meta=meta)

File ~/projects/dask/dask/dask/dataframe/utils.py:782, in meta_series_constructor(like)
    780     return like.to_frame()._constructor_sliced
    781 else:
--> 782     raise TypeError(f"{type(like)} not supported by meta_series_constructor")

TypeError: <class 'numpy.ndarray'> not supported by meta_series_constructor

     return map_partitions(pd.to_timedelta, arg, unit=unit, errors=errors, meta=meta)


34 changes: 34 additions & 0 deletions dask/dataframe/tests/test_dataframe.py
@@ -3201,6 +3201,23 @@ def test_cov():
     assert res._name != res3._name


+@pytest.mark.gpu
+def test_cov_gpu():
Comment on lines +3204 to +3205:

Member: If the backend PR goes in before this PR, could we just reuse test_cov with the backend engine parametrized?

Member (Author): Perhaps. We would need to change the _compat dataframe-creation machinery to use dispatchable functions to do this.

(See the sketch after this hunk for what such a parametrization might look like.)
+    cudf = pytest.importorskip("cudf")
+
+    # cudf DataFrame
+    df = cudf.from_pandas(_compat.makeDataFrame())
+    ddf = dd.from_pandas(df, npartitions=6)
+
+    res = ddf.cov()
+    res2 = ddf.cov(split_every=2)
+    sol = df.cov()
+    assert_eq(res, sol)
+    assert_eq(res2, sol)
+    assert res._name == ddf.cov()._name
+    assert res._name != res2._name
+
+
 def test_corr():
     # DataFrame
     df = _compat.makeMissingDataframe()
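Following up on the review thread above, a hypothetical sketch of reusing a single cov test across backends. It assumes the backend PR's "dataframe.backend" config option and a backend-aware dd.from_dict, neither of which is part of this PR:

```python
import pytest

import dask
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq


@pytest.mark.parametrize(
    "backend",
    ["pandas", pytest.param("cudf", marks=pytest.mark.gpu)],
)
def test_cov_backend(backend):
    # Skip when the backend library is not installed.
    pytest.importorskip(backend)
    with dask.config.set({"dataframe.backend": backend}):
        # Assumption: creation functions route through the configured backend.
        ddf = dd.from_dict({"x": range(10), "y": range(10)}, npartitions=2)
        assert_eq(ddf.cov(), ddf.compute().cov())
```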
@@ -3248,6 +3265,23 @@ def test_corr():
     pytest.raises(TypeError, lambda: da.corr(ddf))


+@pytest.mark.gpu
+def test_corr_gpu():
Comment on lines +3268 to +3269:

Member: Similar comment here.

+    cudf = pytest.importorskip("cudf")
+
+    # cudf DataFrame
+    df = cudf.from_pandas(_compat.makeDataFrame())
+    ddf = dd.from_pandas(df, npartitions=6)
+
+    res = ddf.corr()
+    res2 = ddf.corr(split_every=2)
+    sol = df.corr()
+    assert_eq(res, sol)
+    assert_eq(res2, sol)
+    assert res._name == ddf.corr()._name
+    assert res._name != res2._name
+
+
 def test_corr_same_name():
     # Series with same names (see https://github.com/dask/dask/issues/4906)

32 changes: 32 additions & 0 deletions dask/dataframe/tests/test_utils_dataframe.py
@@ -20,7 +20,9 @@
     is_index_like,
     is_series_like,
     make_meta,
+    meta_frame_constructor,
     meta_nonempty,
+    meta_series_constructor,
     raise_on_meta_error,
     shard_df_on_index,
 )
@@ -618,3 +620,33 @@ def check_custom_scheduler(part: pd.DataFrame) -> pd.DataFrame:
     assert_eq(ddf2, ddf2, scheduler=custom_scheduler)
     with dask.config.set(scheduler=custom_scheduler):
         assert_eq(ddf2, ddf2, scheduler=None)


+@pytest.mark.parametrize(
+    "data",
+    [
+        pd.DataFrame([0]),
+        pd.Series([0]),
+        pd.Index([0]),
+        dd.from_dict({"x": [0]}, npartitions=1),
+        dd.from_dict({"x": [0]}, npartitions=1).x,
+        dd.from_dict({"x": [0]}, npartitions=1).index,
+    ],
+)
+def test_meta_constructor_utilities(data):
+    assert meta_series_constructor(data) is pd.Series
+    assert meta_frame_constructor(data) is pd.DataFrame
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        dd.from_dict({"x": [0]}, npartitions=1).x.values,
+        np.array([0]),
+    ],
+)
+def test_meta_constructor_utilities_raise(data):
+    with pytest.raises(TypeError, match="not supported by meta_series"):
+        meta_series_constructor(data)
+    with pytest.raises(TypeError, match="not supported by meta_frame"):
+        meta_frame_constructor(data)
46 changes: 46 additions & 0 deletions dask/dataframe/utils.py
@@ -734,3 +734,49 @@

 class AttributeNotImplementedError(NotImplementedError, AttributeError):
     """NotImplementedError and AttributeError"""


+def meta_frame_constructor(like):
+    """Return a serial DataFrame constructor
+
+    Parameters
+    ----------
+    like :
+        Any series-like, Index-like or dataframe-like object.
+    """
+    if is_dask_collection(like):
+        try:
+            like = like._meta
+        except AttributeError:
+            raise TypeError(f"{type(like)} not supported by meta_frame_constructor")
+    if is_dataframe_like(like):
+        return like._constructor
+    elif is_series_like(like):
+        return like._constructor_expanddim
+    elif is_index_like(like):
+        return like.to_frame()._constructor
+    else:
+        raise TypeError(f"{type(like)} not supported by meta_frame_constructor")
+
+
+def meta_series_constructor(like):
+    """Return a serial Series constructor
+
+    Parameters
+    ----------
+    like :
+        Any series-like, Index-like or dataframe-like object.
+    """
+    if is_dask_collection(like):
+        try:
+            like = like._meta
+        except AttributeError:
+            raise TypeError(f"{type(like)} not supported by meta_series_constructor")
+    if is_dataframe_like(like):
+        return like._constructor_sliced
+    elif is_series_like(like):
+        return like._constructor
+    elif is_index_like(like):
+        return like.to_frame()._constructor_sliced
+    else:
+        raise TypeError(f"{type(like)} not supported by meta_series_constructor")
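For reference, a minimal usage sketch of the two utilities above (pandas-backed collections; a dask_cudf collection would resolve to cudf.DataFrame / cudf.Series instead):

```python
import pandas as pd

import dask.dataframe as dd
from dask.dataframe.utils import meta_frame_constructor, meta_series_constructor

ddf = dd.from_dict({"x": [0, 1]}, npartitions=1)

# DataFrame-, Series-, and Index-like inputs all resolve through the
# collection's _meta to the backend's serial constructors.
assert meta_frame_constructor(ddf) is pd.DataFrame
assert meta_series_constructor(ddf) is pd.Series
assert meta_series_constructor(ddf.x) is pd.Series
assert meta_frame_constructor(ddf.index) is pd.DataFrame
```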