Avoid pandas constructors in dask.dataframe.core #9570

Merged
merged 14 commits on Nov 10, 2022
Changes from 2 commits
57 changes: 35 additions & 22 deletions dask/dataframe/core.py
@@ -63,6 +63,8 @@
is_series_like,
make_meta,
raise_on_meta_error,
serial_frame_constructor,
serial_series_constructor,
valid_divisions,
)
from dask.delayed import Delayed, delayed, unpack_collections
@@ -2482,8 +2484,8 @@ def _convert_time_cols_to_numeric(self, time_cols, axis, meta, skipna):
# since each standard deviation will just be NaN
needs_time_conversion = False
numeric_dd = from_pandas(
pd.DataFrame(
{"_": pd.Series([np.nan])},
serial_frame_constructor(self)(
{"_": serial_series_constructor(self)([np.nan])},
index=self.index,
),
npartitions=self.npartitions,
@@ -3062,7 +3064,7 @@ def _cum_agg(
_take_last,
cumpart,
skipna,
meta=pd.Series([], dtype="float"),
meta=serial_series_constructor(self)([], dtype="float"),
token=name2,
)

@@ -3260,7 +3262,7 @@ def dot(self, other, meta=no_default):
def _dot_series(*args, **kwargs):
# .sum() is invoked on each partition before being applied to all
# partitions. The return type is expected to be a series, not a numpy object
return pd.Series(M.dot(*args, **kwargs))
return serial_series_constructor(self)(M.dot(*args, **kwargs))

return self.map_partitions(_dot_series, other, token="dot", meta=meta).sum(
skipna=False
@@ -3532,7 +3534,7 @@ def __array_wrap__(self, array, context=None):
f"{method_name} is not implemented for `dask.dataframe.Series`."
)

return pd.Series(array, index=index, name=self.name)
return serial_series_constructor(self)(array, index=index, name=self.name)

@property
def axes(self):
@@ -3675,7 +3677,9 @@ def rename(self, index=None, inplace=False, sorted_index=False):
res = self.map_partitions(M.rename, index, enforce_metadata=False)
if self.known_divisions:
if sorted_index and (callable(index) or is_dict_like(index)):
old = pd.Series(range(self.npartitions + 1), index=self.divisions)
old = serial_series_constructor(self)(
range(self.npartitions + 1), index=self.divisions
)
new = old.rename(index).index
if not new.is_monotonic_increasing:
msg = (
@@ -4398,7 +4402,9 @@ def map(self, arg, na_action=None, meta=no_default, is_monotonic=False):
applied = super().map(arg, na_action=na_action, meta=meta)
if is_monotonic and self.known_divisions:
applied.divisions = tuple(
pd.Series(self.divisions).map(arg, na_action=na_action)
serial_series_constructor(self)(self.divisions).map(
arg, na_action=na_action
)
)
else:
applied = applied.clear_divisions()
@@ -4513,7 +4519,7 @@ def __array_wrap__(self, array, context=None):
f"{method_name} is not implemented for `dask.dataframe.DataFrame`."
)

return pd.DataFrame(array, index=index, columns=self.columns)
return serial_frame_constructor(self)(array, index=index, columns=self.columns)

@property
def axes(self):
@@ -5973,7 +5979,9 @@ def _repr_data(self):
index = self._repr_divisions
cols = meta.columns
if len(cols) == 0:
series_df = pd.DataFrame([[]] * len(index), columns=cols, index=index)
series_df = serial_frame_constructor(self)(
[[]] * len(index), columns=cols, index=index
)
else:
series_df = pd.concat(
[_repr_data_series(s, index=index) for _, s in meta.items()], axis=1
@@ -6345,7 +6353,7 @@ def split_evenly(df, k):
def split_out_on_index(df):
h = df.index
if isinstance(h, pd.MultiIndex):
h = pd.DataFrame([], index=h).reset_index()
h = serial_frame_constructor(df)([], index=h).reset_index()
return h


@@ -7070,6 +7078,7 @@ def cov_corr(df, min_periods=None, corr=False, scalar=False, split_every=False):
min_periods,
corr,
scalar,
df._meta,
)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
if scalar:
@@ -7150,7 +7159,7 @@ def cov_corr_combine(data_in, corr=False):
return out


def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False, like_df=None):
Member: What is `data` here? Can we get the DataFrame type from that?

Member Author: `data` is `list[dict[str, np/cupy.ndarray]]` here. Therefore, we would need to extend `serial_frame_constructor` to handle array-like data if we want to avoid the `like_df` argument.

Member Author: 1ba0353 adds a `serial_constructor_from_array` dispatch (but doesn't actually use it yet) to illustrate what it would probably look like.
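For context, here is a rough, hypothetical sketch of what a dispatch-based constructor lookup for array types could look like. The names, registrations, and structure below are illustrative only and are not taken from 1ba0353:

```python
# Hypothetical sketch only -- illustrative, not the contents of commit 1ba0353.
import numpy as np
import pandas as pd

from dask.utils import Dispatch

serial_constructor_from_array = Dispatch("serial_constructor_from_array")


@serial_constructor_from_array.register(np.ndarray)
def _serial_constructor_numpy(arr):
    # NumPy-backed arrays map to plain pandas containers.
    return pd.DataFrame


@serial_constructor_from_array.register_lazy("cupy")
def _register_cupy():
    # Only register the cupy path when cupy/cudf are actually importable.
    import cudf
    import cupy

    @serial_constructor_from_array.register(cupy.ndarray)
    def _serial_constructor_cupy(arr):
        # GPU-backed arrays map to cuDF containers.
        return cudf.DataFrame
```

With something along these lines, `cov_corr_agg` could call `serial_constructor_from_array(mat)(mat, columns=cols, index=cols)` instead of threading a `like_df` argument through the graph.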

out = cov_corr_combine(data, corr)
counts = out["count"]
C = out["cov"]
@@ -7164,7 +7173,7 @@ def cov_corr_agg(data, cols, min_periods=2, corr=False, scalar=False):
mat = C / den
if scalar:
return float(mat[0, 1])
return pd.DataFrame(mat, columns=cols, index=cols)
return serial_frame_constructor(like_df)(mat, columns=cols, index=cols)


def pd_split(df, p, random_state=None, shuffle=False):
@@ -7477,7 +7486,7 @@ def repartition_size(df, size):
split_mem_usages = []
for n, usage in zip(nsplits, mem_usages):
split_mem_usages.extend([usage / n] * n)
mem_usages = pd.Series(split_mem_usages)
mem_usages = serial_series_constructor(df)(split_mem_usages)

# 2. now that all partitions are less than size, concat them up to size
assert np.all(mem_usages <= size)
@@ -7509,7 +7518,9 @@ def repartition_npartitions(df, npartitions):
else:
# Drop duplicates in case last partition has same
# value for min and max division
original_divisions = divisions = pd.Series(df.divisions).drop_duplicates()
original_divisions = divisions = serial_series_constructor(df)(
df.divisions
).drop_duplicates()
if df.known_divisions and (
np.issubdtype(divisions.dtype, np.datetime64)
or np.issubdtype(divisions.dtype, np.number)
@@ -7528,7 +7539,9 @@
)
if np.issubdtype(original_divisions.dtype, np.datetime64):
divisions = methods.tolist(
pd.Series(divisions).astype(original_divisions.dtype)
serial_series_constructor(df)(divisions).astype(
original_divisions.dtype
)
)
elif np.issubdtype(original_divisions.dtype, np.integer):
divisions = divisions.astype(original_divisions.dtype)
@@ -7679,10 +7692,10 @@ def idxmaxmin_chunk(x, fn=None, skipna=True):
idx = getattr(x, fn)(skipna=skipna)
value = getattr(x, minmax)(skipna=skipna)
else:
idx = value = pd.Series([], dtype="i8")
idx = value = serial_series_constructor(x)([], dtype="i8")
if is_series_like(idx):
return pd.DataFrame({"idx": idx, "value": value})
return pd.DataFrame({"idx": [idx], "value": [value]})
return serial_frame_constructor(x)({"idx": idx, "value": value})
return serial_frame_constructor(x)({"idx": [idx], "value": [value]})


def idxmaxmin_row(x, fn=None, skipna=True):
@@ -7692,8 +7705,8 @@
idx = [getattr(x.value, fn)(skipna=skipna)]
value = [getattr(x.value, minmax)(skipna=skipna)]
else:
idx = value = pd.Series([], dtype="i8")
return pd.DataFrame({"idx": idx, "value": value})
idx = value = serial_series_constructor(x)([], dtype="i8")
return serial_frame_constructor(x)({"idx": idx, "value": value})


def idxmaxmin_combine(x, fn=None, skipna=True):
@@ -7787,7 +7800,7 @@ def to_datetime(arg, meta=None, **kwargs):
"non-index-able arguments (like scalars)"
)
else:
meta = pd.Series([pd.Timestamp("2000", **tz_kwarg)])
meta = serial_series_constructor(arg)([pd.Timestamp("2000", **tz_kwarg)])
meta.index = meta.index.astype(arg.index.dtype)
meta.index.name = arg.index.name
return map_partitions(pd.to_datetime, arg, meta=meta, **kwargs)
@@ -7797,7 +7810,7 @@
def to_timedelta(arg, unit=None, errors="raise"):
if not PANDAS_GT_110 and unit is None:
unit = "ns"
meta = pd.Series([pd.Timedelta(1, unit=unit)])
meta = serial_series_constructor(arg)([pd.Timedelta(1, unit=unit)])
return map_partitions(pd.to_timedelta, arg, unit=unit, errors=errors, meta=meta)


36 changes: 36 additions & 0 deletions dask/dataframe/tests/test_dataframe.py
@@ -3202,6 +3202,24 @@ def test_cov():
assert res._name != res3._name


@pytest.mark.gpu
def test_cov_gpu():
Comment on lines +3204 to +3205

Member: If the backend PR goes in before this PR, could we just reuse test_cov with the backend engine parametrized?

Member Author: Perhaps. We would need to change the _compat dataframe-creation machinery to use dispatchable functions to do this.
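For illustration, a hypothetical sketch of the parametrization idea above (not part of this PR; it assumes cuDF frames can be passed to dd.from_pandas, as the surrounding test already does, and the test name is made up):

```python
# Hypothetical sketch of reusing one cov test across backends -- not part of this PR.
import pytest

import dask.dataframe as dd
from dask.dataframe.utils import assert_eq


@pytest.mark.parametrize(
    "backend",
    ["pandas", pytest.param("cudf", marks=pytest.mark.gpu)],
)
def test_cov_parametrized(backend):
    # Build the same frame with whichever backend library is requested.
    lib = pytest.importorskip(backend)
    df = lib.DataFrame({"a": list(range(20)), "b": list(range(20, 40))})
    ddf = dd.from_pandas(df, npartitions=4)

    res = ddf.cov()
    res2 = ddf.cov(split_every=2)
    sol = df.cov()
    assert_eq(res.compute(), sol)
    assert_eq(res2.compute(), sol)
    assert res._name == ddf.cov()._name
    assert res._name != res2._name
```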

cudf = pytest.importorskip("cudf")

# cudf DataFrame
df = cudf.from_pandas(_compat.makeDataFrame())
ddf = dd.from_pandas(df, npartitions=6)

res = ddf.cov()
res2 = ddf.cov(split_every=2)
sol = df.cov()
# TODO: dtype of res.index is wrong before compute
assert_eq(res.compute(), sol)
assert_eq(res2.compute(), sol)
assert res._name == ddf.cov()._name
assert res._name != res2._name


def test_corr():
# DataFrame
df = _compat.makeMissingDataframe()
@@ -3249,6 +3267,24 @@ def test_corr():
pytest.raises(TypeError, lambda: da.corr(ddf))


@pytest.mark.gpu
def test_corr_gpu():
Comment on lines +3268 to +3269

Member: Similar comment here.

cudf = pytest.importorskip("cudf")

# cudf DataFrame
df = cudf.from_pandas(_compat.makeDataFrame())
ddf = dd.from_pandas(df, npartitions=6)

res = ddf.corr()
res2 = ddf.corr(split_every=2)
sol = df.corr()
# TODO: dtype of res.index is wrong before compute
assert_eq(res.compute(), sol)
assert_eq(res2.compute(), sol)
assert res._name == ddf.corr()._name
assert res._name != res2._name


def test_corr_same_name():
# Series with same names (see https://github.com/dask/dask/issues/4906)

20 changes: 20 additions & 0 deletions dask/dataframe/utils.py
@@ -734,3 +734,23 @@ def drop_by_shallow_copy(df, columns, errors="raise"):

class AttributeNotImplementedError(NotImplementedError, AttributeError):
"""NotImplementedError and AttributeError"""


def serial_frame_constructor(like=None):
"""Return a serial DataFrame constructor"""
if is_dask_collection(like):
like = like._meta
if hasattr(like, "to_frame"):
# `like` is a Series rather than a DataFrame
like = like.iloc[:1].to_frame()
return pd.DataFrame if like is None else like._constructor


def serial_series_constructor(like=None):
"""Return a serial Series constructor"""
if is_dask_collection(like):
like = like._meta
if not hasattr(like, "to_frame"):
# `like` is a DataFrame rather than a Series
return like._constructor_sliced
return pd.Series if like is None else like._constructor
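As a usage illustration of the helpers above, here is a minimal pandas-backed sketch, assuming they are imported from dask.dataframe.utils (the file shown here); the same calls would resolve to cudf constructors for cudf-backed objects:

```python
# Minimal usage sketch of serial_frame_constructor / serial_series_constructor.
import pandas as pd

import dask.dataframe as dd
from dask.dataframe.utils import serial_frame_constructor, serial_series_constructor

pdf = pd.DataFrame({"x": [1, 2, 3]})
ddf = dd.from_pandas(pdf, npartitions=2)

# Dask collections are reduced to their `_meta`, then to the backing constructor.
assert serial_frame_constructor(ddf) is pd.DataFrame
assert serial_series_constructor(ddf["x"]) is pd.Series

# A (serial) DataFrame argument yields the sliced constructor for Series.
assert serial_series_constructor(pdf) is pd.Series

# With no argument, the pandas constructor is the default.
assert serial_frame_constructor() is pd.DataFrame
```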