Filter out numeric_only warnings from pandas (#9496)
* Initial checkpoint

* test-upstream

* Pass method name [test-upstream]

* Groupby [test-upstream]

* Cleanup [test-upstream]

* More specific warning catching [test-upstream]

* Remove stray breakpoint [test-upstream]

* Fix categorical tests [test-upstream]

* Restore npartitions after debugging [test-upstream]

* Updates [test-upstream]

* Roll back columns [test-upstream]

* Be more explicit about method name in _getattr_numeric_only [test-upstream]

* Use more specific parameter for method name [test-upstream]
jrbourbeau committed Sep 15, 2022
1 parent 803c7fd commit 1a8533f
Showing 7 changed files with 190 additions and 47 deletions.
17 changes: 17 additions & 0 deletions dask/dataframe/_compat.py
@@ -1,4 +1,6 @@
+import contextlib
 import string
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -82,3 +84,18 @@ def makeMixedDataFrame():
         }
     )
     return df
+
+
+@contextlib.contextmanager
+def check_numeric_only_deprecation():
+
+    if PANDAS_GT_150:
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message="The default value of numeric_only in",
+                category=FutureWarning,
+            )
+            yield
+    else:
+        yield
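For context: on pandas 1.5+, reductions such as DataFrame.mean emit "FutureWarning: The default value of numeric_only in ... is deprecated" when a frame has non-numeric columns. The context manager above filters exactly that warning, and only on pandas >= 1.5. A minimal usage sketch (hypothetical call site, not part of this diff):

    import pandas as pd
    from dask.dataframe._compat import check_numeric_only_deprecation

    df = pd.DataFrame({"x": [1, 2], "y": ["a", "b"]})
    with check_numeric_only_deprecation():
        result = df.mean()  # numeric_only FutureWarning is suppressed on pandas>=1.5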
98 changes: 81 additions & 17 deletions dask/dataframe/core.py
@@ -33,7 +33,11 @@
 from dask.blockwise import Blockwise, BlockwiseDep, BlockwiseDepDict, blockwise
 from dask.context import globalmethod
 from dask.dataframe import methods
-from dask.dataframe._compat import PANDAS_GT_140, PANDAS_GT_150
+from dask.dataframe._compat import (
+    PANDAS_GT_140,
+    PANDAS_GT_150,
+    check_numeric_only_deprecation,
+)
 from dask.dataframe.accessor import CachedAccessor, DatetimeAccessor, StringAccessor
 from dask.dataframe.categorical import CategoricalAccessor, categorize
 from dask.dataframe.dispatch import (
@@ -1886,26 +1890,49 @@ def shift(self, periods=1, freq=None, axis=0):
         )
         return maybe_shift_divisions(out, periods, freq=freq)
 
-    def _reduction_agg(self, name, axis=None, skipna=True, split_every=False, out=None):
+    def _reduction_agg(
+        self,
+        name,
+        axis=None,
+        skipna=True,
+        split_every=False,
+        out=None,
+        numeric_only=None,
+    ):
         axis = self._validate_axis(axis)
 
-        meta = getattr(self._meta_nonempty, name)(axis=axis, skipna=skipna)
-        token = self._token_prefix + name
+        if has_keyword(getattr(self._meta_nonempty, name), "numeric_only"):
+            numeric_only_kwargs = {"numeric_only": numeric_only}
+        else:
+            numeric_only_kwargs = {}
 
-        method = getattr(M, name)
+        with check_numeric_only_deprecation():
+            meta = getattr(self._meta_nonempty, name)(
+                axis=axis, skipna=skipna, **numeric_only_kwargs
+            )
+
+        token = self._token_prefix + name
         if axis == 1:
             result = self.map_partitions(
-                method, meta=meta, token=token, skipna=skipna, axis=axis
+                _getattr_numeric_only,
+                meta=meta,
+                token=token,
+                skipna=skipna,
+                axis=axis,
+                _dask_method_name=name,
+                **numeric_only_kwargs,
             )
             return handle_out(out, result)
         else:
             result = self.reduction(
-                method,
+                _getattr_numeric_only,
                 meta=meta,
                 token=token,
                 skipna=skipna,
                 axis=axis,
                 split_every=split_every,
+                _dask_method_name=name,
+                **numeric_only_kwargs,
             )
             if isinstance(self, DataFrame):
                 result.divisions = (self.columns.min(), self.columns.max())
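The has_keyword gate above forwards numeric_only only when the underlying pandas method actually accepts that keyword, so older or keyword-less signatures keep working. An illustrative check (pandas 1.5-era signatures; verify against your pandas version):

    import pandas as pd
    from dask.utils import has_keyword

    has_keyword(pd.DataFrame.sum, "numeric_only")  # True: sum() accepts numeric_only
    has_keyword(pd.DataFrame.all, "numeric_only")  # False: all() takes bool_only instead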
@@ -1982,7 +2009,11 @@ def prod(
         numeric_only=None,
     ):
         result = self._reduction_agg(
-            "prod", axis=axis, skipna=skipna, split_every=split_every, out=out
+            "prod",
+            axis=axis,
+            skipna=skipna,
+            split_every=split_every,
+            out=out,
         )
         if min_count:
             cond = self.notnull().sum(axis=axis) >= min_count
@@ -2003,7 +2034,11 @@ def max(
         self, axis=None, skipna=True, split_every=False, out=None, numeric_only=None
     ):
         return self._reduction_agg(
-            "max", axis=axis, skipna=skipna, split_every=split_every, out=out
+            "max",
+            axis=axis,
+            skipna=skipna,
+            split_every=split_every,
+            out=out,
         )
 
     @_numeric_only
@@ -2012,7 +2047,11 @@ def min(
         self, axis=None, skipna=True, split_every=False, out=None, numeric_only=None
     ):
         return self._reduction_agg(
-            "min", axis=axis, skipna=skipna, split_every=split_every, out=out
+            "min",
+            axis=axis,
+            skipna=skipna,
+            split_every=split_every,
+            out=out,
         )
 
     @derived_from(pd.DataFrame)
@@ -2132,7 +2171,11 @@ def mean(
     ):
         axis = self._validate_axis(axis)
         _raise_if_object_series(self, "mean")
-        meta = self._meta_nonempty.mean(axis=axis, skipna=skipna)
+        # NOTE: Do we want to warn here?
+        with check_numeric_only_deprecation():
+            meta = self._meta_nonempty.mean(
+                axis=axis, skipna=skipna, numeric_only=numeric_only
+            )
         if axis == 1:
             result = map_partitions(
                 M.mean,
@@ -2142,6 +2185,7 @@
                 axis=axis,
                 skipna=skipna,
                 enforce_metadata=False,
+                numeric_only=numeric_only,
             )
             return handle_out(out, result)
         else:
@@ -2204,7 +2248,10 @@ def var(
     ):
         axis = self._validate_axis(axis)
         _raise_if_object_series(self, "var")
-        meta = self._meta_nonempty.var(axis=axis, skipna=skipna)
+        with check_numeric_only_deprecation():
+            meta = self._meta_nonempty.var(
+                axis=axis, skipna=skipna, numeric_only=numeric_only
+            )
         if axis == 1:
             result = map_partitions(
                 M.var,
@@ -2215,6 +2262,7 @@
                 skipna=skipna,
                 ddof=ddof,
                 enforce_metadata=False,
+                numeric_only=numeric_only,
             )
             return handle_out(out, result)
         else:
@@ -2351,7 +2399,10 @@ def std(
         _raise_if_object_series(self, "std")
         _raise_if_not_series_or_dataframe(self, "std")
 
-        meta = self._meta_nonempty.std(axis=axis, skipna=skipna)
+        with check_numeric_only_deprecation():
+            meta = self._meta_nonempty.std(
+                axis=axis, skipna=skipna, numeric_only=numeric_only
+            )
         is_df_like = is_dataframe_like(self._meta)
         needs_time_conversion = False
         numeric_dd = self
@@ -2378,6 +2429,7 @@
                 skipna=skipna,
                 ddof=ddof,
                 enforce_metadata=False,
+                numeric_only=numeric_only,
                 parent_meta=self._meta,
             )
             return handle_out(out, result)
@@ -2671,7 +2723,10 @@ def _kurtosis_numeric(self, fisher=True, bias=True, nan_policy="propagate"):
     def sem(self, axis=None, skipna=True, ddof=1, split_every=False, numeric_only=None):
         axis = self._validate_axis(axis)
         _raise_if_object_series(self, "sem")
-        meta = self._meta_nonempty.sem(axis=axis, skipna=skipna, ddof=ddof)
+        with check_numeric_only_deprecation():
+            meta = self._meta_nonempty.sem(
+                axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only
+            )
         if axis == 1:
             return map_partitions(
                 M.sem,
@@ -2682,6 +2737,7 @@ def sem(self, axis=None, skipna=True, ddof=1, split_every=False, numeric_only=None):
                 skipna=skipna,
                 ddof=ddof,
                 parent_meta=self._meta,
+                numeric_only=numeric_only,
             )
         else:
             num = self._get_numeric_data()
@@ -2701,7 +2757,8 @@ def sem(self, axis=None, skipna=True, ddof=1, split_every=False, numeric_only=None):
             result.divisions = (self.columns.min(), self.columns.max())
             return result
 
-    def quantile(self, q=0.5, axis=0, method="default"):
+    @_numeric_only
+    def quantile(self, q=0.5, axis=0, numeric_only=True, method="default"):
         """Approximate row-wise and precise column-wise quantiles of DataFrame
 
         Parameters
@@ -2717,24 +2774,26 @@
         """
         axis = self._validate_axis(axis)
         keyname = "quantiles-concat--" + tokenize(self, q, axis)
+        meta = self._meta.quantile(q, axis=axis, numeric_only=numeric_only)
 
         if axis == 1:
             if isinstance(q, list):
                 # Not supported, the result will have current index as columns
                 raise ValueError("'q' must be scalar when axis=1 is specified")
 
             return map_partitions(
                 M.quantile,
                 self,
                 q,
                 axis,
                 token=keyname,
                 enforce_metadata=False,
+                numeric_only=numeric_only,
                 meta=(q, "f8"),
                 parent_meta=self._meta,
             )
         else:
             _raise_if_object_series(self, "quantile")
-            meta = self._meta.quantile(q, axis=axis)
             num = self._get_numeric_data()
             quantiles = tuple(quantile(self[c], q, method) for c in num.columns)
 
@@ -6519,7 +6578,7 @@ def _emulate(func, *args, udf=False, **kwargs):
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
     dd.Series, using internal cache (``_meta``) for calculation
     """
-    with raise_on_meta_error(funcname(func), udf=udf):
+    with raise_on_meta_error(funcname(func), udf=udf), check_numeric_only_deprecation():
         return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
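Since _emulate runs the wrapped function against the cached ``_meta`` frames, adding check_numeric_only_deprecation here silences the warning for every meta computation routed through it. The comma-separated context managers compose exactly like nesting; the changed line is equivalent to this rewrite (illustrative only, not part of the diff):

    with raise_on_meta_error(funcname(func), udf=udf):
        with check_numeric_only_deprecation():
            return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))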


@@ -8123,3 +8182,8 @@ def _raise_if_not_series_or_dataframe(x, funcname):
             "`%s` is only supported with objects that are Dataframes or Series"
             % funcname
         )
+
+
+def _getattr_numeric_only(*args, _dask_method_name, **kwargs):
+    with check_numeric_only_deprecation():
+        return getattr(M, _dask_method_name)(*args, **kwargs)
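Because _getattr_numeric_only is a plain module-level function, it serializes cleanly into the task graph, and the warning filter is re-entered inside each task on the worker where the pandas method actually executes. Roughly, a task emitted by _reduction_agg("sum", ...) behaves like this sketch (the partition frame ``part`` is a hypothetical stand-in):

    import pandas as pd
    from dask.dataframe.core import _getattr_numeric_only

    part = pd.DataFrame({"x": [1.0, 2.0], "y": ["a", "b"]})
    # Via dask.utils.M this resolves to part.sum(axis=0, skipna=True); any
    # "default value of numeric_only" FutureWarning pandas emits is filtered:
    res = _getattr_numeric_only(part, axis=0, skipna=True, _dask_method_name="sum")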
11 changes: 7 additions & 4 deletions dask/dataframe/groupby.py
@@ -10,7 +10,7 @@
 
 from dask import config
 from dask.base import tokenize
-from dask.dataframe._compat import PANDAS_GT_150
+from dask.dataframe._compat import PANDAS_GT_150, check_numeric_only_deprecation
 from dask.dataframe.core import (
     GROUP_KEYS_DEFAULT,
     DataFrame,
@@ -346,15 +346,17 @@ def _var_chunk(df, *by):
     df = df.copy()
 
     g = _groupby_raise_unaligned(df, by=by)
-    x = g.sum()
+    with check_numeric_only_deprecation():
+        x = g.sum()
 
     n = g[x.columns].count().rename(columns=lambda c: (c, "-count"))
 
     cols = x.columns
     df[cols] = df[cols] ** 2
 
     g2 = _groupby_raise_unaligned(df, by=by)
-    x2 = g2.sum().rename(columns=lambda c: (c, "-x2"))
+    with check_numeric_only_deprecation():
+        x2 = g2.sum().rename(columns=lambda c: (c, "-x2"))
 
     return concat([x, x2, n], axis=1)

@@ -1251,7 +1253,8 @@ def _aca_agg(
             aggfunc = func
 
         if meta is None:
-            meta = func(self._meta_nonempty)
+            with check_numeric_only_deprecation():
+                meta = func(self._meta_nonempty)
 
         if chunk_kwargs is None:
             chunk_kwargs = {}
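Here ``func`` is the pandas-level aggregation applied to ``self._meta_nonempty`` only to infer output metadata, so without the wrapper the warning would surface at graph-construction time, before any compute. A sketch of the pattern being wrapped (hypothetical stand-ins for the meta frame and ``func``):

    import pandas as pd
    from dask.dataframe._compat import check_numeric_only_deprecation

    meta_nonempty = pd.DataFrame({"a": [1.0, 2.0], "b": ["x", "y"], "c": ["p", "q"]})
    grouped = meta_nonempty.groupby("b")  # stand-in for the groupby's _meta_nonempty
    func = lambda g: g.mean()             # stand-in for the real aggfunc
    with check_numeric_only_deprecation():
        meta = func(grouped)  # nuisance-column FutureWarning filtered on pandas 1.5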