Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Make describe changes backwards compatible #34798

Merged
merged 10 commits into from
Jul 14, 2020
10 changes: 1 addition & 9 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ Other enhancements
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`)
- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`)
- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
Expand Down Expand Up @@ -675,15 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once

df.apply(func, axis=1)

.. _whatsnew_110.api.other:

Other API changes
^^^^^^^^^^^^^^^^^

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes; the statistics ``first`` and ``last``
will now be reported as ``min`` and ``max`` to match numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)


Increased minimum versions for dependencies
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
54 changes: 48 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
return np.abs(self)

def describe(
self: FrameOrSeries, percentiles=None, include=None, exclude=None
self: FrameOrSeries,
percentiles=None,
include=None,
exclude=None,
datetime_is_numeric=False,
) -> FrameOrSeries:
"""
Generate descriptive statistics.
Expand Down Expand Up @@ -9757,6 +9761,12 @@ def describe(
``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
exclude pandas categorical columns, use ``'category'``
- None (default) : The result will exclude nothing.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric. This affects statistics
calculated for the column. For DataFrame input, this also
controls whether datetime columns are included by default.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -9834,7 +9844,7 @@ def describe(
... np.datetime64("2010-01-01"),
... np.datetime64("2010-01-01")
... ])
>>> s.describe()
>>> s.describe(datetime_is_numeric=True)
count 3
mean 2006-09-01 08:00:00
min 2000-01-01 00:00:00
Expand Down Expand Up @@ -9992,8 +10002,37 @@ def describe_categorical_1d(data):
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
names += ["top", "freq"]
result += [top, freq]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
jreback marked this conversation as resolved.
Show resolved Hide resolved
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
Expand All @@ -10019,7 +10058,7 @@ def describe_1d(data):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype):
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
Expand All @@ -10030,7 +10069,10 @@ def describe_1d(data):
return describe_1d(self)
elif (include is None) and (exclude is None):
# when some numerics are found, keep only numerics
data = self.select_dtypes(include=[np.number])
default_include = [np.number]
if datetime_is_numeric:
default_include.append("datetime")
data = self.select_dtypes(include=default_include)
if len(data.columns) == 0:
data = self
elif include == "all":
Expand Down
64 changes: 63 additions & 1 deletion pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
result = df.describe(include="all", datetime_is_numeric=True)
tm.assert_frame_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
    """Check ``describe(datetime_is_numeric=True)`` on a datetime + int frame.

    With no ``include``/``exclude`` given, the expected result carries both
    the datetime column "a" and the integer column "b".
    """
    stat_labels = ["count", "mean", "min", "25%", "50%", "75%", "max", "std"]
    datetime_stats = [
        3,
        pd.Timestamp("2012-01-02"),
        pd.Timestamp("2012-01-01"),
        pd.Timestamp("2012-01-01T12:00:00"),
        pd.Timestamp("2012-01-02"),
        pd.Timestamp("2012-01-02T12:00:00"),
        pd.Timestamp("2012-01-03"),
        np.nan,  # the "std" slot is expected to be missing for datetimes
    ]
    numeric_stats = [3, 2, 1, 1.5, 2, 2.5, 3, 1]
    expected = pd.DataFrame(
        {"a": datetime_stats, "b": numeric_stats},
        index=stat_labels,
    )

    frame = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]})
    result = frame.describe(datetime_is_numeric=True)

    tm.assert_frame_equal(result, expected)

def test_describe_tz_values2(self):
# Checks the legacy (datetime_is_numeric not passed) behaviour of
# DataFrame.describe(include="all") on a frame holding an integer column
# and a tz-aware datetime column: a FutureWarning is expected and the
# datetime column is expected to report count/unique/top/freq/first/last.
tz = "CET"
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = pd.DataFrame({"s1": s1, "s2": s2})

# Expected statistics for the integer column come from Series.describe itself.
s1_ = s1.describe()
# NOTE(review): the lines up to "can you make as a separate test" look like
# review-thread text captured from the web page rather than test code.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make as a separate test

# Expected statistics for the tz-aware column, built by hand.
s2_ = pd.Series(
[
5,
5,
s2.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
index=["count", "unique", "top", "freq", "first", "last"],
)
# Row order the combined expected frame is re-indexed to via .loc below.
idx = [
"count",
"unique",
"top",
"freq",
"first",
"last",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

# describe() is called without datetime_is_numeric, so the deprecation
# warning must be raised.
with tm.assert_produces_warning(FutureWarning):
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_describe_percentiles_integer_idx(self):
Expand Down
42 changes: 41 additions & 1 deletion pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
result = s.describe(datetime_is_numeric=True)
expected = Series(
[
5,
Expand All @@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture):
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

def test_describe_with_tz_warns(self):
# Checks that Series.describe() on tz-aware datetime data, called without
# datetime_is_numeric, raises a FutureWarning and returns the
# count/unique/top/freq/first/last summary.
name = tz = "CET"
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)

with tm.assert_produces_warning(FutureWarning):
# NOTE(review): the lines up to "same" look like review-thread text
# captured from the web page rather than test code.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

result = s.describe()

# Expected values are built by hand; "first"/"last" are the tz-localized
# range endpoints.
expected = Series(
[
5,
5,
s.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
name=name,
index=["count", "unique", "top", "freq", "first", "last"],
)
tm.assert_series_equal(result, expected)

def test_datetime_is_numeric_includes_datetime(self):
    """Check ``Series.describe(datetime_is_numeric=True)`` on datetime data.

    The expected summary uses the numeric-style labels
    (count/mean/min/percentiles/max) with Timestamp values.
    """
    stat_labels = ["count", "mean", "min", "25%", "50%", "75%", "max"]
    stat_values = [
        3,
        Timestamp("2012-01-02"),
        Timestamp("2012-01-01"),
        Timestamp("2012-01-01T12:00:00"),
        Timestamp("2012-01-02"),
        Timestamp("2012-01-02T12:00:00"),
        Timestamp("2012-01-03"),
    ]
    expected = Series(stat_values, index=stat_labels)

    dates = Series(date_range("2012", periods=3))
    result = dates.describe(datetime_is_numeric=True)

    tm.assert_series_equal(result, expected)