Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Make describe changes backwards compatible #34798

Merged
merged 10 commits into from
Jul 14, 2020
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -668,8 +668,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once
Other API changes
^^^^^^^^^^^^^^^^^

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
Expand Down Expand Up @@ -783,6 +781,7 @@ Deprecations

- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)

- :meth:`Series.describe` and :meth:`DataFrame.describe` treating datetime dtypes as categorical rather than numeric is deprecated. Specify ``datetime_is_numeric=True`` to show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`, :issue:`33903`)
- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`)
Expand Down
47 changes: 42 additions & 5 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
return np.abs(self)

def describe(
self: FrameOrSeries, percentiles=None, include=None, exclude=None
self: FrameOrSeries,
percentiles=None,
include=None,
exclude=None,
datetime_is_numeric=False,
) -> FrameOrSeries:
"""
Generate descriptive statistics.
Expand Down Expand Up @@ -9757,6 +9761,10 @@ def describe(
``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
exclude pandas categorical columns, use ``'category'``
- None (default) : The result will exclude nothing.
datetime_is_numeric : bool, default False
Whether to treat datetime dtypes as numeric.

.. versionadded:: 1.1.0

Returns
-------
Expand Down Expand Up @@ -9834,7 +9842,7 @@ def describe(
... np.datetime64("2010-01-01"),
... np.datetime64("2010-01-01")
... ])
>>> s.describe()
>>> s.describe(datetime_is_numeric=True)
count 3
mean 2006-09-01 08:00:00
min 2000-01-01 00:00:00
Expand Down Expand Up @@ -9992,8 +10000,37 @@ def describe_categorical_1d(data):
dtype = None
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
names += ["top", "freq"]
result += [top, freq]
if is_datetime64_any_dtype(data.dtype):
if self.ndim == 1:
stacklevel = 4
else:
stacklevel = 5
warnings.warn(
"Treating datetime data as categorical rather than numeric in "
"`.describe` is deprecated and will be removed in a future "
"version of pandas. Specify `datetime_is_numeric=True` to "
"silence this warning and adopt the future behavior now.",
FutureWarning,
stacklevel=stacklevel,
)
tz = data.dt.tz
asint = data.dropna().values.view("i8")
top = Timestamp(top)
if top.tzinfo is not None and tz is not None:
jreback marked this conversation as resolved.
Show resolved Hide resolved
# Don't tz_localize(None) if key is already tz-aware
top = top.tz_convert(tz)
else:
top = top.tz_localize(tz)
names += ["top", "freq", "first", "last"]
result += [
top,
freq,
Timestamp(asint.min(), tz=tz),
Timestamp(asint.max(), tz=tz),
]
else:
names += ["top", "freq"]
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
Expand All @@ -10019,7 +10056,7 @@ def describe_1d(data):
return describe_categorical_1d(data)
elif is_numeric_dtype(data):
return describe_numeric_1d(data)
elif is_datetime64_any_dtype(data.dtype):
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
return describe_timestamp_1d(data)
elif is_timedelta64_dtype(data.dtype):
return describe_numeric_1d(data)
Expand Down
43 changes: 42 additions & 1 deletion pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,48 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
result = df.describe(include="all")
result = df.describe(include="all", datetime_is_numeric=True)
tm.assert_frame_equal(result, expected)

def test_describe_tz_values2(self):
tz = "CET"
s1 = Series(range(5))
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s2 = Series(date_range(start, end, tz=tz))
df = pd.DataFrame({"s1": s1, "s2": s2})

s1_ = s1.describe()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you make as a separate test

s2_ = pd.Series(
[
5,
5,
s2.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
index=["count", "unique", "top", "freq", "first", "last"],
)
idx = [
"count",
"unique",
"top",
"freq",
"first",
"last",
"mean",
"std",
"min",
"25%",
"50%",
"75%",
"max",
]
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]

with tm.assert_produces_warning(FutureWarning):
result = df.describe(include="all")
tm.assert_frame_equal(result, expected)

def test_describe_percentiles_integer_idx(self):
Expand Down
25 changes: 24 additions & 1 deletion pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
result = s.describe()
result = s.describe(datetime_is_numeric=True)
expected = Series(
[
5,
Expand All @@ -98,3 +98,26 @@ def test_describe_with_tz(self, tz_naive_fixture):
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)

def test_describe_with_tz_warns(self):
name = tz = "CET"
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)

with tm.assert_produces_warning(FutureWarning):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

result = s.describe()

expected = Series(
[
5,
5,
s.value_counts().index[0],
1,
start.tz_localize(tz),
end.tz_localize(tz),
],
name=name,
index=["count", "unique", "top", "freq", "first", "last"],
)
tm.assert_series_equal(result, expected)