Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: change get_dummies default dtype to bool #48022

Merged
merged 21 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1eb5cd6
ENH: Warn when dtype is not passed to get_dummies
kianelbo Aug 10, 2022
efa678b
Edit get_dummies' dtype warning
kianelbo Aug 10, 2022
472fa28
Add whatsnew entry for issue #45848
kianelbo Aug 10, 2022
2ead750
Fix dtype warning test
kianelbo Aug 10, 2022
ddcc7d3
Suppress warnings in docs
kianelbo Aug 10, 2022
81dbb87
Edit whatsnew entry
kianelbo Aug 10, 2022
45d9c79
Merge branch 'main' into 'getdummies-default-dtype'
kianelbo Aug 23, 2022
f97df66
Fix find_stack_level in get_dummies dtype warning
kianelbo Aug 23, 2022
707a222
Merge branch 'main' into getdummies-default-dtype
kianelbo Sep 21, 2022
15aeb3e
Change the default dtype of get_dummies to bool
kianelbo Sep 22, 2022
a5f709d
Merge branch 'main' into 'getdummies-default-dtype'
kianelbo Sep 23, 2022
a246b8c
Revert dtype(bool) change
kianelbo Sep 25, 2022
7d72067
Merge branch 'main' again
kianelbo Sep 27, 2022
940bd11
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Sep 29, 2022
ee06958
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 5, 2022
6e90b45
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 6, 2022
ce37f33
Merge branch 'main' into getdummies-default-dtype
kianelbo Oct 7, 2022
7cef2fc
Move the changelog entry to v1.6.0.rst
kianelbo Oct 7, 2022
9285bf1
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 10, 2022
d7e6490
Merge branch 'main' into getdummies-default-dtype
kianelbo Oct 11, 2022
8a93cc9
Move whatsnew entry to 'Other API changes'
kianelbo Oct 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,7 @@ Other Deprecations
- Deprecated setting a categorical's categories with ``cat.categories = ['a', 'b', 'c']``, use :meth:`Categorical.rename_categories` instead (:issue:`37643`)
- Deprecated unused arguments ``encoding`` and ``verbose`` in :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` (:issue:`47912`)
- Deprecated producing a single element when iterating over a :class:`DataFrameGroupBy` or a :class:`SeriesGroupBy` that has been grouped by a list of length 1; A tuple of length one will be returned instead (:issue:`42795`)
- Emit ``FutureWarning`` from :func:`get_dummies` when ``dtype`` is unspecified, indicating that its default value will be changed to ``bool`` (:issue:`45848`)
kianelbo marked this conversation as resolved.
Show resolved Hide resolved

.. ---------------------------------------------------------------------------
.. _whatsnew_150.performance:
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from collections import defaultdict
import itertools
from typing import Hashable
import warnings

import numpy as np

from pandas._libs.sparse import IntIndex
from pandas._typing import Dtype
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_integer_dtype,
Expand Down Expand Up @@ -228,6 +230,12 @@ def _get_dummies_1d(
codes, levels = factorize_from_iterable(Series(data))

if dtype is None:
warnings.warn(
"In a future version of pandas the default dtype will change from "
"'uint8' to 'bool', please specify a dtype to silence this warning",
FutureWarning,
stacklevel=find_stack_level(),
)
dtype = np.dtype(np.uint8)
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"; expected "Type[Any]"
Expand Down
88 changes: 54 additions & 34 deletions pandas/tests/reshape/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ def test_get_dummies_raises_on_dtype_object(self, df):
with pytest.raises(ValueError, match=msg):
get_dummies(df, dtype="object")

def test_get_dummies_warns_default_dtype(self, df):
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved
# https://github.com/pandas-dev/pandas/issues/45848
msg = "The default dtype will change from 'uint8' to 'bool'"
with tm.assert_produces_warning(FutureWarning, match=msg):
get_dummies(df)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry to go back on the approval, but can we check the return value here?


def test_get_dummies_basic(self, sparse, dtype):
s_list = list("abc")
s_series = Series(s_list)
Expand Down Expand Up @@ -121,9 +127,11 @@ def test_get_dummies_just_na(self, sparse):
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index=["A"])

res_list = get_dummies(just_na_list, sparse=sparse)
res_series = get_dummies(just_na_series, sparse=sparse)
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
res_list = get_dummies(just_na_list, dtype=np.uint8, sparse=sparse)
res_series = get_dummies(just_na_series, dtype=np.uint8, sparse=sparse)
res_series_index = get_dummies(
just_na_series_index, dtype=np.uint8, sparse=sparse
)

assert res_list.empty
assert res_series.empty
Expand Down Expand Up @@ -169,7 +177,7 @@ def test_get_dummies_unicode(self, sparse):
e = "e"
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
res = get_dummies(s, dtype=np.uint8, prefix="letter", sparse=sparse)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't we rather just catch the warnings for these? I'm wondering how we will remember to go back and update these tests once the default dtype is actually changed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could do, I just thought that would be a lot of warnings to catch

Regarding updating tests - I wouldn't have thought they needed updating; I'd have thought that just having a test which calls get_dummies() (without specifying dtype) would be enough.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree. That's why I was thinking it is better to catch the warning for now and not change the argument. Otherwise, in the future we lose test coverage of the default argument's behavior unless someone comes back and reverts what was changed here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, all tests should be fixed now,
and then you have an explicit test of the warning.

It's not better to defer fixing something like this.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK good points, thanks for raising

I've added this to the agenda for the next dev meeting

@kianelbo let's hold off on further changes until after there has been a discussion

exp = DataFrame(
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
)
Expand All @@ -179,7 +187,7 @@ def test_get_dummies_unicode(self, sparse):

def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=np.uint8,
Expand All @@ -200,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df):
# GH44965
df = df[["A", "B"]]
df = df.astype({"A": "object", "B": "string"})
result = get_dummies(df)
result = get_dummies(df, dtype=np.uint8)
expected = DataFrame(
{
"A_a": [1, 0, 1],
Expand Down Expand Up @@ -234,7 +242,7 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):

def test_dataframe_dummies_prefix_list(self, df, sparse):
prefixes = ["from_A", "from_B"]
result = get_dummies(df, prefix=prefixes, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
Expand All @@ -255,7 +263,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):

def test_dataframe_dummies_prefix_str(self, df, sparse):
# not that you should do this...
result = get_dummies(df, prefix="bad", sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
Expand All @@ -280,7 +288,9 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_subset(self, df, sparse):
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, prefix=["from_A"], columns=["A"], sparse=sparse
)
expected = DataFrame(
{
"B": ["b", "b", "c"],
Expand All @@ -298,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
Expand All @@ -317,11 +327,13 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):

tm.assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix_sep=["..", "__"], sparse=sparse)
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
tm.assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, prefix_sep={"A": "..", "B": "__"}, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
Expand All @@ -330,20 +342,20 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix=["too few"], sparse=sparse)
get_dummies(df, dtype=np.uint8, prefix=["too few"], sparse=sparse)

def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
msg = re.escape(
"Length of 'prefix_sep' (1) did not match the length of the columns being "
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix_sep=["bad"], sparse=sparse)
get_dummies(df, dtype=np.uint8, prefix_sep=["bad"], sparse=sparse)

def test_dataframe_dummies_prefix_dict(self, sparse):
prefixes = {"A": "from_A", "B": "from_B"}
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
result = get_dummies(df, prefix=prefixes, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)

expected = DataFrame(
{
Expand Down Expand Up @@ -453,16 +465,18 @@ def test_get_dummies_basic_drop_first(self, sparse):

expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)

result = get_dummies(s_list, drop_first=True, sparse=sparse)
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

expected.index = list("ABC")
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
result = get_dummies(
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_get_dummies_basic_drop_first_one_level(self, sparse):
Expand All @@ -473,27 +487,31 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse):

expected = DataFrame(index=np.arange(3))

result = get_dummies(s_list, drop_first=True, sparse=sparse)
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

expected = DataFrame(index=list("ABC"))
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
result = get_dummies(
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
res = get_dummies(s_NA, dtype=np.uint8, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)

tm.assert_frame_equal(res, exp)

res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
res_na = get_dummies(
s_NA, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
)
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
["b", np.nan], axis=1
)
Expand All @@ -502,22 +520,22 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
tm.assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies(
[np.nan], dummy_na=True, drop_first=True, sparse=sparse
[np.nan], dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
)
exp_just_na = DataFrame(index=np.arange(1))
tm.assert_frame_equal(res_just_na, exp_just_na)

def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
df["cat"] = Categorical(["x", "y", "y"])
result = get_dummies(df, drop_first=True, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
expected = DataFrame(
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
Expand All @@ -532,7 +550,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(
df, dummy_na=True, drop_first=True, sparse=sparse
df, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
).sort_index(axis=1)
expected = DataFrame(
{
Expand All @@ -552,18 +570,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):

tm.assert_frame_equal(result, expected)

result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, dummy_na=False, drop_first=True, sparse=sparse
)
expected = expected[["C", "A_b", "B_c"]]
tm.assert_frame_equal(result, expected)

def test_get_dummies_int_int(self):
data = Series([1, 2, 1])
result = get_dummies(data)
result = get_dummies(data, dtype=np.uint8)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
tm.assert_frame_equal(result, expected)

data = Series(Categorical(["a", "b", "a"]))
result = get_dummies(data)
result = get_dummies(data, dtype=np.uint8)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
)
Expand Down Expand Up @@ -605,15 +625,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
df = get_dummies(df, columns=["Nation"], sparse=sparse)
df = get_dummies(df, dtype=np.uint8, columns=["Nation"], sparse=sparse)
df2 = df.reindex(columns=["GDP"])

tm.assert_frame_equal(df[["GDP"]], df2)

def test_get_dummies_duplicate_columns(self, df):
# GH20839
df.columns = ["A", "A", "A"]
result = get_dummies(df).sort_index(axis=1)
result = get_dummies(df, dtype=np.uint8).sort_index(axis=1)

expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
Expand All @@ -627,7 +647,7 @@ def test_get_dummies_duplicate_columns(self, df):

def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
result = get_dummies(df, dtype=np.uint8, columns=["A"], sparse=True)
dtype = SparseDtype("uint8", 0)
expected = DataFrame(
{
Expand All @@ -652,4 +672,4 @@ def test_get_dummies_with_string_values(self, values):
msg = "Input must be a list-like for parameter `columns`"

with pytest.raises(TypeError, match=msg):
get_dummies(df, columns=values)
get_dummies(df, dtype=np.uint8, columns=values)