Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: change get_dummies default dtype to bool #48022

Merged
merged 21 commits into from
Oct 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1eb5cd6
ENH: Warn when dtype is not passed to get_dummies
kianelbo Aug 10, 2022
efa678b
Edit get_dummies' dtype warning
kianelbo Aug 10, 2022
472fa28
Add whatsnew entry for issue #45848
kianelbo Aug 10, 2022
2ead750
Fix dtype warning test
kianelbo Aug 10, 2022
ddcc7d3
Suppress warnings in docs
kianelbo Aug 10, 2022
81dbb87
Edit whatsnew entry
kianelbo Aug 10, 2022
45d9c79
Merge branch 'main' into 'getdummies-default-dtype'
kianelbo Aug 23, 2022
f97df66
Fix find_stack_level in get_dummies dtype warning
kianelbo Aug 23, 2022
707a222
Merge branch 'main' into getdummies-default-dtype
kianelbo Sep 21, 2022
15aeb3e
Change the default dtype of get_dummies to bool
kianelbo Sep 22, 2022
a5f709d
Merge branch 'main' into 'getdummies-default-dtype'
kianelbo Sep 23, 2022
a246b8c
Revert dtype(bool) change
kianelbo Sep 25, 2022
7d72067
Merge branch 'main' again
kianelbo Sep 27, 2022
940bd11
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Sep 29, 2022
ee06958
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 5, 2022
6e90b45
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 6, 2022
ce37f33
Merge branch 'main' into getdummies-default-dtype
kianelbo Oct 7, 2022
7cef2fc
Move the changelog entry to v1.6.0.rst
kianelbo Oct 7, 2022
9285bf1
Merge branch 'main' into getdummies-default-dtype
MarcoGorelli Oct 10, 2022
d7e6490
Merge branch 'main' into getdummies-default-dtype
kianelbo Oct 11, 2022
8a93cc9
Move whatsnew entry to 'Other API changes'
kianelbo Oct 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ Other API changes
^^^^^^^^^^^^^^^^^
- Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
- The default value of ``dtype`` in :func:`get_dummies` has changed from ``uint8`` to ``bool`` (:issue:`45848`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
-
Expand Down
60 changes: 30 additions & 30 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def get_dummies(
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
dtype : dtype, default np.uint8
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.

Returns
Expand All @@ -89,50 +89,50 @@ def get_dummies(
>>> s = pd.Series(list('abca'))

>>> pd.get_dummies(s)
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
a b c
0 True False False
1 False True False
2 False False True
3 True False False

>>> s1 = ['a', 'b', np.nan]

>>> pd.get_dummies(s1)
a b
0 1 0
1 0 1
2 0 0
a b
0 True False
1 False True
2 False False

>>> pd.get_dummies(s1, dummy_na=True)
a b NaN
0 1 0 0
1 0 1 0
2 0 0 1
a b NaN
0 True False False
1 False True False
2 False False True

>>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
... 'C': [1, 2, 3]})

>>> pd.get_dummies(df, prefix=['col1', 'col2'])
C col1_a col1_b col2_a col2_b col2_c
0 1 1 0 0 1 0
1 2 0 1 1 0 0
2 3 1 0 0 0 1
0 1 True False False True False
1 2 False True True False False
2 3 True False False False True

>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0
a b c
0 True False False
1 False True False
2 False False True
3 True False False
4 True False False

>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0
b c
0 False False
1 True False
2 False True
3 False False
4 False False

>>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
a b c
Expand Down Expand Up @@ -236,7 +236,7 @@ def _get_dummies_1d(
codes, levels = factorize_from_iterable(Series(data))

if dtype is None:
dtype = np.dtype(np.uint8)
dtype = np.dtype(bool)
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"; expected "Type[Any]"
dtype = np.dtype(dtype) # type: ignore[arg-type]
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/frame/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,7 @@ def test_getitem_list_of_labels_categoricalindex_cols(self):
# GH#16115
cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])

expected = DataFrame(
[[1, 0], [0, 1]], dtype="uint8", index=[0, 1], columns=cats
)
expected = DataFrame([[1, 0], [0, 1]], dtype="bool", index=[0, 1], columns=cats)
dummies = get_dummies(cats)
result = dummies[list(dummies.columns)]
tm.assert_frame_equal(result, expected)
Expand Down
90 changes: 47 additions & 43 deletions pandas/tests/reshape/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_get_dummies_unicode(self, sparse):
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
exp = DataFrame(
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)
Expand All @@ -182,15 +182,15 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
result = get_dummies(df, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=np.uint8,
dtype=bool,
)
if sparse:
expected = DataFrame(
{
"A_a": SparseArray([1, 0, 1], dtype="uint8"),
"A_b": SparseArray([0, 1, 0], dtype="uint8"),
"B_b": SparseArray([1, 1, 0], dtype="uint8"),
"B_c": SparseArray([0, 0, 1], dtype="uint8"),
"A_a": SparseArray([1, 0, 1], dtype="bool"),
"A_b": SparseArray([0, 1, 0], dtype="bool"),
"B_b": SparseArray([1, 1, 0], dtype="bool"),
"B_c": SparseArray([0, 0, 1], dtype="bool"),
}
)

Expand All @@ -208,7 +208,7 @@ def test_dataframe_dummies_string_dtype(self, df):
"B_b": [1, 1, 0],
"B_c": [0, 0, 1],
},
dtype=np.uint8,
dtype=bool,
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -238,12 +238,11 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):
expected = DataFrame(
{
"C": [1, 2, 3],
"from_A_a": [1, 0, 1],
"from_A_b": [0, 1, 0],
"from_B_b": [1, 1, 0],
"from_B_c": [0, 0, 1],
"from_A_a": [True, False, True],
"from_A_b": [False, True, False],
"from_B_b": [True, True, False],
"from_B_c": [False, False, True],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
Expand All @@ -258,9 +257,12 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
result = get_dummies(df, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["C"] + bad_columns,
dtype=np.uint8,
)
expected = expected.astype({"C": np.int64})
if sparse:
Expand All @@ -269,10 +271,10 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
expected = pd.concat(
[
Series([1, 2, 3], name="C"),
Series([1, 0, 1], name="bad_a", dtype="Sparse[uint8]"),
Series([0, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
Series([1, 1, 0], name="bad_b", dtype="Sparse[uint8]"),
Series([0, 0, 1], name="bad_c", dtype="Sparse[uint8]"),
Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
],
axis=1,
)
Expand All @@ -290,30 +292,29 @@ def test_dataframe_dummies_subset(self, df, sparse):
},
)
cols = expected.columns
expected[cols[1:]] = expected[cols[1:]].astype(np.uint8)
expected[cols[1:]] = expected[cols[1:]].astype(bool)
expected[["C"]] = df[["C"]]
if sparse:
cols = ["from_A_a", "from_A_b"]
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
"A..a": [1, 0, 1],
"A..b": [0, 1, 0],
"B..b": [1, 1, 0],
"B..c": [0, 0, 1],
"A..a": [True, False, True],
"A..b": [False, True, False],
"B..b": [True, True, False],
"B..c": [False, False, True],
},
dtype=np.uint8,
)
expected[["C"]] = df[["C"]]
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
if sparse:
cols = ["A..a", "A..b", "B..b", "B..c"]
expected[cols] = expected[cols].astype(SparseDtype("uint8", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -356,9 +357,9 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
)

columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected[columns] = expected[columns].astype(np.uint8)
expected[columns] = expected[columns].astype(bool)
if sparse:
expected[columns] = expected[columns].astype(SparseDtype("uint8", 0))
expected[columns] = expected[columns].astype(SparseDtype("bool", 0))

tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -422,19 +423,19 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
[
(
{"data": DataFrame({"ä": ["a"]})},
DataFrame({"ä_a": [1]}, dtype=np.uint8),
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["ä"]})},
DataFrame({"x_ä": [1]}, dtype=np.uint8),
DataFrame({"x_ä": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
DataFrame({"ä_a": [1]}, dtype=np.uint8),
DataFrame({"ä_a": [True]}),
),
(
{"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
DataFrame({"xäa": [1]}, dtype=np.uint8),
DataFrame({"xäa": [True]}),
),
],
)
Expand All @@ -451,7 +452,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
s_series = Series(s_list)
s_series_index = Series(s_list, list("ABC"))

expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)
expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)

result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
Expand Down Expand Up @@ -487,14 +488,14 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)

tm.assert_frame_equal(res, exp)

res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
["b", np.nan], axis=1
)
if sparse:
Expand All @@ -510,7 +511,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)
Expand All @@ -522,7 +523,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
cols = ["A_b", "B_c", "cat_y"]
expected[cols] = expected[cols].astype(np.uint8)
expected[cols] = expected[cols].astype(bool)
expected = expected[["C", "A_b", "B_c", "cat_y"]]
if sparse:
for col in cols:
Expand All @@ -544,7 +545,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
}
)
cols = ["A_b", "A_nan", "B_c", "B_nan"]
expected[cols] = expected[cols].astype(np.uint8)
expected[cols] = expected[cols].astype(bool)
expected = expected.sort_index(axis=1)
if sparse:
for col in cols:
Expand All @@ -559,13 +560,13 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
def test_get_dummies_int_int(self):
data = Series([1, 2, 1])
result = get_dummies(data)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
tm.assert_frame_equal(result, expected)

data = Series(Categorical(["a", "b", "a"]))
result = get_dummies(data)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -616,9 +617,12 @@ def test_get_dummies_duplicate_columns(self, df):
result = get_dummies(df).sort_index(axis=1)

expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
[
[1, True, False, True, False],
[2, False, True, True, False],
[3, True, False, False, True],
],
columns=["A", "A_a", "A_b", "A_b", "A_c"],
dtype=np.uint8,
).sort_index(axis=1)

expected = expected.astype({"A": np.int64})
Expand All @@ -628,7 +632,7 @@ def test_get_dummies_duplicate_columns(self, df):
def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
dtype = SparseDtype("uint8", 0)
dtype = SparseDtype("bool", 0)
expected = DataFrame(
{
"A_1": SparseArray([1, 0], dtype=dtype),
Expand Down