Skip to content

Commit

Permalink
ENH: Warn when dtype is not passed to get_dummies
Browse files Browse the repository at this point in the history
  • Loading branch information
kianelbo committed Aug 10, 2022
1 parent 6db95e7 commit ed5136a
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 34 deletions.
8 changes: 8 additions & 0 deletions pandas/core/reshape/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict
import itertools
from typing import Hashable
import warnings

import numpy as np

Expand All @@ -20,6 +21,7 @@
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import Index
from pandas.core.series import Series
from pandas.util._exceptions import find_stack_level


def get_dummies(
Expand Down Expand Up @@ -228,6 +230,12 @@ def _get_dummies_1d(
codes, levels = factorize_from_iterable(Series(data))

if dtype is None:
warnings.warn(
"The default dtype will change from 'uint8' to 'bool', "
"please specify a dtype to silence this warning",
FutureWarning,
stacklevel=find_stack_level(),
)
dtype = np.dtype(np.uint8)
# error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
# dtype[Any], Type[object]]"; expected "Type[Any]"
Expand Down
87 changes: 53 additions & 34 deletions pandas/tests/reshape/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ def test_get_dummies_raises_on_dtype_object(self, df):
with pytest.raises(ValueError, match=msg):
get_dummies(df, dtype="object")

def test_get_dummies_warns_default_dtype(self, df):
    """get_dummies emits a FutureWarning when called without ``dtype``."""
    # Match only the leading part of the message so the trailing advice
    # ("please specify a dtype ...") can be reworded without breaking this test.
    msg = "The default dtype will change from 'uint8' to 'bool'"
    # Use the pandas testing helper rather than pytest.warns: it also checks
    # that the warning's stacklevel points at this calling file.
    with tm.assert_produces_warning(FutureWarning, match=msg):
        get_dummies(df)

def test_get_dummies_basic(self, sparse, dtype):
s_list = list("abc")
s_series = Series(s_list)
Expand Down Expand Up @@ -121,9 +126,11 @@ def test_get_dummies_just_na(self, sparse):
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index=["A"])

res_list = get_dummies(just_na_list, sparse=sparse)
res_series = get_dummies(just_na_series, sparse=sparse)
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
res_list = get_dummies(just_na_list, dtype=np.uint8, sparse=sparse)
res_series = get_dummies(just_na_series, dtype=np.uint8, sparse=sparse)
res_series_index = get_dummies(
just_na_series_index, dtype=np.uint8, sparse=sparse
)

assert res_list.empty
assert res_series.empty
Expand Down Expand Up @@ -169,7 +176,7 @@ def test_get_dummies_unicode(self, sparse):
e = "e"
eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
res = get_dummies(s, dtype=np.uint8, prefix="letter", sparse=sparse)
exp = DataFrame(
{"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
)
Expand All @@ -179,7 +186,7 @@ def test_get_dummies_unicode(self, sparse):

def test_dataframe_dummies_all_obj(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, sparse=sparse)
expected = DataFrame(
{"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
dtype=np.uint8,
Expand All @@ -200,7 +207,7 @@ def test_dataframe_dummies_string_dtype(self, df):
# GH44965
df = df[["A", "B"]]
df = df.astype({"A": "object", "B": "string"})
result = get_dummies(df)
result = get_dummies(df, dtype=np.uint8)
expected = DataFrame(
{
"A_a": [1, 0, 1],
Expand Down Expand Up @@ -234,7 +241,7 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):

def test_dataframe_dummies_prefix_list(self, df, sparse):
prefixes = ["from_A", "from_B"]
result = get_dummies(df, prefix=prefixes, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
Expand All @@ -255,7 +262,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse):

def test_dataframe_dummies_prefix_str(self, df, sparse):
# not that you should do this...
result = get_dummies(df, prefix="bad", sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix="bad", sparse=sparse)
bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
Expand All @@ -280,7 +287,9 @@ def test_dataframe_dummies_prefix_str(self, df, sparse):
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_subset(self, df, sparse):
result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, prefix=["from_A"], columns=["A"], sparse=sparse
)
expected = DataFrame(
{
"B": ["b", "b", "c"],
Expand All @@ -298,7 +307,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self, df, sparse):
result = get_dummies(df, prefix_sep="..", sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix_sep="..", sparse=sparse)
expected = DataFrame(
{
"C": [1, 2, 3],
Expand All @@ -317,11 +326,13 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):

tm.assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix_sep=["..", "__"], sparse=sparse)
expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
tm.assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, prefix_sep={"A": "..", "B": "__"}, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
Expand All @@ -330,20 +341,20 @@ def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix=["too few"], sparse=sparse)
get_dummies(df, dtype=np.uint8, prefix=["too few"], sparse=sparse)

def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
msg = re.escape(
"Length of 'prefix_sep' (1) did not match the length of the columns being "
"encoded (2)"
)
with pytest.raises(ValueError, match=msg):
get_dummies(df, prefix_sep=["bad"], sparse=sparse)
get_dummies(df, dtype=np.uint8, prefix_sep=["bad"], sparse=sparse)

def test_dataframe_dummies_prefix_dict(self, sparse):
prefixes = {"A": "from_A", "B": "from_B"}
df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
result = get_dummies(df, prefix=prefixes, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, prefix=prefixes, sparse=sparse)

expected = DataFrame(
{
Expand Down Expand Up @@ -453,16 +464,18 @@ def test_get_dummies_basic_drop_first(self, sparse):

expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)

result = get_dummies(s_list, drop_first=True, sparse=sparse)
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

expected.index = list("ABC")
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
result = get_dummies(
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_get_dummies_basic_drop_first_one_level(self, sparse):
Expand All @@ -473,27 +486,31 @@ def test_get_dummies_basic_drop_first_one_level(self, sparse):

expected = DataFrame(index=np.arange(3))

result = get_dummies(s_list, drop_first=True, sparse=sparse)
result = get_dummies(s_list, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
result = get_dummies(s_series, dtype=np.uint8, drop_first=True, sparse=sparse)
tm.assert_frame_equal(result, expected)

expected = DataFrame(index=list("ABC"))
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
result = get_dummies(
s_series_index, dtype=np.uint8, drop_first=True, sparse=sparse
)
tm.assert_frame_equal(result, expected)

def test_get_dummies_basic_drop_first_NA(self, sparse):
# Test NA handling together with drop_first
s_NA = ["a", "b", np.nan]
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
res = get_dummies(s_NA, dtype=np.uint8, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)

tm.assert_frame_equal(res, exp)

res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
res_na = get_dummies(
s_NA, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
)
exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
["b", np.nan], axis=1
)
Expand All @@ -502,22 +519,22 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
tm.assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies(
[np.nan], dummy_na=True, drop_first=True, sparse=sparse
[np.nan], dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
)
exp_just_na = DataFrame(index=np.arange(1))
tm.assert_frame_equal(res_just_na, exp_just_na)

def test_dataframe_dummies_drop_first(self, df, sparse):
df = df[["A", "B"]]
result = get_dummies(df, drop_first=True, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
df["cat"] = Categorical(["x", "y", "y"])
result = get_dummies(df, drop_first=True, sparse=sparse)
result = get_dummies(df, dtype=np.uint8, drop_first=True, sparse=sparse)
expected = DataFrame(
{"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
)
Expand All @@ -532,7 +549,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(
df, dummy_na=True, drop_first=True, sparse=sparse
df, dtype=np.uint8, dummy_na=True, drop_first=True, sparse=sparse
).sort_index(axis=1)
expected = DataFrame(
{
Expand All @@ -552,18 +569,20 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse):

tm.assert_frame_equal(result, expected)

result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
result = get_dummies(
df, dtype=np.uint8, dummy_na=False, drop_first=True, sparse=sparse
)
expected = expected[["C", "A_b", "B_c"]]
tm.assert_frame_equal(result, expected)

def test_get_dummies_int_int(self):
data = Series([1, 2, 1])
result = get_dummies(data)
result = get_dummies(data, dtype=np.uint8)
expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=np.uint8)
tm.assert_frame_equal(result, expected)

data = Series(Categorical(["a", "b", "a"]))
result = get_dummies(data)
result = get_dummies(data, dtype=np.uint8)
expected = DataFrame(
[[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=np.uint8
)
Expand Down Expand Up @@ -605,15 +624,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
df = get_dummies(df, columns=["Nation"], sparse=sparse)
df = get_dummies(df, dtype=np.uint8, columns=["Nation"], sparse=sparse)
df2 = df.reindex(columns=["GDP"])

tm.assert_frame_equal(df[["GDP"]], df2)

def test_get_dummies_duplicate_columns(self, df):
# GH20839
df.columns = ["A", "A", "A"]
result = get_dummies(df).sort_index(axis=1)
result = get_dummies(df, dtype=np.uint8).sort_index(axis=1)

expected = DataFrame(
[[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]],
Expand All @@ -627,7 +646,7 @@ def test_get_dummies_duplicate_columns(self, df):

def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
result = get_dummies(df, dtype=np.uint8, columns=["A"], sparse=True)
dtype = SparseDtype("uint8", 0)
expected = DataFrame(
{
Expand All @@ -652,4 +671,4 @@ def test_get_dummies_with_string_values(self, values):
msg = "Input must be a list-like for parameter `columns`"

with pytest.raises(TypeError, match=msg):
get_dummies(df, columns=values)
get_dummies(df, dtype=np.uint8, columns=values)

0 comments on commit ed5136a

Please sign in to comment.