Skip to content

Commit

Permalink
BUG: DataFrame constructor defaulting to float dtype on empty input
Browse files (browse the repository at this point in the history)
  • Loading branch information
rhshadrach committed May 12, 2024
1 parent 34177d6 commit be2c977
Show file tree
Hide file tree
Showing 21 changed files with 66 additions and 48 deletions.
4 changes: 2 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,8 +652,8 @@ def sanitize_array(
data = list(data)

if len(data) == 0 and dtype is None:
# We default to float64, matching numpy
subarr = np.array([], dtype=np.float64)
# We default to object, diverging from NumPy
subarr = np.array([], dtype=np.object_)

elif dtype is not None:
subarr = _try_cast(data, dtype, copy)
Expand Down
14 changes: 6 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -13059,16 +13059,14 @@ def quantile(
interpolation=interpolation,
method=method,
)
if method == "single":
res = res_df.iloc[0]
else:
# cannot directly iloc over sparse arrays
res = res_df.T.iloc[:, 0]
res = res_df.iloc[0]
if axis == 1 and len(self) == 0:
# GH#41544 try to get an appropriate dtype
dtype = find_common_type(list(self.dtypes))
if needs_i8_conversion(dtype):
return res.astype(dtype)
dtype = "float64"
cdtype = find_common_type(list(self.dtypes))
if needs_i8_conversion(cdtype):
dtype = cdtype
return res.astype(dtype)
return res

q = Index(q, dtype=np.float64)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def _transform_general(
concatenated = concat(results, ignore_index=True)
result = self._set_result_index_ordered(concatenated)
else:
result = self.obj._constructor(dtype=np.float64)
result = self.obj._constructor(dtype=self.obj.dtype)

result.name = self.obj.name
return result
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1778,7 +1778,7 @@ def as_array(
passed_nan = lib.is_float(na_value) and isna(na_value)

if len(self.blocks) == 0:
arr = np.empty(self.shape, dtype=float)
arr = np.empty(self.shape, dtype=object)
return arr.transpose()

if self.is_single_block:
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories):
"na_value, dtype",
[
(pd.NaT, "datetime64[ns]"),
(None, "float64"),
(None, "object"),
(np.nan, "float64"),
(pd.NA, "float64"),
(pd.NA, "object"),
],
)
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request):
def test_empty(self, interp_method):
interpolation, method = interp_method
q = DataFrame({"x": [], "y": []}).quantile(
0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
0.1, axis=0, interpolation=interpolation, method=method
)
assert np.isnan(q["x"]) and np.isnan(q["y"])

Expand Down Expand Up @@ -319,8 +319,11 @@ def test_quantile_multi_empty(self, interp_method):
result = DataFrame({"x": [], "y": []}).quantile(
[0.1, 0.9], axis=0, interpolation=interpolation, method=method
)
dtype = "float64" if method == "single" else "object"
expected = DataFrame(
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
{"x": [np.nan, np.nan], "y": [np.nan, np.nan]},
index=[0.1, 0.9],
dtype=dtype,
)
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self):
df1["d"] = []
result = df1.reset_index()
expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
{"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_}
)
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self):
# check DataFrame/Series api consistency when calling min/max on an empty
# DataFrame/Series.
df = DataFrame({"x": []})
expected_float_series = Series([], dtype=float)
expected_float_series = Series([], dtype=object)
# check axis 0
assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
Expand Down
15 changes: 10 additions & 5 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1418,11 +1418,12 @@ def test_stack_timezone_aware_values(future_stack):
def test_stack_empty_frame(dropna, future_stack):
# GH 36113
levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []]))
if future_stack and dropna is not lib.no_default:
with pytest.raises(ValueError, match="dropna must be unspecified"):
DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
else:
# dtype=np.float64 is lost since there are no columns
result = DataFrame(dtype=np.float64).stack(
dropna=dropna, future_stack=future_stack
)
Expand Down Expand Up @@ -1612,7 +1613,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
(
[[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
["ix1", "ix2", "col1", "col2", "col3"],
None,
# Nones are used as floats in the presence of numeric data,
# resulting in np.nan for index level 1.
np.nan,
[None, None, 30.0],
),
],
Expand All @@ -1624,10 +1627,12 @@ def test_unstack_partial(
# https://github.com/pandas-dev/pandas/issues/19351
# make sure DataFrame.unstack() works when its run on a subset of the DataFrame
# and the Index levels contain values that are not present in the subset
result = DataFrame(result_rows, columns=result_columns).set_index(
["ix1", "ix2"]
data = (
DataFrame(result_rows, columns=result_columns)
.set_index(["ix1", "ix2"])
.iloc[1:2]
)
result = result.iloc[1:2].unstack("ix2")
result = data.unstack("ix2")
expected = DataFrame(
[expected_row],
columns=MultiIndex.from_product(
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def test_quantile_missing_group_values_no_segfaults():
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
([0], [42], [0], [42.0]),
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
([], np.array([], dtype="float64"), [], np.array([], dtype="float64")),
],
)
def test_quantile_missing_group_values_correct_results(
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1479,9 +1479,7 @@ def test_empty_df(method, op):
group = getattr(gb, "b")

result = getattr(group, method)(op)
expected = Series(
[], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
)
expected = Series([], name="b", index=Index([], name="a"))

tm.assert_series_equal(result, expected)

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,10 +1116,10 @@ def convert_force_pure(x):
def test_groupby_dtype_inference_empty():
# GH 6733
df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
assert df["x"].dtype == np.float64
assert df["x"].dtype == np.object_

result = df.groupby("x").first()
exp_index = Index([], name="x", dtype=np.float64)
exp_index = Index([], name="x", dtype=np.object_)
expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
tm.assert_frame_equal(result, expected, by_blocks=True)

Expand Down
10 changes: 3 additions & 7 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,19 +739,15 @@ def test_list_grouper_with_nat(self):
[
(
"transform",
Series(name=2, dtype=np.float64),
Series(name=2),
),
(
"agg",
Series(
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
),
Series(name=2, index=Index([], name=1)),
),
(
"apply",
Series(
name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
),
Series(name=2, index=Index([], name=1)),
),
],
)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self):
expected = DataFrame(
columns=Index(["foo"], dtype=object), index=Index([], dtype="int64")
)
expected["foo"] = expected["foo"].astype("float64")

df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = []
Expand All @@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self):

df = DataFrame(index=Index([], dtype="int64"))
df["foo"] = Series(np.arange(len(df)), dtype="float64")
expected = DataFrame(
columns=Index(["foo"], dtype=object),
index=Index([], dtype="int64"),
dtype="float64",
)

tm.assert_frame_equal(df, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2009,7 +2009,7 @@ def test_resample_empty_series_with_tz():
expected_idx = DatetimeIndex(
[], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
)
expected = Series([], index=expected_idx, name="values", dtype="float64")
expected = Series([], index=expected_idx, name="values")
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression():
# GH 18178 regression test
df1 = DataFrame({"foo": [1]})
df2 = DataFrame({"foo": []})
expected = DataFrame({"foo": [1.0]})
expected = DataFrame({"foo": [1]}, dtype="object")
result = concat([df1, df2])
tm.assert_frame_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reshape/concat/test_empty.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values):
expected = DataFrame(
{
0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
1: values,
1: Series(values, dtype=dtype),
}
)
result = concat([first, second], axis=1)
Expand Down
20 changes: 17 additions & 3 deletions pandas/tests/reshape/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,7 +924,14 @@ def test_invalid_separator(self):
"A": [],
"B": [],
}
expected = DataFrame(exp_data).astype({"year": np.int64})
expected = DataFrame(exp_data).astype(
{
"A2010": np.float64,
"A2011": np.float64,
"B2010": np.float64,
"year": np.int64,
}
)
expected = expected.set_index(["id", "year"])[
["X", "A2010", "A2011", "B2010", "A", "B"]
]
Expand Down Expand Up @@ -987,7 +994,14 @@ def test_invalid_suffixtype(self):
"A": [],
"B": [],
}
expected = DataFrame(exp_data).astype({"year": np.int64})
expected = DataFrame(exp_data).astype(
{
"Aone": np.float64,
"Atwo": np.float64,
"Bone": np.float64,
"year": np.int64,
}
)

expected = expected.set_index(["id", "year"])
expected.index = expected.index.set_levels([0, 1], level=0)
Expand Down Expand Up @@ -1211,7 +1225,7 @@ def test_missing_stubname(self, dtype):
name=("id", "num"),
)
expected = DataFrame(
{"a": [100, 200, 300, 400], "b": [np.nan] * 4},
{"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")},
index=index,
)
new_level = expected.index.levels[0].astype(dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1416,7 +1416,7 @@ def test_constructor_dict_tuple_indexer(self):
data = {(1, 1, None): -1.0}
result = Series(data)
expected = Series(
-1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
-1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]])
)
tm.assert_series_equal(result, expected)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ def test_groupby_rolling_empty_frame(self):
# GH-38057 from_tuples gives empty object dtype, so build the levels explicitly to keep the int64 level
# expected.index = MultiIndex.from_tuples([], names=["s1", None])
expected.index = MultiIndex.from_product(
[Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
[Index([]), Index([], dtype="int64")], names=["s1", None]
)
tm.assert_frame_equal(result, expected)

Expand All @@ -559,8 +559,8 @@ def test_groupby_rolling_empty_frame(self):
expected = expected.drop(columns=["s1", "s2"])
expected.index = MultiIndex.from_product(
[
Index([], dtype="float64"),
Index([], dtype="float64"),
Index([]),
Index([]),
Index([], dtype="int64"),
],
names=["s1", "s2", None],
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/window/test_timeseries_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ def test_rolling_on_empty(self):
# GH-32385
df = DataFrame({"column": []}, index=[])
result = df.rolling("5s").min()
expected = DataFrame({"column": []}, index=[])
expected = DataFrame({"column": []}, index=[], dtype="float64")
tm.assert_frame_equal(result, expected)

def test_rolling_on_multi_index_level(self):
Expand Down

0 comments on commit be2c977

Please sign in to comment.