BUG: DataFrame constructor defaulting to float dtype on empty input

rhshadrach · May 12, 2024 · be2c977
1 parent 34177d6
commit be2c977
Show file tree

Hide file tree

Showing 21 changed files with 66 additions and 48 deletions.
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -652,8 +652,8 @@ def sanitize_array(
         data = list(data)
 
         if len(data) == 0 and dtype is None:
-            # We default to float64, matching numpy
-            subarr = np.array([], dtype=np.float64)
+            # We default to object dtype, diverging from NumPy's float64 default
+            subarr = np.array([], dtype=np.object_)
 
         elif dtype is not None:
             subarr = _try_cast(data, dtype, copy)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -13059,16 +13059,14 @@ def quantile(
                 interpolation=interpolation,
                 method=method,
             )
-            if method == "single":
-                res = res_df.iloc[0]
-            else:
-                # cannot directly iloc over sparse arrays
-                res = res_df.T.iloc[:, 0]
+            res = res_df.iloc[0]
             if axis == 1 and len(self) == 0:
                 # GH#41544 try to get an appropriate dtype
-                dtype = find_common_type(list(self.dtypes))
-                if needs_i8_conversion(dtype):
-                    return res.astype(dtype)
+                dtype = "float64"
+                cdtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(cdtype):
+                    dtype = cdtype
+                return res.astype(dtype)
             return res
 
         q = Index(q, dtype=np.float64)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -578,7 +578,7 @@ def _transform_general(
             concatenated = concat(results, ignore_index=True)
             result = self._set_result_index_ordered(concatenated)
         else:
-            result = self.obj._constructor(dtype=np.float64)
+            result = self.obj._constructor(dtype=self.obj.dtype)
 
         result.name = self.obj.name
         return result

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1778,7 +1778,7 @@ def as_array(
         passed_nan = lib.is_float(na_value) and isna(na_value)
 
         if len(self.blocks) == 0:
-            arr = np.empty(self.shape, dtype=float)
+            arr = np.empty(self.shape, dtype=object)
             return arr.transpose()
 
         if self.is_single_block:

diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py
@@ -122,9 +122,9 @@ def test_compare_categorical_with_missing(self, a1, a2, categories):
         "na_value, dtype",
         [
             (pd.NaT, "datetime64[ns]"),
-            (None, "float64"),
+            (None, "object"),
             (np.nan, "float64"),
-            (pd.NA, "float64"),
+            (pd.NA, "object"),
         ],
     )
     def test_categorical_only_missing_values_no_cast(self, na_value, dtype):

diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -81,7 +81,7 @@ def test_quantile(self, datetime_frame, interp_method, request):
     def test_empty(self, interp_method):
         interpolation, method = interp_method
         q = DataFrame({"x": [], "y": []}).quantile(
-            0.1, axis=0, numeric_only=True, interpolation=interpolation, method=method
+            0.1, axis=0, interpolation=interpolation, method=method
         )
         assert np.isnan(q["x"]) and np.isnan(q["y"])
 
@@ -319,8 +319,11 @@ def test_quantile_multi_empty(self, interp_method):
         result = DataFrame({"x": [], "y": []}).quantile(
             [0.1, 0.9], axis=0, interpolation=interpolation, method=method
         )
+        dtype = "float64" if method == "single" else "object"
         expected = DataFrame(
-            {"x": [np.nan, np.nan], "y": [np.nan, np.nan]}, index=[0.1, 0.9]
+            {"x": [np.nan, np.nan], "y": [np.nan, np.nan]},
+            index=[0.1, 0.9],
+            dtype=dtype,
         )
         tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
@@ -77,7 +77,7 @@ def test_setitem_reset_index_dtypes(self):
         df1["d"] = []
         result = df1.reset_index()
         expected = DataFrame(columns=["a", "b", "c", "d"], index=range(0)).astype(
-            {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.float64}
+            {"a": "datetime64[ns]", "b": np.int64, "c": np.float64, "d": np.object_}
         )
         tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -1627,7 +1627,7 @@ def test_min_max_dt64_api_consistency_empty_df(self):
         # check DataFrame/Series api consistency when calling min/max on an empty
         # DataFrame/Series.
         df = DataFrame({"x": []})
-        expected_float_series = Series([], dtype=float)
+        expected_float_series = Series([], dtype=object)
         # check axis 0
         assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
         assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())

diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1418,11 +1418,12 @@ def test_stack_timezone_aware_values(future_stack):
 def test_stack_empty_frame(dropna, future_stack):
     # GH 36113
     levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
-    expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
+    expected = Series(dtype=np.object_, index=MultiIndex(levels=levels, codes=[[], []]))
     if future_stack and dropna is not lib.no_default:
         with pytest.raises(ValueError, match="dropna must be unspecified"):
             DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
     else:
+        # dtype=np.float64 is lost since there are no columns
         result = DataFrame(dtype=np.float64).stack(
             dropna=dropna, future_stack=future_stack
         )
@@ -1612,7 +1613,9 @@ def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
             (
                 [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
                 ["ix1", "ix2", "col1", "col2", "col3"],
-                None,
+                # Nones are used as floats in the presence of numeric data,
+                # resulting in np.nan for index level 1.
+                np.nan,
                 [None, None, 30.0],
             ),
         ],
@@ -1624,10 +1627,12 @@ def test_unstack_partial(
         # https://github.com/pandas-dev/pandas/issues/19351
         # make sure DataFrame.unstack() works when its run on a subset of the DataFrame
         # and the Index levels contain values that are not present in the subset
-        result = DataFrame(result_rows, columns=result_columns).set_index(
-            ["ix1", "ix2"]
+        data = (
+            DataFrame(result_rows, columns=result_columns)
+            .set_index(["ix1", "ix2"])
+            .iloc[1:2]
         )
-        result = result.iloc[1:2].unstack("ix2")
+        result = data.unstack("ix2")
         expected = DataFrame(
             [expected_row],
             columns=MultiIndex.from_product(

diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
@@ -192,7 +192,7 @@ def test_quantile_missing_group_values_no_segfaults():
         ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
         (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
         ([0], [42], [0], [42.0]),
-        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
+        ([], np.array([], dtype="float64"), [], np.array([], dtype="float64")),
     ],
 )
 def test_quantile_missing_group_values_correct_results(

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1479,9 +1479,7 @@ def test_empty_df(method, op):
     group = getattr(gb, "b")
 
     result = getattr(group, method)(op)
-    expected = Series(
-        [], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
-    )
+    expected = Series([], name="b", index=Index([], name="a"))
 
     tm.assert_series_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1116,10 +1116,10 @@ def convert_force_pure(x):
 def test_groupby_dtype_inference_empty():
     # GH 6733
     df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
-    assert df["x"].dtype == np.float64
+    assert df["x"].dtype == np.object_
 
     result = df.groupby("x").first()
-    exp_index = Index([], name="x", dtype=np.float64)
+    exp_index = Index([], name="x", dtype=np.object_)
     expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
     tm.assert_frame_equal(result, expected, by_blocks=True)
 

diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -739,19 +739,15 @@ def test_list_grouper_with_nat(self):
         [
             (
                 "transform",
-                Series(name=2, dtype=np.float64),
+                Series(name=2),
             ),
             (
                 "agg",
-                Series(
-                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
-                ),
+                Series(name=2, index=Index([], name=1)),
             ),
             (
                 "apply",
-                Series(
-                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
-                ),
+                Series(name=2, index=Index([], name=1)),
             ),
         ],
     )

diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
@@ -119,7 +119,6 @@ def test_partial_set_empty_frame3(self):
         expected = DataFrame(
             columns=Index(["foo"], dtype=object), index=Index([], dtype="int64")
         )
-        expected["foo"] = expected["foo"].astype("float64")
 
         df = DataFrame(index=Index([], dtype="int64"))
         df["foo"] = []
@@ -128,6 +127,11 @@ def test_partial_set_empty_frame3(self):
 
         df = DataFrame(index=Index([], dtype="int64"))
         df["foo"] = Series(np.arange(len(df)), dtype="float64")
+        expected = DataFrame(
+            columns=Index(["foo"], dtype=object),
+            index=Index([], dtype="int64"),
+            dtype="float64",
+        )
 
         tm.assert_frame_equal(df, expected)
 

diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -2009,7 +2009,7 @@ def test_resample_empty_series_with_tz():
     expected_idx = DatetimeIndex(
         [], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
     )
-    expected = Series([], index=expected_idx, name="values", dtype="float64")
+    expected = Series([], index=expected_idx, name="values")
     tm.assert_series_equal(result, expected)
 
 

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -572,7 +572,7 @@ def test_concat_empty_and_non_empty_frame_regression():
     # GH 18178 regression test
     df1 = DataFrame({"foo": [1]})
     df2 = DataFrame({"foo": []})
-    expected = DataFrame({"foo": [1.0]})
+    expected = DataFrame({"foo": [1]}, dtype="object")
     result = concat([df1, df2])
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py
@@ -90,7 +90,7 @@ def test_concat_empty_series_timelike(self, tz, values):
         expected = DataFrame(
             {
                 0: Series([pd.NaT] * len(values), dtype="M8[ns]").dt.tz_localize(tz),
-                1: values,
+                1: Series(values, dtype=dtype),
             }
         )
         result = concat([first, second], axis=1)

diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
@@ -924,7 +924,14 @@ def test_invalid_separator(self):
             "A": [],
             "B": [],
         }
-        expected = DataFrame(exp_data).astype({"year": np.int64})
+        expected = DataFrame(exp_data).astype(
+            {
+                "A2010": np.float64,
+                "A2011": np.float64,
+                "B2010": np.float64,
+                "year": np.int64,
+            }
+        )
         expected = expected.set_index(["id", "year"])[
             ["X", "A2010", "A2011", "B2010", "A", "B"]
         ]
@@ -987,7 +994,14 @@ def test_invalid_suffixtype(self):
             "A": [],
             "B": [],
         }
-        expected = DataFrame(exp_data).astype({"year": np.int64})
+        expected = DataFrame(exp_data).astype(
+            {
+                "Aone": np.float64,
+                "Atwo": np.float64,
+                "Bone": np.float64,
+                "year": np.int64,
+            }
+        )
 
         expected = expected.set_index(["id", "year"])
         expected.index = expected.index.set_levels([0, 1], level=0)
@@ -1211,7 +1225,7 @@ def test_missing_stubname(self, dtype):
             name=("id", "num"),
         )
         expected = DataFrame(
-            {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
+            {"a": [100, 200, 300, 400], "b": pd.Series([np.nan] * 4, dtype="object")},
             index=index,
         )
         new_level = expected.index.levels[0].astype(dtype)

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -1416,7 +1416,7 @@ def test_constructor_dict_tuple_indexer(self):
         data = {(1, 1, None): -1.0}
         result = Series(data)
         expected = Series(
-            -1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
+            -1.0, index=MultiIndex(levels=[[1], [1], []], codes=[[0], [0], [-1]])
         )
         tm.assert_series_equal(result, expected)
 

diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
@@ -549,7 +549,7 @@ def test_groupby_rolling_empty_frame(self):
-        # GH-38057 from_tuples gives empty object dtype, we now get float/int levels
+        # GH-38057 from_tuples gives empty object dtype, we now get object/int levels
         # expected.index = MultiIndex.from_tuples([], names=["s1", None])
         expected.index = MultiIndex.from_product(
-            [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
+            [Index([]), Index([], dtype="int64")], names=["s1", None]
         )
         tm.assert_frame_equal(result, expected)
 
@@ -559,8 +559,8 @@ def test_groupby_rolling_empty_frame(self):
         expected = expected.drop(columns=["s1", "s2"])
         expected.index = MultiIndex.from_product(
             [
-                Index([], dtype="float64"),
-                Index([], dtype="float64"),
+                Index([]),
+                Index([]),
                 Index([], dtype="int64"),
             ],
             names=["s1", "s2", None],

diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
@@ -671,7 +671,7 @@ def test_rolling_on_empty(self):
         # GH-32385
         df = DataFrame({"column": []}, index=[])
         result = df.rolling("5s").min()
-        expected = DataFrame({"column": []}, index=[])
+        expected = DataFrame({"column": []}, index=[], dtype="float64")
         tm.assert_frame_equal(result, expected)
 
     def test_rolling_on_multi_index_level(self):