diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 9cc63b01693f1..1ddf2b9bb63f8 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1020,6 +1020,7 @@ I/O - Bug in :meth:`DataFrame.to_string` ignoring float formatter for extension arrays (:issue:`39336`) - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`) - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`) +- Bug in :meth:`DataFrame.to_dict` not converting ``NA`` to ``None`` (:issue:`50795`) - Bug in :meth:`DataFrame.to_json` where it would segfault when failing to encode a string (:issue:`50307`) Period diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 97361fb88bc70..3be89f6da2bd8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,6 +20,10 @@ import numpy as np from pandas._libs import lib +from pandas._libs.missing import ( + NA, + NAType, +) from pandas._libs.tslibs import ( NaT, OutOfBoundsDatetime, @@ -176,7 +180,7 @@ def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar: return value -def maybe_box_native(value: Scalar) -> Scalar: +def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType: """ If passed a scalar cast the scalar to a python native type. @@ -202,6 +206,8 @@ def maybe_box_native(value: Scalar) -> Scalar: value = bool(value) elif isinstance(value, (np.datetime64, np.timedelta64)): value = maybe_box_datetimelike(value) + elif value is NA: + value = None return value diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 19f4e5c23785b..d187ed10c5798 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -6,7 +6,10 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.cast import maybe_box_native -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_object_dtype, +) from pandas import DataFrame from pandas.core import common as com @@ -88,16 +91,18 @@ def to_dict( # GH46470 Return quickly if orient series to avoid creating dtype objects return into_c((k, v) for k, v in df.items()) - object_dtype_indices = [ - i for i, col_dtype in enumerate(df.dtypes.values) if is_object_dtype(col_dtype) + box_native_indices = [ + i + for i, col_dtype in enumerate(df.dtypes.values) + if is_object_dtype(col_dtype) or is_extension_array_dtype(col_dtype) ] - are_all_object_dtype_cols = len(object_dtype_indices) == len(df.dtypes) + are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) if orient == "dict": return into_c((k, v.to_dict(into)) for k, v in df.items()) elif orient == "list": - object_dtype_indices_as_set = set(object_dtype_indices) + object_dtype_indices_as_set = set(box_native_indices) return into_c( ( k, @@ -110,7 +115,7 @@ def to_dict( elif orient == "split": data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, object_dtype_indices + are_all_object_dtype_cols, box_native_indices ) return into_c( @@ -123,7 +128,7 @@ def to_dict( elif orient == "tight": data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, object_dtype_indices + are_all_object_dtype_cols, box_native_indices ) return into_c( @@ -155,8 +160,8 @@ def to_dict( data = [ into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None) ] - if object_dtype_indices: - object_dtype_indices_as_set = set(object_dtype_indices) + if box_native_indices: + object_dtype_indices_as_set = set(box_native_indices) object_dtype_cols = { col for i, col in enumerate(df.columns) @@ -176,8 +181,8 @@ def to_dict( (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:])))) for t in df.itertuples(name=None) ) - elif object_dtype_indices: - object_dtype_indices_as_set = set(object_dtype_indices) + elif box_native_indices: + object_dtype_indices_as_set = set(box_native_indices) is_object_dtype_by_index = [ i in object_dtype_indices_as_set for i in range(len(df.columns)) ] diff --git a/pandas/core/series.py b/pandas/core/series.py index 992b86a532433..c79a35f4ebf8e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -89,6 +89,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_dict_like, + is_extension_array_dtype, is_integer, is_iterator, is_list_like, @@ -1832,7 +1833,7 @@ def to_dict(self, into: type[dict] = dict) -> dict: # GH16122 into_c = com.standardize_mapping(into) - if is_object_dtype(self): + if is_object_dtype(self) or is_extension_array_dtype(self): return into_c((k, maybe_box_native(v)) for k, v in self.items()) else: # Not an object dtype => all types will be the same so let the default diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index c76699cafd481..d08323ec01ae8 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -9,6 +9,7 @@ import pytz from pandas import ( + NA, DataFrame, Index, MultiIndex, @@ -458,3 +459,29 @@ def test_to_dict_index_false(self, orient, expected): df = DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["row1", "row2"]) result = df.to_dict(orient=orient, index=False) tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( + "orient, expected", + [ + ("dict", {"a": {0: 1, 1: None}}), + ("list", {"a": [1, None]}), + ("split", {"index": [0, 1], "columns": ["a"], "data": [[1], [None]]}), + ( + "tight", + { + "index": [0, 1], + "columns": ["a"], + "data": [[1], [None]], + "index_names": [None], + "column_names": [None], + }, + ), + ("records", [{"a": 1}, {"a": None}]), + ("index", {0: {"a": 1}, 1: {"a": None}}), + ], + ) + def test_to_dict_na_to_none(self, orient, expected): + # GH#50795 + df = DataFrame({"a": [1, NA]}, dtype="Int64") + result = df.to_dict(orient=orient) + assert result == expected