From e89624fcf2b96f5b4c72e1cc03dc12b1e0d791ca Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 17 Jul 2022 13:50:33 +0200 Subject: [PATCH 1/7] REGR: preserve array object for concat with all-NA array --- doc/source/whatsnew/v1.4.4.rst | 1 + pandas/core/internals/concat.py | 6 +- .../extension/array_with_attr/__init__.py | 7 ++ .../tests/extension/array_with_attr/array.py | 88 +++++++++++++++++++ .../array_with_attr/test_array_with_attr.py | 33 +++++++ 5 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/extension/array_with_attr/__init__.py create mode 100644 pandas/tests/extension/array_with_attr/array.py create mode 100644 pandas/tests/extension/array_with_attr/test_array_with_attr.py diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 6ee140f59e096..85f8d00f1ad0a 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) +- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes - .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 77197dac3363b..28f9c815f3945 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -471,7 +471,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if len(values) and values[0] is None: fill_value = None - if is_datetime64tz_dtype(empty_dtype): + if blk_dtype == empty_dtype and self.indexers: + # avoid creating new empty array if we already have an array + # with correct dtype that can be reindexed + pass + elif is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) diff --git a/pandas/tests/extension/array_with_attr/__init__.py b/pandas/tests/extension/array_with_attr/__init__.py new file mode 100644 index 0000000000000..8b00ea32146b0 --- /dev/null +++ b/pandas/tests/extension/array_with_attr/__init__.py @@ -0,0 +1,7 @@ +from pandas.tests.extension.array_with_attr.array import ( + FloatAttrArray, + FloatAttrDtype, + make_data, +) + +__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"] diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py new file mode 100644 index 0000000000000..667b6ee1f736d --- /dev/null +++ b/pandas/tests/extension/array_with_attr/array.py @@ -0,0 +1,88 @@ +""" +Test extension array that has custom attribute information (not stored on the dtype). + +""" +from __future__ import annotations + +import numbers + +import numpy as np + +from pandas._typing import type_t + +from pandas.core.dtypes.base import ExtensionDtype + +import pandas as pd +from pandas.core.arrays import ExtensionArray + + +class FloatAttrDtype(ExtensionDtype): + type = int + name = "int_attr" + na_value = np.nan + + @classmethod + def construct_array_type(cls) -> type_t[FloatAttrArray]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatAttrArray + + +class FloatAttrArray(ExtensionArray): + dtype = FloatAttrDtype() + __array_priority__ = 1000 + + def __init__(self, values, attr=None) -> None: + if not isinstance(values, np.ndarray): + raise TypeError("Need to pass a numpy array of float64 dtype as values") + if not values.dtype == "float64": + raise TypeError("Need to pass a numpy array of float64 dtype as values") + self.data = values + self.attr = attr + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + data = np.array(scalars, dtype="float64", copy=copy) + return cls(data) + + def __getitem__(self, item): + if isinstance(item, numbers.Integral): + return self.data[item] + else: + # slice, list-like, mask + item = pd.api.indexers.check_array_indexer(self, item) + return type(self)(self.data[item], self.attr) + + def __len__(self) -> int: + return len(self.data) + + def isna(self): + return np.isnan(self.data) + + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self.data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) + return type(self)(result, self.attr) + + def copy(self): + return type(self)(self.data.copy(), self.attr) + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x.data for x in to_concat]) + attr = to_concat[0].attr if len(to_concat) else None + return cls(data, attr) + + +def make_data(): + return np.arange(100, dtype="float64") diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py new file mode 100644 index 0000000000000..aea5dd94a333e --- /dev/null +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -0,0 +1,33 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.array_with_attr import ( + FloatAttrArray, + FloatAttrDtype, + make_data, +) + + +@pytest.fixture +def dtype(): + return FloatAttrDtype() + + +@pytest.fixture +def data(): + return FloatAttrArray(make_data()) + + +def test_concat_with_all_na(data): + # https://github.com/pandas-dev/pandas/issues/28840 + # + arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test") + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]}) + + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" From 7bc3b6406d983be946f55a5a0efecf6b99b4ce93 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 17 Jul 2022 14:07:56 +0200 Subject: [PATCH 2/7] update PR number --- doc/source/whatsnew/v1.4.4.rst | 2 +- .../array_with_attr/test_array_with_attr.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.4.4.rst b/doc/source/whatsnew/v1.4.4.rst index 85f8d00f1ad0a..58026ee842ff7 100644 --- a/doc/source/whatsnew/v1.4.4.rst +++ b/doc/source/whatsnew/v1.4.4.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`) -- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes +- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py index aea5dd94a333e..af4c30000e75f 100644 --- a/pandas/tests/extension/array_with_attr/test_array_with_attr.py +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -21,13 +21,21 @@ def data(): def test_concat_with_all_na(data): - # https://github.com/pandas-dev/pandas/issues/28840 - # + # https://github.com/pandas-dev/pandas/pull/47762 + # ensure that attribute of the column array is preserved (when it gets + # preserved in reindexing the array) during merge arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test") + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]}) - result = pd.merge(df1, df2, on="key") expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]}) tm.assert_frame_equal(result, expected) assert result["col"].array.attr == "test" + + df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) + df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]}) + result = pd.merge(df1, df2, on="key") + expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]}) + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" From c42f54e58b90655c24f914f90828afdd220519da Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 23 Aug 2022 21:10:58 +0200 Subject: [PATCH 3/7] additional test --- .../extension/array_with_attr/test_array_with_attr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py index af4c30000e75f..202709e0e4b5f 100644 --- a/pandas/tests/extension/array_with_attr/test_array_with_attr.py +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -23,7 +23,7 @@ def data(): def test_concat_with_all_na(data): # https://github.com/pandas-dev/pandas/pull/47762 # ensure that attribute of the column array is preserved (when it gets - # preserved in reindexing the array) during merge + # preserved in reindexing the array) during merge/concat arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test") df1 = pd.DataFrame({"col": arr, "key": [0, 1]}) @@ -39,3 +39,10 @@ def test_concat_with_all_na(data): expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]}) tm.assert_frame_equal(result, expected) assert result["col"].array.attr == "test" + + result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1) + expected = pd.DataFrame( + {"col": arr.take([0, 0, 1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]} + ).set_index("key") + tm.assert_frame_equal(result, expected) + assert result["col"].array.attr == "test" From d76aa6f6ac8926c5718c0bd1b8ef561b085d0caf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Aug 2022 00:13:52 +0200 Subject: [PATCH 4/7] use is_dtype_equal for compat with old numpy --- pandas/core/internals/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 28f9c815f3945..eca5dfa7ce39b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -471,7 +471,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if len(values) and values[0] is None: fill_value = None - if blk_dtype == empty_dtype and self.indexers: + if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers: # avoid creating new empty array if we already have an array # with correct dtype that can be reindexed pass From 2e0e0a81ce370a4e7cbb1c18fe7906bea1bc4045 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Aug 2022 12:32:02 +0200 Subject: [PATCH 5/7] EA specific, simplify tests --- pandas/core/internals/concat.py | 29 ++++++++++--------- .../extension/array_with_attr/__init__.py | 1 - .../tests/extension/array_with_attr/array.py | 8 ++--- .../array_with_attr/test_array_with_attr.py | 19 ++---------- 4 files changed, 19 insertions(+), 38 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index eca5dfa7ce39b..68dce4b30da81 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -471,25 +471,26 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if len(values) and values[0] is None: fill_value = None - if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers: - # avoid creating new empty array if we already have an array - # with correct dtype that can be reindexed - pass elif is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_1d_only_ea_dtype(empty_dtype): - empty_dtype = cast(ExtensionDtype, empty_dtype) - cls = empty_dtype.construct_array_type() - - missing_arr = cls._from_sequence([], dtype=empty_dtype) - ncols, nrows = self.shape - assert ncols == 1, ncols - empty_arr = -1 * np.ones((nrows,), dtype=np.intp) - return missing_arr.take( - empty_arr, allow_fill=True, fill_value=fill_value - ) + if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers: + # avoid creating new empty array if we already have an array + # with correct dtype that can be reindexed + pass + else: + empty_dtype = cast(ExtensionDtype, empty_dtype) + cls = empty_dtype.construct_array_type() + + missing_arr = cls._from_sequence([], dtype=empty_dtype) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) elif isinstance(empty_dtype, ExtensionDtype): # TODO: no tests get here, a handful would if we disabled # the dt64tz special-case above (which is faster) diff --git a/pandas/tests/extension/array_with_attr/__init__.py b/pandas/tests/extension/array_with_attr/__init__.py index 8b00ea32146b0..7e404c91072de 100644 --- a/pandas/tests/extension/array_with_attr/__init__.py +++ b/pandas/tests/extension/array_with_attr/__init__.py @@ -1,7 +1,6 @@ from pandas.tests.extension.array_with_attr.array import ( FloatAttrArray, FloatAttrDtype, - make_data, ) __all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"] diff --git a/pandas/tests/extension/array_with_attr/array.py b/pandas/tests/extension/array_with_attr/array.py index 667b6ee1f736d..d9327ca9f2f3f 100644 --- a/pandas/tests/extension/array_with_attr/array.py +++ b/pandas/tests/extension/array_with_attr/array.py @@ -17,8 +17,8 @@ class FloatAttrDtype(ExtensionDtype): - type = int - name = "int_attr" + type = float + name = "float_attr" na_value = np.nan @classmethod @@ -82,7 +82,3 @@ def _concat_same_type(cls, to_concat): data = np.concatenate([x.data for x in to_concat]) attr = to_concat[0].attr if len(to_concat) else None return cls(data, attr) - - -def make_data(): - return np.arange(100, dtype="float64") diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py index 202709e0e4b5f..b6a85a7d8d242 100644 --- a/pandas/tests/extension/array_with_attr/test_array_with_attr.py +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -1,26 +1,11 @@ import numpy as np -import pytest import pandas as pd import pandas._testing as tm -from pandas.tests.extension.array_with_attr import ( - FloatAttrArray, - FloatAttrDtype, - make_data, -) +from pandas.tests.extension.array_with_attr import FloatAttrArray -@pytest.fixture -def dtype(): - return FloatAttrDtype() - - -@pytest.fixture -def data(): - return FloatAttrArray(make_data()) - - -def test_concat_with_all_na(data): +def test_concat_with_all_na(): # https://github.com/pandas-dev/pandas/pull/47762 # ensure that attribute of the column array is preserved (when it gets # preserved in reindexing the array) during merge/concat From 12d848098f761efd49acedae6dd1317f8464fc4d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Aug 2022 12:34:31 +0200 Subject: [PATCH 6/7] fixup --- pandas/tests/extension/array_with_attr/__init__.py | 2 +- pandas/tests/extension/array_with_attr/test_array_with_attr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/array_with_attr/__init__.py b/pandas/tests/extension/array_with_attr/__init__.py index 7e404c91072de..49da6af024a31 100644 --- a/pandas/tests/extension/array_with_attr/__init__.py +++ b/pandas/tests/extension/array_with_attr/__init__.py @@ -3,4 +3,4 @@ FloatAttrDtype, ) -__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"] +__all__ = ["FloatAttrArray", "FloatAttrDtype"] diff --git a/pandas/tests/extension/array_with_attr/test_array_with_attr.py b/pandas/tests/extension/array_with_attr/test_array_with_attr.py index b6a85a7d8d242..3735fe40a0d67 100644 --- a/pandas/tests/extension/array_with_attr/test_array_with_attr.py +++ b/pandas/tests/extension/array_with_attr/test_array_with_attr.py @@ -27,7 +27,7 @@ def test_concat_with_all_na(): result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1) expected = pd.DataFrame( - {"col": arr.take([0, 0, 1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]} + {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]} ).set_index("key") tm.assert_frame_equal(result, expected) assert result["col"].array.attr == "test" From 83b1ea94d4b21ed108b1e4a644accfeb42af6a06 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 25 Aug 2022 12:55:59 +0200 Subject: [PATCH 7/7] fixup --- pandas/core/internals/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 68dce4b30da81..0df8aa5a055b0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -471,7 +471,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if len(values) and values[0] is None: fill_value = None - elif is_datetime64tz_dtype(empty_dtype): + if is_datetime64tz_dtype(empty_dtype): i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype)