
REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array #47762

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
@@ -17,6 +17,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.fillna` not working :class:`DataFrame` with :class:`MultiIndex` (:issue:`47649`)
- Fixed regression in taking NULL :class:`objects` from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`)
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`)
- Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`)
- Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`)
- Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`)
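As a concrete illustration of the regression that whatsnew entry describes, the following minimal sketch mirrors the new test added in this PR (the import path assumes a pandas source checkout, since FloatAttrArray is a test-only fixture added below):

import numpy as np
import pandas as pd
from pandas.tests.extension.array_with_attr import FloatAttrArray

# An all-NaN extension array that carries a custom attribute not stored on its dtype.
arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})

# Before this fix, merge rebuilt the all-NaN column from a brand-new empty array,
# so the custom attribute was dropped; with the fix the existing array is reindexed.
result = pd.merge(df1, df2, on="key")
assert result["col"].array.attr == "test"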
25 changes: 15 additions & 10 deletions pandas/core/internals/concat.py
@@ -476,16 +476,21 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
                     return DatetimeArray(i8values, dtype=empty_dtype)
 
                 elif is_1d_only_ea_dtype(empty_dtype):
-                    empty_dtype = cast(ExtensionDtype, empty_dtype)
-                    cls = empty_dtype.construct_array_type()
-
-                    missing_arr = cls._from_sequence([], dtype=empty_dtype)
-                    ncols, nrows = self.shape
-                    assert ncols == 1, ncols
-                    empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
-                    return missing_arr.take(
-                        empty_arr, allow_fill=True, fill_value=fill_value
-                    )
+                    if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
+                        # avoid creating new empty array if we already have an array
+                        # with correct dtype that can be reindexed
+                        pass
+                    else:
+                        empty_dtype = cast(ExtensionDtype, empty_dtype)
+                        cls = empty_dtype.construct_array_type()
+
+                        missing_arr = cls._from_sequence([], dtype=empty_dtype)
+                        ncols, nrows = self.shape
+                        assert ncols == 1, ncols
+                        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
+                        return missing_arr.take(
+                            empty_arr, allow_fill=True, fill_value=fill_value
+                        )
                 elif isinstance(empty_dtype, ExtensionDtype):
                     # TODO: no tests get here, a handful would if we disabled
                     # the dt64tz special-case above (which is faster)
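The change above is confined to the 1D extension-array branch of get_reindexed_values: when the block already holds an array of the target dtype and there are indexers, the early return is skipped so the existing block values are reindexed further down in the method instead of being rebuilt from an empty array. A rough sketch of why that matters, using the FloatAttrArray fixture added in this PR (illustrative only, not code from the diff):

import numpy as np
from pandas.tests.extension.array_with_attr import FloatAttrArray

arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

# Old path: build a fresh empty array and fill it with NA. The custom attribute
# is lost because _from_sequence constructs the array without attr.
fresh = FloatAttrArray._from_sequence([], dtype=arr.dtype)
filled = fresh.take(np.array([-1, -1], dtype=np.intp), allow_fill=True, fill_value=np.nan)
assert filled.attr is None

# New path: reindex the existing all-NA array. Its take() forwards attr.
reindexed = arr.take(np.array([0, 1, -1], dtype=np.intp), allow_fill=True)
assert reindexed.attr == "test"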
6 changes: 6 additions & 0 deletions pandas/tests/extension/array_with_attr/__init__.py
@@ -0,0 +1,6 @@
from pandas.tests.extension.array_with_attr.array import (
    FloatAttrArray,
    FloatAttrDtype,
)

__all__ = ["FloatAttrArray", "FloatAttrDtype"]
84 changes: 84 additions & 0 deletions pandas/tests/extension/array_with_attr/array.py
@@ -0,0 +1,84 @@
"""
Test extension array that has custom attribute information (not stored on the dtype).

"""
from __future__ import annotations

import numbers

import numpy as np

from pandas._typing import type_t

from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.core.arrays import ExtensionArray


class FloatAttrDtype(ExtensionDtype):
    type = float
    name = "float_attr"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatAttrArray


class FloatAttrArray(ExtensionArray):
    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        if not isinstance(values, np.ndarray):
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        if not values.dtype == "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        data = np.array(scalars, dtype="float64", copy=copy)
        return cls(data)

    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            return self.data[item]
        else:
            # slice, list-like, mask
            item = pd.api.indexers.check_array_indexer(self, item)
            return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        from pandas.api.extensions import take

        data = self.data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(result, self.attr)

    def copy(self):
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        data = np.concatenate([x.data for x in to_concat])
        attr = to_concat[0].attr if len(to_concat) else None
        return cls(data, attr)
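The fixture forwards attr through __getitem__, take, copy and _concat_same_type, which are the operations exercised by the concat/merge code paths above. A quick usage sketch (again assuming a pandas source checkout for the import path):

import numpy as np
from pandas.tests.extension.array_with_attr import FloatAttrArray

arr = FloatAttrArray(np.array([1.0, 2.0, np.nan], dtype="float64"), attr="test")
assert arr[1:].attr == "test"  # slicing forwards the attribute
assert arr.take([0, 2]).attr == "test"  # so does take()
assert FloatAttrArray._concat_same_type([arr, arr]).attr == "test"  # and concatenation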
33 changes: 33 additions & 0 deletions pandas/tests/extension/array_with_attr/test_array_with_attr.py
@@ -0,0 +1,33 @@
import numpy as np

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import FloatAttrArray


def test_concat_with_all_na():
    # https://github.com/pandas-dev/pandas/pull/47762
    # ensure that attribute of the column array is preserved (when it gets
    # preserved in reindexing the array) during merge/concat
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
    expected = pd.DataFrame(
        {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
    ).set_index("key")
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"