diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 0928569641b5b..e153cb30caef3 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -847,6 +847,7 @@ ExtensionArray - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) +- Bug in :meth:`~arrays.ArrowExtensionArray.factorize` returning incorrect uniques for a ``pyarrow.dictionary`` type ``pyarrow.chunked_array`` with more than one chunk (:issue:`54844`) - Bug when passing an :class:`ExtensionArray` subclass to ``dtype`` keywords. This will now raise a ``UserWarning`` to encourage passing an instance instead (:issue:`31356`, :issue:`54592`) - Bug where the :class:`DataFrame` repr would not work when a column had an :class:`ArrowDtype` with a ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept PyArrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 33f9d3639ecb7..e815b8292b0cc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1043,13 +1043,15 @@ def factorize( indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) else: - pa_indices = encoded.combine_chunks().indices + # GH 54844 + combined = encoded.combine_chunks() + pa_indices = combined.indices if pa_indices.null_count > 0: pa_indices = pc.fill_null(pa_indices, -1) indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( np.intp, copy=False ) - uniques = type(self)(encoded.chunk(0).dictionary) + uniques = type(self)(combined.dictionary) if pa_version_under11p0 and pa.types.is_duration(pa_type): uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0ab99d186597e..13d80329f4d51 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3001,3 +3001,16 @@ def test_duration_fillna_numpy(pa_type): result = ser1.fillna(ser2) expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type)) tm.assert_series_equal(result, expected) + + +def test_factorize_chunked_dictionary(): + # GH 54844 + pa_array = pa.chunked_array( + [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()] + ) + ser = pd.Series(ArrowExtensionArray(pa_array)) + res_indices, res_uniques = ser.factorize() + exp_indicies = np.array([0, 1], dtype=np.intp) + exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks())) + tm.assert_numpy_array_equal(res_indices, exp_indicies) + tm.assert_index_equal(res_uniques, exp_uniques)