PERF: pd.concat with EA-backed indexes (pandas-dev#49128)

noatamir · Nov 9, 2022 · 6fc50ba · 6fc50ba
1 parent e39a324
commit 6fc50ba
Show file tree

Hide file tree

Showing 7 changed files with 67 additions and 1 deletion.
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -71,3 +71,6 @@ def time_setitem_list(self, multiple_chunks):
 
     def time_setitem_slice(self, multiple_chunks):
         self.array[::10] = "foo"
+
+    def time_tolist(self, multiple_chunks):
+        self.array.tolist()
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -4,6 +4,7 @@
 
 from pandas import (
     DataFrame,
+    Index,
     MultiIndex,
     Series,
     array,
@@ -92,6 +93,39 @@ def time_f_ordered(self, axis, ignore_index):
         concat(self.frame_f, axis=axis, ignore_index=ignore_index)
 
 
+class ConcatIndexDtype:
+
+    params = (
+        ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"],
+        [0, 1],
+        [True, False],
+        [True, False],
+    )
+    param_names = ["dtype", "axis", "sort", "is_monotonic"]
+
+    def setup(self, dtype, axis, sort, is_monotonic):
+        N = 10_000
+        if dtype == "datetime64[ns]":
+            vals = date_range("1970-01-01", periods=N)
+        elif dtype in ("int64", "Int64"):
+            vals = np.arange(N, dtype=np.int64)
+        elif dtype in ("string[python]", "string[pyarrow]"):
+            vals = tm.makeStringIndex(N)
+        else:
+            raise NotImplementedError
+
+        idx = Index(vals, dtype=dtype)
+        if is_monotonic:
+            idx = idx.sort_values()
+        else:
+            idx = idx[::-1]
+
+        self.series = [Series(i, idx[i:]) for i in range(5)]
+
+    def time_concat_series(self, dtype, axis, sort, is_monotonic):
+        concat(self.series, axis=axis, sort=sort)
+
+
 class Join:
 
     params = [True, False]

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -156,6 +156,7 @@ Performance improvements
 - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
+- Performance improvement for :func:`concat` with extension-array-backed indexes (:issue:`49128`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).

diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
@@ -427,6 +427,15 @@ def to_numpy(
             data = self._data.astype(dtype, copy=copy)
         return data
 
+    @doc(ExtensionArray.tolist)
+    def tolist(self):
+        if self.ndim > 1:
+            return [x.tolist() for x in self]
+        if not self._hasna:
+            # no NA values to substitute, so list the underlying data directly; faster than list(self)
+            return list(self._data)
+        return list(self)
+
     @overload
     def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
         ...

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -20,6 +20,7 @@
 )
 from pandas.compat import pa_version_under6p0
 from pandas.compat.numpy import function as nv
+from pandas.util._decorators import doc
 
 from pandas.core.dtypes.base import (
     ExtensionDtype,
@@ -214,7 +215,11 @@ class BaseStringArray(ExtensionArray):
     Mixin class for StringArray, ArrowStringArray.
     """
 
-    pass
+    @doc(ExtensionArray.tolist)
+    def tolist(self):
+        if self.ndim > 1:
+            return [x.tolist() for x in self]
+        return list(self.to_numpy())
 
 
 class StringArray(BaseStringArray, PandasArray):

diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py
@@ -49,3 +49,9 @@ def test_round(data, numpy_dtype):
         dtype=data.dtype,
     )
     tm.assert_extension_array_equal(result, expected)
+
+
+def test_tolist(data):
+    result = data.tolist()
+    expected = list(data)
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -595,3 +595,11 @@ def test_setitem_scalar_with_mask_validation(dtype):
         msg = "Scalar must be NA or str"
     with pytest.raises(ValueError, match=msg):
         ser[mask] = 1
+
+
+def test_tolist(dtype):
+    vals = ["a", "b", "c"]
+    arr = pd.array(vals, dtype=dtype)
+    result = arr.tolist()
+    expected = vals
+    tm.assert_equal(result, expected)