Skip to content

Commit

Permalink
PERF: pd.concat with EA-backed indexes (pandas-dev#49128)
Browse files Browse the repository at this point in the history
  • Loading branch information
lukemanley authored and noatamir committed Nov 9, 2022
1 parent e39a324 commit 6fc50ba
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 1 deletion.
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,6 @@ def time_setitem_list(self, multiple_chunks):

def time_setitem_slice(self, multiple_chunks):
    """Benchmark assigning the scalar ``"foo"`` to every 10th element.

    ``multiple_chunks`` is a benchmark parameter that this method does not
    read; presumably it is consumed by the class's ``setup`` (not visible
    here) -- TODO confirm.
    """
    every_tenth = slice(None, None, 10)
    self.array[every_tenth] = "foo"

def time_tolist(self, multiple_chunks):
    """Benchmark converting ``self.array`` to a list via ``tolist()``.

    The result is discarded; only the call is being timed.
    ``multiple_chunks`` is not read by this method.
    """
    target = self.array
    target.tolist()
34 changes: 34 additions & 0 deletions asv_bench/benchmarks/join_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
array,
Expand Down Expand Up @@ -92,6 +93,39 @@ def time_f_ordered(self, axis, ignore_index):
concat(self.frame_f, axis=axis, ignore_index=ignore_index)


class ConcatIndexDtype:
    """Benchmark ``concat`` on Series whose indexes use various dtypes.

    The parameter grid covers the index dtype, the concat ``axis``, the
    ``sort`` flag, and whether the index is sorted or reversed before use.
    """

    params = (
        ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"],
        [0, 1],
        [True, False],
        [True, False],
    )
    param_names = ["dtype", "axis", "sort", "is_monotonic"]

    def setup(self, dtype, axis, sort, is_monotonic):
        """Build five Series sharing progressively shorter slices of one index.

        Raises
        ------
        NotImplementedError
            If ``dtype`` is not one of the handled dtype strings.
        """
        size = 10_000

        # Pick raw values appropriate for the requested index dtype.
        if dtype in ("string[python]", "string[pyarrow]"):
            raw = tm.makeStringIndex(size)
        elif dtype in ("int64", "Int64"):
            raw = np.arange(size, dtype=np.int64)
        elif dtype == "datetime64[ns]":
            raw = date_range("1970-01-01", periods=size)
        else:
            raise NotImplementedError

        index = Index(raw, dtype=dtype)
        # Sorted when ``is_monotonic`` is requested, otherwise reversed.
        index = index.sort_values() if is_monotonic else index[::-1]

        # Series number ``k`` holds the scalar ``k`` and drops the first
        # ``k`` labels, so the five indexes overlap but are not identical.
        self.series = [Series(offset, index[offset:]) for offset in range(5)]

    def time_concat_series(self, dtype, axis, sort, is_monotonic):
        """Time ``concat`` over the prepared Series with ``axis`` and ``sort``."""
        concat(self.series, axis=axis, sort=sort)


class Join:

params = [True, False]
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Performance improvements
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
- Performance improvement for :func:`concat` with extension-array-backed indexes (:issue:`49128`)
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,15 @@ def to_numpy(
data = self._data.astype(dtype, copy=copy)
return data

@doc(ExtensionArray.tolist)
def tolist(self):
    # The docstring is supplied by ``@doc`` from ``ExtensionArray.tolist``,
    # so only comments are used here to avoid appending to it.
    if self.ndim > 1:
        # Delegate to each element produced by iterating ``self``.
        return [x.tolist() for x in self]
    if not self._hasna:
        # No missing values: ``ndarray.tolist`` converts in C and returns
        # built-in Python scalars. ``list(self._data)`` instead produced
        # NumPy scalar objects (e.g. ``np.int64``), which does not match the
        # "Python scalar" contract of ``ExtensionArray.tolist``.
        return self._data.tolist()
    # Missing values present: go through an object-dtype array so valid
    # entries become Python scalars as well. ``to_numpy`` is expected to fill
    # masked positions with its default ``na_value`` (see pandas docs).
    return self.to_numpy(dtype=object).tolist()

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
...
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
)
from pandas.compat import pa_version_under6p0
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
ExtensionDtype,
Expand Down Expand Up @@ -214,7 +215,11 @@ class BaseStringArray(ExtensionArray):
Mixin class for StringArray, ArrowStringArray.
"""

pass
@doc(ExtensionArray.tolist)
def tolist(self):
    # The docstring is supplied by ``@doc`` from ``ExtensionArray.tolist``.
    if self.ndim <= 1:
        # One-dimensional case: materialise through ``to_numpy`` and wrap
        # the result in a plain list.
        return list(self.to_numpy())
    # Otherwise delegate to each element produced by iterating ``self``.
    return [item.tolist() for item in self]


class StringArray(BaseStringArray, PandasArray):
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/arrays/masked/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,9 @@ def test_round(data, numpy_dtype):
dtype=data.dtype,
)
tm.assert_extension_array_equal(result, expected)


def test_tolist(data):
    """``tolist()`` must agree with plain iteration over the array."""
    tm.assert_equal(data.tolist(), list(data))
8 changes: 8 additions & 0 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,3 +595,11 @@ def test_setitem_scalar_with_mask_validation(dtype):
msg = "Scalar must be NA or str"
with pytest.raises(ValueError, match=msg):
ser[mask] = 1


def test_tolist(dtype):
    """``tolist()`` on a string array returns the original list of strings."""
    letters = ["a", "b", "c"]
    string_array = pd.array(letters, dtype=dtype)
    tm.assert_equal(string_array.tolist(), letters)

0 comments on commit 6fc50ba

Please sign in to comment.