
Improve the speed of from_dataframe with a MultiIndex (by 40x!) #4184

Merged
merged 13 commits on Jul 2, 2020
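
Summary: Dataset.from_dataframe (and hence pandas' Series.to_xarray / DataFrame.to_xarray, which go through it) previously filled in missing values by reindexing the DataFrame onto the full product of the MultiIndex levels. This PR instead allocates the output array once and scatters values with NumPy advanced indexing over the MultiIndex codes. A minimal sketch of the conversion being sped up (timings vary by machine; names here are illustrative):

import numpy as np
import pandas as pd
import xarray as xr

# A MultiIndexed frame like the ones this PR targets.
index = pd.MultiIndex.from_product([list("ab"), range(1000)], names=["x", "y"])
df = pd.DataFrame({"z": np.random.rand(2000)}, index=index)

ds = xr.Dataset.from_dataframe(df)  # the code path optimized here
print(ds["z"].shape)  # (2, 1000)
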
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/pandas.py
@@ -0,0 +1,24 @@
import numpy as np
import pandas as pd

from . import parameterized


class MultiIndexSeries:
    def setup(self, dtype, subset):
        data = np.random.rand(100000).astype(dtype)
        index = pd.MultiIndex.from_product(
            [
                list("abcdefhijk"),
                list("abcdefhijk"),
                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
            ]
        )
        series = pd.Series(data, index)
        if subset:
            series = series[::3]
        self.series = series

    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
    def time_to_xarray(self, dtype, subset):
        self.series.to_xarray()
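
For a rough standalone timing outside of asv (exact numbers depend on machine and library versions), the benchmark above is equivalent to something like:

import timeit

import numpy as np
import pandas as pd

data = np.random.rand(100000)
index = pd.MultiIndex.from_product(
    [
        list("abcdefhijk"),
        list("abcdefhijk"),
        pd.date_range(start="2000-01-01", periods=1000, freq="B"),
    ]
)
series = pd.Series(data, index)
print(timeit.timeit(series.to_xarray, number=3))  # requires xarray installed
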
10 changes: 7 additions & 3 deletions doc/whats-new.rst
@@ -47,7 +47,10 @@ Enhancements
  For orthogonal linear- and nearest-neighbor interpolation, we do 1d-interpolation sequentially
  rather than interpolating in multidimensional space. (:issue:`2223`)
  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
- Major performance improvement for :py:meth:`Dataset.from_dataframe` when the
  dataframe has a MultiIndex (:pull:`4184`).
  By `Stephan Hoyer <https://github.com/shoyer>`_.
- :py:meth:`DataArray.reset_index` and :py:meth:`Dataset.reset_index` now keep
  coordinate attributes (:pull:`4103`). By `Oriol Abril <https://github.com/OriolAbril>`_.

New Features
@@ -124,8 +127,9 @@ Bug fixes
  By `Deepak Cherian <https://github.com/dcherian>`_.
- ``ValueError`` is raised when ``fill_value`` is not a scalar in :py:meth:`full_like`. (:issue:`3977`)
  By `Huite Bootsma <https://github.com/huite>`_.
- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``. (:issue:`3951`)
  By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Fix wrong order in converting a ``pd.Series`` with a MultiIndex to ``DataArray``.
  (:issue:`3951`, :issue:`4186`)
  By `Keisuke Fujii <https://github.com/fujiisoup>`_ and `Stephan Hoyer <https://github.com/shoyer>`_.
- Fix renaming of coords when one or more stacked coords is not in
  sorted order during stack+groupby+apply operations. (:issue:`3287`,
  :pull:`3906`) By `Spencer Hill <https://github.com/spencerahill>`_
67 changes: 46 additions & 21 deletions xarray/core/dataset.py
@@ -4543,11 +4543,10 @@ def to_dataframe(self):
        return self._to_dataframe(self.dims)

    def _set_sparse_data_from_dataframe(
        self, dataframe: pd.DataFrame, dims: tuple
        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
    ) -> None:
        from sparse import COO

        idx = dataframe.index
        if isinstance(idx, pd.MultiIndex):
            coords = np.stack([np.asarray(code) for code in idx.codes], axis=0)
            is_sorted = idx.is_lexsorted()
@@ -4557,11 +4556,7 @@ def _set_sparse_data_from_dataframe(
            is_sorted = True
            shape = (idx.size,)

        for name, series in dataframe.items():
            # Cast to a NumPy array first, in case the Series is a pandas
            # Extension array (which doesn't have a valid NumPy dtype)
            values = np.asarray(series)

        for name, values in arrays:
            # In virtually all real use cases, the sparse array will now have
            # missing values and needs a fill_value. For consistency, don't
            # special case the rare exceptions (e.g., dtype=int without a
@@ -4580,18 +4575,36 @@ def _set_sparse_data_from_dataframe(
            self[name] = (dims, data)
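
For reference, the sparse path now amounts to using the MultiIndex codes directly as COO coordinates. A minimal sketch, assuming the optional sparse package is installed (names are illustrative, not the method's exact internals):

import numpy as np
import pandas as pd
from sparse import COO

index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]])
subset = index[:4]  # simulate a frame with missing rows
values = np.arange(4.0)

# Codes of each level become the COO coordinates; the absent cells are
# represented implicitly via fill_value.
coords = np.stack([np.asarray(code) for code in subset.codes], axis=0)
shape = tuple(lev.size for lev in subset.levels)
data = COO(coords, values, shape=shape, fill_value=np.nan)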

    def _set_numpy_data_from_dataframe(
        self, dataframe: pd.DataFrame, dims: tuple
        self, idx: pd.Index, arrays: List[Tuple[Hashable, np.ndarray]], dims: tuple
    ) -> None:
        idx = dataframe.index
        if isinstance(idx, pd.MultiIndex):
            # expand the DataFrame to include the product of all levels
            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
            dataframe = dataframe.reindex(full_idx)
            shape = tuple(lev.size for lev in idx.levels)
        else:
            shape = (idx.size,)
        for name, series in dataframe.items():
            data = np.asarray(series).reshape(shape)
        if not isinstance(idx, pd.MultiIndex):
            for name, values in arrays:
                self[name] = (dims, values)
            return

        shape = tuple(lev.size for lev in idx.levels)
        indexer = tuple(idx.codes)

        # We already verified that the MultiIndex has all unique values, so
        # there are missing values if and only if the size of output arrays is
        # larger than the index.
        missing_values = np.prod(shape) > idx.shape[0]

        for name, values in arrays:
            # NumPy indexing is much faster than using DataFrame.reindex() to
            # fill in missing values:
            # https://stackoverflow.com/a/35049899/809705
            if missing_values:
                dtype, fill_value = dtypes.maybe_promote(values.dtype)
                data = np.full(shape, fill_value, dtype)
            else:
                # If there are no missing values, keep the existing dtype
                # instead of promoting to support NA, e.g., keep integer
                # columns as integers.
                # TODO: consider removing this special case, which doesn't
                # exist for sparse=True.
                data = np.zeros(shape, values.dtype)
            data[indexer] = values
            self[name] = (dims, data)
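
The dense fast path boils down to a single allocation plus one vectorized scatter through advanced indexing, rather than a DataFrame.reindex() round trip. A minimal sketch (illustrative, not the exact method body):

import numpy as np
import pandas as pd

index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]])
subset = index[:4]  # simulate a frame with missing rows
values = np.arange(4.0)

shape = tuple(lev.size for lev in subset.levels)  # (2, 3)
data = np.full(shape, np.nan)  # NaN marks the missing cells
data[tuple(subset.codes)] = values  # one advanced-indexing assignment
# data is now [[0., 1., 2.], [3., nan, nan]]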

    @classmethod
@@ -4631,7 +4644,19 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
        if not dataframe.columns.is_unique:
            raise ValueError("cannot convert DataFrame with non-unique columns")

        idx, dataframe = remove_unused_levels_categories(dataframe.index, dataframe)
        idx = remove_unused_levels_categories(dataframe.index)

        if isinstance(idx, pd.MultiIndex) and not idx.is_unique:
            raise ValueError(
                "cannot convert a DataFrame with a non-unique MultiIndex into xarray"
            )

        # Cast to a NumPy array first, in case the Series is a pandas Extension
        # array (which doesn't have a valid NumPy dtype)
        # TODO: allow users to control how this casting happens, e.g., by
        # forwarding arguments to pandas.Series.to_numpy?
        arrays = [(k, np.asarray(v)) for k, v in dataframe.items()]

        obj = cls()

        if isinstance(idx, pd.MultiIndex):
@@ -4647,9 +4672,9 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
            obj[index_name] = (dims, idx)

        if sparse:
            obj._set_sparse_data_from_dataframe(dataframe, dims)
            obj._set_sparse_data_from_dataframe(idx, arrays, dims)
        else:
            obj._set_numpy_data_from_dataframe(dataframe, dims)
            obj._set_numpy_data_from_dataframe(idx, arrays, dims)
        return obj

    def to_dask_dataframe(self, dim_order=None, set_index=False):
13 changes: 7 additions & 6 deletions xarray/core/indexes.py
@@ -9,7 +9,7 @@
from .variable import Variable


def remove_unused_levels_categories(index, dataframe=None):
def remove_unused_levels_categories(index: pd.Index) -> pd.Index:
    """
    Remove unused levels from MultiIndex and unused categories from CategoricalIndex
    """
@@ -25,14 +25,15 @@ def remove_unused_levels_categories(index, dataframe=None):
            else:
                level = level[index.codes[i]]
            levels.append(level)
        # TODO: calling from_arrays() reorders MultiIndex levels. It would
        # be best to avoid this, if possible, e.g., by using
        # MultiIndex.remove_unused_levels() (which does not reorder) on the
        # part of the MultiIndex that is not categorical, or by fixing this
        # upstream in pandas.
        index = pd.MultiIndex.from_arrays(levels, names=index.names)
    elif isinstance(index, pd.CategoricalIndex):
        index = index.remove_unused_categories()

    if dataframe is None:
        return index
    dataframe = dataframe.set_index(index)
    return dataframe.index, dataframe
    return index
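
To illustrate the TODO above, a small sketch of the reordering (assuming current pandas semantics, where from_arrays() rebuilds levels via Categorical and therefore sorts them):

import pandas as pd

idx = pd.MultiIndex(levels=[["b", "a"]], codes=[[0, 1]], names=["lev"])
print(idx.levels[0])  # Index(['b', 'a'], ...): stored order preserved
roundtrip = pd.MultiIndex.from_arrays([idx.get_level_values(0)])
print(roundtrip.levels[0])  # Index(['a', 'b'], ...): levels reordered
print(idx.remove_unused_levels().levels[0])  # Index(['b', 'a'], ...): preserved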


class Indexes(collections.abc.Mapping):
43 changes: 43 additions & 0 deletions xarray/tests/test_dataset.py
@@ -4013,6 +4013,49 @@ def test_to_and_from_empty_dataframe(self):
        assert len(actual) == 0
        assert expected.equals(actual)

    def test_from_dataframe_multiindex(self):
        index = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]], names=["x", "y"])
        df = pd.DataFrame({"z": np.arange(6)}, index=index)

        expected = Dataset(
            {"z": (("x", "y"), [[0, 1, 2], [3, 4, 5]])},
            coords={"x": ["a", "b"], "y": [1, 2, 3]},
        )
        actual = Dataset.from_dataframe(df)
        assert_identical(actual, expected)

        df2 = df.iloc[[3, 2, 1, 0, 4, 5], :]
        actual = Dataset.from_dataframe(df2)
        assert_identical(actual, expected)

        df3 = df.iloc[:4, :]
        expected3 = Dataset(
            {"z": (("x", "y"), [[0, 1, 2], [3, np.nan, np.nan]])},
            coords={"x": ["a", "b"], "y": [1, 2, 3]},
        )
        actual = Dataset.from_dataframe(df3)
        assert_identical(actual, expected3)

        df_nonunique = df.iloc[[0, 0], :]
        with raises_regex(ValueError, "non-unique MultiIndex"):
            Dataset.from_dataframe(df_nonunique)

    def test_from_dataframe_unsorted_levels(self):
        # regression test for GH-4186
        index = pd.MultiIndex(
            levels=[["b", "a"], ["foo"]], codes=[[0, 1], [0, 0]], names=["lev1", "lev2"]
        )
        df = pd.DataFrame({"c1": [0, 2], "c2": [1, 3]}, index=index)
        expected = Dataset(
            {
                "c1": (("lev1", "lev2"), [[0], [2]]),
                "c2": (("lev1", "lev2"), [[1], [3]]),
            },
            coords={"lev1": ["b", "a"], "lev2": ["foo"]},
        )
        actual = Dataset.from_dataframe(df)
        assert_identical(actual, expected)

    def test_from_dataframe_non_unique_columns(self):
        # regression test for GH449
        df = pd.DataFrame(np.zeros((2, 2)))