
Commit

Added Coarsen (#2612)
* Added variable.coarsen

* Added DataArray.coarsen and Dataset.coarsen

* pep8

* a bugfix for mpa3

* Support mean for datetime dtype

* nanmean for DateTime

* API updated via comments

* bug fix in tests

* updated docs

* use pd.isnull rather than isnat

* support Variable in datetime_to_numeric

* use pd.isnull instead of numpy.isnat in test

* Added an example to doc.

* coordinate_func -> coord_func. Support 0d-array mean with datetime

* Added a two-dimensional example

* flake8

* flake8

* a potential bug fix

* Update via comments

* Always use datetime64[ns] in mean

* Added tests for 2d coarsen with value check

* update via comment

* whats new

* typo fix
fujiisoup authored and shoyer committed Jan 6, 2019
1 parent dba299b commit ede3e01
Showing 15 changed files with 507 additions and 33 deletions.
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -147,6 +147,7 @@ Computation
Dataset.groupby
Dataset.groupby_bins
Dataset.rolling
Dataset.coarsen
Dataset.resample
Dataset.diff
Dataset.quantile
@@ -312,6 +313,7 @@ Computation
DataArray.groupby
DataArray.groupby_bins
DataArray.rolling
DataArray.coarsen
DataArray.dt
DataArray.resample
DataArray.get_axis_num
41 changes: 41 additions & 0 deletions doc/computation.rst
@@ -199,6 +199,47 @@ You can also use ``construct`` to compute a weighted rolling sum:
To avoid this, use ``skipna=False``, as in the example above.


.. _comput.coarsen:

Coarsen large arrays
====================

``DataArray`` and ``Dataset`` objects include
:py:meth:`~xarray.DataArray.coarsen` and :py:meth:`~xarray.Dataset.coarsen`
methods. These support block aggregation along multiple dimensions,

.. ipython:: python
x = np.linspace(0, 10, 300)
t = pd.date_range('15/12/1999', periods=364)
da = xr.DataArray(np.sin(x) * np.cos(np.linspace(0, 1, 364)[:, np.newaxis]),
dims=['time', 'x'], coords={'time': t, 'x': x})
da

In order to take a block mean for every 7 days along the ``time`` dimension
and every 2 points along the ``x`` dimension,

.. ipython:: python
da.coarsen(time=7, x=2).mean()

:py:meth:`~xarray.DataArray.coarsen` raises a ``ValueError`` if the data
length is not a multiple of the corresponding window size.
You can choose ``boundary='trim'`` or ``boundary='pad'`` to drop the excess
entries or to pad the last block with ``nan``,

.. ipython:: python
da.coarsen(time=30, x=2, boundary='trim').mean()
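
The ``boundary='pad'`` behaviour can be sketched in plain NumPy (an
illustration only, not xarray's actual implementation; ``block_mean_pad``
is a hypothetical name):

```python
import numpy as np

def block_mean_pad(arr, window):
    # pad the tail with NaN up to a multiple of `window`, then take a
    # NaN-skipping mean over each non-overlapping block
    pad = (-arr.size) % window
    padded = np.concatenate([arr, np.full(pad, np.nan)])
    return np.nanmean(padded.reshape(-1, window), axis=1)

print(block_mean_pad(np.arange(10.0), 3))  # -> [1. 4. 7. 9.]
```

The last block averages only the entries that exist, so no data is dropped.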

If you want to apply a specific function to a coordinate, you can pass the
function or method name to the ``coord_func`` option,

.. ipython:: python
da.coarsen(time=7, x=2, coord_func={'time': 'min'}).mean()
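
A rough NumPy-only sketch of how data and coordinates are reduced separately
(``coarsen_1d`` is a hypothetical helper, not xarray's API):

```python
import numpy as np

def coarsen_1d(values, coords, window, coord_func=np.mean):
    # data blocks are reduced with mean; coordinate blocks are reduced
    # with coord_func (e.g. np.min), mirroring the coord_func option
    n = values.size - values.size % window  # boundary='trim' behaviour
    data = values[:n].reshape(-1, window).mean(axis=1)
    coord = coord_func(coords[:n].reshape(-1, window), axis=1)
    return data, coord

data, coord = coarsen_1d(np.arange(6.0), np.arange(10, 16), 2, coord_func=np.min)
print(data)   # -> [0.5 2.5 4.5]
print(coord)  # -> [10 12 14]
```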
Computation using Coordinates
=============================

9 changes: 7 additions & 2 deletions doc/whats-new.rst
@@ -28,6 +28,11 @@ Breaking changes
Enhancements
~~~~~~~~~~~~

- :py:meth:`~xarray.DataArray.coarsen` and
:py:meth:`~xarray.Dataset.coarsen` are newly added.
See :ref:`comput.coarsen` for details.
(:issue:`2525`)
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
- Upsampling an array via interpolation with resample is now dask-compatible,
as long as the array is not chunked along the resampling dimension.
By `Spencer Clark <https://github.com/spencerkclark>`_.
@@ -76,8 +81,8 @@ Breaking changes
- Minimum rasterio version increased from 0.36 to 1.0 (for ``open_rasterio``)
- Time bounds variables are now also decoded according to CF conventions
(:issue:`2565`). The previous behavior was to decode them only if they
had specific time attributes; now these attributes are copied
automatically from the corresponding time coordinate. This might
break downstream code that relied on these variables not being
decoded.
By `Fabien Maussion <https://github.com/fmaussion>`_.
60 changes: 60 additions & 0 deletions xarray/core/common.py
@@ -590,6 +590,66 @@ def rolling(self, dim=None, min_periods=None, center=False, **dim_kwargs):
return self._rolling_cls(self, dim, min_periods=min_periods,
center=center)

def coarsen(self, dim=None, boundary='exact', side='left',
coord_func='mean', **dim_kwargs):
"""
Coarsen object.
Parameters
----------
dim : dict, optional
Mapping from the dimension name to the window size
(e.g., ``{'time': 7}``).
boundary : 'exact' | 'trim' | 'pad'
If 'exact', a ValueError will be raised if a dimension's size is not a
multiple of the window size. If 'trim', the excess entries are
dropped. If 'pad', NA will be padded.
side : 'left' or 'right' or mapping from dimension to 'left' or 'right'
coord_func : function (name) that is applied to the coordinates,
or a mapping from coordinate name to function (name).
**dim_kwargs : optional
The keyword arguments form of ``dim``.
Returns
-------
Coarsen object (core.rolling.DataArrayCoarsen for DataArray,
core.rolling.DatasetCoarsen for Dataset.)
Examples
--------
Coarsen the long time series by averaging over every three days.
>>> da = xr.DataArray(np.linspace(0, 364, num=364),
... dims='time',
... coords={'time': pd.date_range(
... '15/12/1999', periods=364)})
>>> da
<xarray.DataArray (time: 364)>
array([ 0. , 1.002755, 2.00551 , ..., 361.99449 , 362.997245,
364. ])
Coordinates:
* time (time) datetime64[ns] 1999-12-15 1999-12-16 ... 2000-12-12
>>>
>>> da.coarsen(time=3, boundary='trim').mean()
<xarray.DataArray (time: 121)>
array([ 1.002755, 4.011019, 7.019284, ..., 358.986226,
361.99449 ])
Coordinates:
* time (time) datetime64[ns] 1999-12-16 1999-12-19 ... 2000-12-10
>>>
See Also
--------
core.rolling.DataArrayCoarsen
core.rolling.DatasetCoarsen
"""
dim = either_dict_or_kwargs(dim, dim_kwargs, 'coarsen')
return self._coarsen_cls(
self, dim, boundary=boundary, side=side,
coord_func=coord_func)
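
The ``either_dict_or_kwargs`` helper used here merges the two calling
conventions (``da.coarsen({'time': 7})`` vs ``da.coarsen(time=7)``). A
minimal sketch of such a helper, under the assumption that it rejects
mixing both forms:

```python
def either_dict_or_kwargs(pos_kwargs, kw_kwargs, func_name):
    # accept the mapping either positionally or as keyword arguments
    if pos_kwargs is None:
        return kw_kwargs
    if kw_kwargs:
        raise ValueError('cannot specify both keyword and positional '
                         'arguments to .%s' % func_name)
    return pos_kwargs

print(either_dict_or_kwargs(None, {'time': 7, 'x': 2}, 'coarsen'))
# -> {'time': 7, 'x': 2}
```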

def resample(self, indexer=None, skipna=None, closed=None, label=None,
base=0, keep_attrs=None, loffset=None, **indexer_kwargs):
"""Returns a Resample object for performing resampling operations.
1 change: 1 addition & 0 deletions xarray/core/dataarray.py
@@ -161,6 +161,7 @@ class DataArray(AbstractArray, DataWithCoords):
"""
_groupby_cls = groupby.DataArrayGroupBy
_rolling_cls = rolling.DataArrayRolling
_coarsen_cls = rolling.DataArrayCoarsen
_resample_cls = resample.DataArrayResample

dt = property(DatetimeAccessor)
1 change: 1 addition & 0 deletions xarray/core/dataset.py
@@ -317,6 +317,7 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords,
"""
_groupby_cls = groupby.DatasetGroupBy
_rolling_cls = rolling.DatasetRolling
_coarsen_cls = rolling.DatasetCoarsen
_resample_cls = resample.DatasetResample

def __init__(self, data_vars=None, coords=None, attrs=None,
23 changes: 20 additions & 3 deletions xarray/core/duck_array_ops.py
@@ -13,7 +13,7 @@
import numpy as np
import pandas as pd

from . import dask_array_ops, dtypes, npcompat, nputils
from . import dask_array_ops, dtypes, npcompat, nputils, utils
from .nputils import nanfirst, nanlast
from .pycompat import dask_array_type

@@ -261,8 +261,6 @@ def f(values, axis=None, skipna=None, **kwargs):
sum = _create_nan_agg_method('sum')
sum.numeric_only = True
sum.available_min_count = True
mean = _create_nan_agg_method('mean')
mean.numeric_only = True
std = _create_nan_agg_method('std')
std.numeric_only = True
var = _create_nan_agg_method('var')
@@ -278,6 +276,25 @@ def f(values, axis=None, skipna=None, **kwargs):
cumsum_1d.numeric_only = True


_mean = _create_nan_agg_method('mean')


def mean(array, axis=None, skipna=None, **kwargs):
""" in-house mean that can handle datetime dtype """
array = asarray(array)
if array.dtype.kind == 'M':
offset = array.min()
# xarray always uses datetime[ns] for datetime
dtype = 'timedelta64[ns]'
return _mean(utils.datetime_to_numeric(array, offset), axis=axis,
skipna=skipna, **kwargs).astype(dtype) + offset
else:
return _mean(array, axis=axis, skipna=skipna, **kwargs)


mean.numeric_only = True
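
The offset trick above can be sketched in plain NumPy: shift the datetimes to
numeric nanoseconds, take a NaN-skipping mean, and shift back (illustrative
names; not the function as committed):

```python
import numpy as np

times = np.array(['2000-01-01', '2000-01-03', 'NaT'], dtype='datetime64[ns]')

offset = times[~np.isnat(times)].min()
# dividing by a unit timedelta converts to float nanoseconds;
# NaT becomes NaN, which np.nanmean then skips
numeric = (times - offset) / np.timedelta64(1, 'ns')
mean = offset + np.nanmean(numeric).astype('timedelta64[ns]')
print(mean)  # the mean of Jan 1 and Jan 3 is Jan 2
```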


def _nd_cum_func(cum_func, array, axis, **kwargs):
array = asarray(array)
if axis is None:
4 changes: 2 additions & 2 deletions xarray/core/missing.py
@@ -7,7 +7,7 @@
import numpy as np
import pandas as pd

from . import rolling
from . import utils
from .common import _contains_datetime_like_objects
from .computation import apply_ufunc
from .duck_array_ops import dask_array_type
@@ -370,7 +370,7 @@ def _get_valid_fill_mask(arr, dim, limit):
None'''
kw = {dim: limit + 1}
# we explicitly use construct method to avoid copy.
new_dim = rolling._get_new_dimname(arr.dims, '_window')
new_dim = utils.get_temp_dimname(arr.dims, '_window')
return (arr.isnull().rolling(min_periods=1, **kw)
.construct(new_dim, fill_value=False)
.sum(new_dim, skipna=False)) <= limit
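
The ``get_temp_dimname`` helper relied on here can be sketched as follows (a
guess at its behaviour: keep prefixing underscores until the candidate name
collides with no existing dimension):

```python
def get_temp_dimname(dims, new_dim):
    # prefix underscores until the name is free among existing dims
    while new_dim in dims:
        new_dim = '_' + new_dim
    return new_dim

print(get_temp_dimname(('time', '_window'), '_window'))  # -> __window
```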
26 changes: 26 additions & 0 deletions xarray/core/ops.py
@@ -122,6 +122,20 @@
New {da_or_ds} object with `{name}` applied along its rolling dimension.
"""

_COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\
Coarsen this object by applying `{name}` along its dimensions.
Parameters
----------
**kwargs : dict
Additional keyword arguments passed on to `{name}`.
Returns
-------
reduced : DataArray or Dataset
New object with `{name}` applied along its coarsened dimensions.
"""


def fillna(data, other, join="left", dataset_join="left"):
"""Fill missing values in this object with data from the other object.
@@ -378,3 +392,15 @@ def inject_datasetrolling_methods(cls):
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='Dataset')
setattr(cls, 'count', func)


def inject_coarsen_methods(cls):
# standard numpy reduce methods
methods = [(name, getattr(duck_array_ops, name))
for name in NAN_REDUCE_METHODS]
for name, f in methods:
func = cls._reduce_method(f)
func.__name__ = name
func.__doc__ = _COARSEN_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__)
setattr(cls, name, func)
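
The injection pattern above can be demonstrated with a toy stand-in for the
Coarsen class (``ToyCoarsen`` and ``inject_coarsen_methods`` here are
illustrative, not xarray's actual classes):

```python
import numpy as np

class ToyCoarsen:
    # toy stand-in for DataArrayCoarsen, showing the injection pattern only
    def __init__(self, data, window):
        self.data, self.window = data, window

    @classmethod
    def _reduce_method(cls, func):
        # wrap a numpy aggregation so it reduces each non-overlapping block
        def wrapped(self):
            n = self.data.size - self.data.size % self.window
            return func(self.data[:n].reshape(-1, self.window), axis=1)
        return wrapped

def inject_coarsen_methods(cls, names=('mean', 'max', 'min', 'sum')):
    # attach one reduce method per aggregation name, as the function above does
    for name in names:
        func = cls._reduce_method(getattr(np, name))
        func.__name__ = name
        setattr(cls, name, func)

inject_coarsen_methods(ToyCoarsen)
print(ToyCoarsen(np.arange(10.0), 3).max())  # -> [2. 5. 8.]
```

Each generated method shares the block-reduction logic, so only the numpy
aggregation varies.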
