diff --git a/RELEASE.rst b/RELEASE.rst
index b4d2a534a15c7..4894a795ea8f8 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -38,6 +38,8 @@ pandas 0.8.1
 
 **Improvements to existing features**
 
+  - Use moving min/max algorithms from Bottleneck in rolling_min/rolling_max
+    for > 100x speedup. (#1504, #50)
   - Drastically improve ``to_datetime`` performance on ISO8601 datetime strings
     (with no time zones) (#1571)
   - Add ability to append hierarchical index levels with ``set_index`` and to
diff --git a/pandas/src/moments.pyx b/pandas/src/moments.pyx
index 8863327979202..7020661712d10 100644
--- a/pandas/src/moments.pyx
+++ b/pandas/src/moments.pyx
@@ -613,12 +613,116 @@ cdef double_t _get_median(object sl, int nobs, int minp):
     else:
         return NaN
 
+#----------------------------------------------------------------------
+
+# Moving maximum / minimum code taken from Bottleneck under the terms
+# of its Simplified BSD license
+# https://github.com/kwgoodman/bottleneck
+
+# Ring-buffer entry: a candidate extremum and the index at which it
+# drops out of the window.
+cdef struct pairs:
+    double value
+    int death
+
+from libc cimport stdlib
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def roll_max2(ndarray[float64_t] a, int window, int minp):
+    """
+    Moving max of 1d array of dtype=float64 along axis=0 ignoring NaNs.
+
+    y[i] is the largest non-NaN value among the (up to) ``window`` entries
+    ending at a[i], or NaN when fewer than ``minp`` of them are non-NaN.
+    Raises ValueError unless 1 <= window <= len(a), and MemoryError if the
+    ring buffer cannot be allocated.
+    """
+    cdef np.float64_t ai, aold
+    cdef Py_ssize_t count
+    cdef pairs* ring
+    cdef pairs* minpair
+    cdef pairs* end
+    cdef pairs* last
+    cdef Py_ssize_t i0
+    cdef np.npy_intp *dim
+    dim = PyArray_DIMS(a)
+    cdef Py_ssize_t n0 = dim[0]
+    cdef np.npy_intp *dims = [n0]
+    cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims,
+                                                            NPY_float64, 0)
+
+    minp = _check_minp(minp, n0)
+
+    if (window < 1) or (window > n0):
+        raise ValueError('Invalid window size %d for len %d array'
+                         % (window, n0))
+
+    # Cython requires an explicit cast from malloc's void* result
+    ring = <pairs*>stdlib.malloc(window * sizeof(pairs))
+    if ring == NULL:
+        raise MemoryError()
+    end = ring + window
+    last = ring
+
+    # despite its name, minpair points at the current window maximum
+    minpair = ring
+    ai = a[0]
+    if ai == ai:
+        minpair.value = ai
+    else:
+        minpair.value = MINfloat64
+    minpair.death = window
+
+    count = 0
+    for i0 in range(n0):
+        ai = a[i0]
+        # NaN fails ai == ai: it is not counted and becomes the sentinel
+        if ai == ai:
+            count += 1
+        else:
+            ai = MINfloat64
+        if i0 >= window:
+            aold = a[i0 - window]
+            if aold == aold:
+                count -= 1
+        # the current extremum expires at this index; advance to the next
+        if minpair.death == i0:
+            minpair += 1
+            if minpair >= end:
+                minpair = ring
+        if ai >= minpair.value:
+            minpair.value = ai
+            minpair.death = i0 + window
+            last = minpair
+        else:
+            # step back over candidates <= ai, then store ai after them
+            while last.value <= ai:
+                if last == ring:
+                    last = end
+                last -= 1
+            last += 1
+            if last == end:
+                last = ring
+            last.value = ai
+            last.death = i0 + window
+        # this test alone yields the leading NaNs; also blanking the first
+        # window - 1 outputs would override a minp smaller than window
+        if count >= minp:
+            y[i0] = minpair.value
+        else:
+            y[i0] = NaN
+
+    stdlib.free(ring)
+    return y
+
 def roll_max(ndarray input, int win, int minp):
     '''
     O(N log(window)) implementation using skip list
     '''
     return _roll_skiplist_op(input, win, minp, _get_max)
 
+
 cdef double_t _get_max(object skiplist, int nobs, int minp):
     if nobs >= minp:
         return skiplist.get(nobs - 1)
@@ -631,6 +735,93 @@ def roll_min(ndarray input, int win, int minp):
     '''
     return _roll_skiplist_op(input, win, minp, _get_min)
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def roll_min2(np.ndarray[np.float64_t, ndim=1] a, int window, int minp):
+    """
+    Moving min of 1d array of dtype=float64 along axis=0 ignoring NaNs.
+
+    y[i] is the smallest non-NaN value among the (up to) ``window`` entries
+    ending at a[i], or NaN when fewer than ``minp`` of them are non-NaN.
+    Raises ValueError unless 1 <= window <= len(a), and MemoryError if the
+    ring buffer cannot be allocated.
+    """
+    cdef np.float64_t ai, aold
+    cdef Py_ssize_t count
+    cdef pairs* ring
+    cdef pairs* minpair
+    cdef pairs* end
+    cdef pairs* last
+    cdef Py_ssize_t i0
+    cdef np.npy_intp *dim
+    dim = PyArray_DIMS(a)
+    cdef Py_ssize_t n0 = dim[0]
+    cdef np.npy_intp *dims = [n0]
+    cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims,
+                                                            NPY_float64, 0)
+    if (window < 1) or (window > n0):
+        raise ValueError('Invalid window size %d for len %d array'
+                         % (window, n0))
+
+    minp = _check_minp(minp, n0)
+
+    # Cython requires an explicit cast from malloc's void* result
+    ring = <pairs*>stdlib.malloc(window * sizeof(pairs))
+    if ring == NULL:
+        raise MemoryError()
+    end = ring + window
+    last = ring
+
+    minpair = ring
+    ai = a[0]
+    if ai == ai:
+        minpair.value = ai
+    else:
+        minpair.value = MAXfloat64
+    minpair.death = window
+
+    count = 0
+    for i0 in range(n0):
+        ai = a[i0]
+        # NaN fails ai == ai: it is not counted and becomes the sentinel
+        if ai == ai:
+            count += 1
+        else:
+            ai = MAXfloat64
+        if i0 >= window:
+            aold = a[i0 - window]
+            if aold == aold:
+                count -= 1
+        # the current extremum expires at this index; advance to the next
+        if minpair.death == i0:
+            minpair += 1
+            if minpair >= end:
+                minpair = ring
+        if ai <= minpair.value:
+            minpair.value = ai
+            minpair.death = i0 + window
+            last = minpair
+        else:
+            # step back over candidates >= ai, then store ai after them
+            while last.value >= ai:
+                if last == ring:
+                    last = end
+                last -= 1
+            last += 1
+            if last == end:
+                last = ring
+            last.value = ai
+            last.death = i0 + window
+        # this test alone yields the leading NaNs; also blanking the first
+        # window - 1 outputs would override a minp smaller than window
+        if count >= minp:
+            y[i0] = minpair.value
+        else:
+            y[i0] = NaN
+
+    stdlib.free(ring)
+    return y
+
 cdef double_t _get_min(object skiplist, int nobs, int minp):
     if nobs >= minp:
         return skiplist.get(0)
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
index a638fe0176075..35a994f01eb95 100644
--- a/pandas/stats/moments.py
+++ b/pandas/stats/moments.py
@@ -395,8 +395,8 @@ def call_cython(arg, window, minp, **kwds):
     return f
 
 
-rolling_max = _rolling_func(_tseries.roll_max, 'Moving maximum')
-rolling_min = _rolling_func(_tseries.roll_min, 'Moving minimum')
+rolling_max = _rolling_func(_tseries.roll_max2, 'Moving maximum')
+rolling_min = _rolling_func(_tseries.roll_min2, 'Moving minimum')
 rolling_sum = _rolling_func(_tseries.roll_sum, 'Moving sum')
 rolling_mean = _rolling_func(_tseries.roll_mean, 'Moving mean')
 rolling_median = _rolling_func(_tseries.roll_median_cython, 'Moving median')