Skip to content

Commit

Permalink
ENH: use bottleneck moving max/min impl for perf improvement close #1504
Browse files Browse the repository at this point in the history
, #50
  • Loading branch information
wesm committed Jul 12, 2012
1 parent f95a5d7 commit 2dcae6e
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 2 deletions.
2 changes: 2 additions & 0 deletions RELEASE.rst
Expand Up @@ -38,6 +38,8 @@ pandas 0.8.1

**Improvements to existing features**

- Use moving min/max algorithms from Bottleneck in rolling_min/rolling_max
for > 100x speedup. (#1504, #50)
- Drastically improve ``to_datetime`` performance on ISO8601 datetime strings
(with no time zones) (#1571)
- Add ability to append hierarchical index levels with ``set_index`` and to
Expand Down
162 changes: 162 additions & 0 deletions pandas/src/moments.pyx
Expand Up @@ -613,12 +613,100 @@ cdef double_t _get_median(object sl, int nobs, int minp):
else:
return NaN

#----------------------------------------------------------------------

# Moving maximum / minimum code taken from Bottleneck under the terms
# of its Simplified BSD license
# https://github.com/kwgoodman/bottleneck

# Ring-buffer entry used by roll_max2 / roll_min2.
#   value: a candidate window extreme (NaN inputs are stored as a sentinel
#          by the callers before being written here)
#   death: the loop index at which this entry stops being eligible; the
#          callers set it to ``i0 + window`` and compare it against ``i0``
cdef struct pairs:
    double value
    int death

from libc cimport stdlib

@cython.boundscheck(False)
@cython.wraparound(False)
def roll_max2(ndarray[float64_t] a, int window, int minp):
    """
    Moving max of 1d array of dtype=float64 along axis=0 ignoring NaNs.

    Algorithm taken from Bottleneck: a ring buffer of (value, death) pairs
    keeps the candidates for the window maximum in decreasing order, so the
    current maximum is always at the front of the ring.

    Parameters
    ----------
    a : ndarray[float64_t]
        Input values. NaN entries are ignored when computing the maximum.
    window : int
        Window length; must satisfy ``1 <= window <= len(a)``.
    minp : int
        Minimum number of non-NaN observations required in the window for
        a result to be emitted; normalized through ``_check_minp``.

    Returns
    -------
    y : ndarray[float64_t]
        Same length as ``a``. ``y[i]`` is the maximum of the non-NaN values
        in the window ending at ``i``, or NaN if that window holds fewer
        than ``minp`` non-NaN values.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1 or larger than ``len(a)``.
    MemoryError
        If the ring buffer cannot be allocated.
    """
    cdef np.float64_t ai, aold
    cdef Py_ssize_t count
    cdef pairs* ring
    cdef pairs* minpair
    cdef pairs* end
    cdef pairs* last
    cdef Py_ssize_t i0
    cdef np.npy_intp *dim
    dim = PyArray_DIMS(a)
    cdef Py_ssize_t n0 = dim[0]
    cdef np.npy_intp *dims = [n0]
    cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims,
                                                            NPY_float64, 0)

    minp = _check_minp(minp, n0)

    # This check also rejects empty input (window >= 1 > n0 == 0), which
    # makes the a[0] read below safe.
    if (window < 1) or (window > n0):
        raise ValueError('Invalid window size %d for len %d array'
                         % (window, n0))

    ring = <pairs*>stdlib.malloc(window * sizeof(pairs))
    if ring == NULL:
        # malloc reports failure by returning NULL; writing through it
        # would crash the interpreter.
        raise MemoryError()
    end = ring + window
    last = ring

    # ``minpair`` keeps the name used in the Bottleneck source; in this
    # function it points at the pair holding the current window maximum.
    minpair = ring
    ai = a[0]
    if ai == ai:
        minpair.value = ai
    else:
        # NaN is replaced by the MINfloat64 sentinel (defined elsewhere in
        # the module) so it is not preferred over a valid observation.
        minpair.value = MINfloat64
    minpair.death = window

    count = 0
    for i0 in range(n0):
        ai = a[i0]
        if ai == ai:
            count += 1
        else:
            ai = MINfloat64
        if i0 >= window:
            # The element leaving the window no longer counts as an
            # observation.
            aold = a[i0 - window]
            if aold == aold:
                count -= 1
        if minpair.death == i0:
            # The current maximum has expired; advance to the next
            # candidate, wrapping around the ring.
            minpair += 1
            if minpair >= end:
                minpair = ring
        if ai >= minpair.value:
            # The new value dominates every candidate: restart the ring
            # contents from it.
            minpair.value = ai
            minpair.death = i0 + window
            last = minpair
        else:
            # Drop trailing candidates that the new value dominates, then
            # append the new value after the survivors.
            while last.value <= ai:
                if last == ring:
                    last = end
                last -= 1
            last += 1
            if last == end:
                last = ring
            last.value = ai
            last.death = i0 + window
        if count >= minp:
            y[i0] = minpair.value
        else:
            y[i0] = NaN

    # The leading ``window - 1`` results are intentionally NOT overwritten
    # with NaN: the ``count >= minp`` test above already yields NaN wherever
    # too few observations are available, and forcing NaN here would make
    # ``minp < window`` have no effect on the start of the output.

    stdlib.free(ring)
    return y

def roll_max(ndarray input, int win, int minp):
    '''
    Rolling maximum via the skip-list driver.

    Hands the input, window and minimum-period arguments to
    _roll_skiplist_op together with _get_max as the per-window extractor.
    O(N log(window)) implementation.
    '''
    result = _roll_skiplist_op(input, win, minp, _get_max)
    return result


cdef double_t _get_max(object skiplist, int nobs, int minp):
if nobs >= minp:
return <IndexableSkiplist> skiplist.get(nobs - 1)
Expand All @@ -631,6 +719,80 @@ def roll_min(ndarray input, int win, int minp):
'''
return _roll_skiplist_op(input, win, minp, _get_min)

@cython.boundscheck(False)
@cython.wraparound(False)
def roll_min2(np.ndarray[np.float64_t, ndim=1] a, int window, int minp):
    """
    Moving min of 1d array of dtype=float64 along axis=0 ignoring NaNs.

    Algorithm taken from Bottleneck: a ring buffer of (value, death) pairs
    keeps the candidates for the window minimum in increasing order, so the
    current minimum is always at the front of the ring.

    Parameters
    ----------
    a : ndarray[float64_t, ndim=1]
        Input values. NaN entries are ignored when computing the minimum.
    window : int
        Window length; must satisfy ``1 <= window <= len(a)``.
    minp : int
        Minimum number of non-NaN observations required in the window for
        a result to be emitted; normalized through ``_check_minp``.

    Returns
    -------
    y : ndarray[float64_t]
        Same length as ``a``. ``y[i]`` is the minimum of the non-NaN values
        in the window ending at ``i``, or NaN if that window holds fewer
        than ``minp`` non-NaN values.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1 or larger than ``len(a)``.
    MemoryError
        If the ring buffer cannot be allocated.
    """
    cdef np.float64_t ai, aold
    cdef Py_ssize_t count
    cdef pairs* ring
    cdef pairs* minpair
    cdef pairs* end
    cdef pairs* last
    cdef Py_ssize_t i0
    cdef np.npy_intp *dim
    dim = PyArray_DIMS(a)
    cdef Py_ssize_t n0 = dim[0]
    cdef np.npy_intp *dims = [n0]
    cdef np.ndarray[np.float64_t, ndim=1] y = PyArray_EMPTY(1, dims,
                                                            NPY_float64, 0)
    # This check also rejects empty input (window >= 1 > n0 == 0), which
    # makes the a[0] read below safe.
    if (window < 1) or (window > n0):
        raise ValueError('Invalid window size %d for len %d array'
                         % (window, n0))

    minp = _check_minp(minp, n0)

    ring = <pairs*>stdlib.malloc(window * sizeof(pairs))
    if ring == NULL:
        # malloc reports failure by returning NULL; writing through it
        # would crash the interpreter.
        raise MemoryError()
    end = ring + window
    last = ring

    # ``minpair`` points at the pair holding the current window minimum.
    minpair = ring
    ai = a[0]
    if ai == ai:
        minpair.value = ai
    else:
        # NaN is replaced by the MAXfloat64 sentinel (defined elsewhere in
        # the module) so it is not preferred over a valid observation.
        minpair.value = MAXfloat64
    minpair.death = window

    count = 0
    for i0 in range(n0):
        ai = a[i0]
        if ai == ai:
            count += 1
        else:
            ai = MAXfloat64
        if i0 >= window:
            # The element leaving the window no longer counts as an
            # observation.
            aold = a[i0 - window]
            if aold == aold:
                count -= 1
        if minpair.death == i0:
            # The current minimum has expired; advance to the next
            # candidate, wrapping around the ring.
            minpair += 1
            if minpair >= end:
                minpair = ring
        if ai <= minpair.value:
            # The new value dominates every candidate: restart the ring
            # contents from it.
            minpair.value = ai
            minpair.death = i0 + window
            last = minpair
        else:
            # Drop trailing candidates that the new value dominates, then
            # append the new value after the survivors.
            while last.value >= ai:
                if last == ring:
                    last = end
                last -= 1
            last += 1
            if last == end:
                last = ring
            last.value = ai
            last.death = i0 + window
        if count >= minp:
            y[i0] = minpair.value
        else:
            y[i0] = NaN

    # The leading ``window - 1`` results are intentionally NOT overwritten
    # with NaN: the ``count >= minp`` test above already yields NaN wherever
    # too few observations are available, and forcing NaN here would make
    # ``minp < window`` have no effect on the start of the output.

    stdlib.free(ring)
    return y

cdef double_t _get_min(object skiplist, int nobs, int minp):
if nobs >= minp:
return <IndexableSkiplist> skiplist.get(0)
Expand Down
4 changes: 2 additions & 2 deletions pandas/stats/moments.py
Expand Up @@ -395,8 +395,8 @@ def call_cython(arg, window, minp, **kwds):

return f

rolling_max = _rolling_func(_tseries.roll_max, 'Moving maximum')
rolling_min = _rolling_func(_tseries.roll_min, 'Moving minimum')
rolling_max = _rolling_func(_tseries.roll_max2, 'Moving maximum')
rolling_min = _rolling_func(_tseries.roll_min2, 'Moving minimum')
rolling_sum = _rolling_func(_tseries.roll_sum, 'Moving sum')
rolling_mean = _rolling_func(_tseries.roll_mean, 'Moving mean')
rolling_median = _rolling_func(_tseries.roll_median_cython, 'Moving median')
Expand Down

0 comments on commit 2dcae6e

Please sign in to comment.