Skip to content

Commit

Permalink
FIX fix performance regression in trees with low-cardinality features (
Browse files Browse the repository at this point in the history
…scikit-learn#23410)

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
  • Loading branch information
3 people committed May 19, 2022
1 parent a656d8a commit 007ea89
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 11 deletions.
14 changes: 13 additions & 1 deletion doc/whats_new/v1.1.rst
Expand Up @@ -39,7 +39,19 @@ Changelog
classifier that always predicts the positive class: recall=100% and
precision=class balance.
:pr:`23214` by :user:`Stéphane Collot <stephanecollot>` and :user:`Max Baak <mbaak>`.


:mod:`sklearn.tree`
...................

- |Fix| Fixes performance regression with low cardinality features for
:class:`tree.DecisionTreeClassifier`,
:class:`tree.DecisionTreeRegressor`,
:class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`,
:class:`ensemble.GradientBoostingClassifier`, and
:class:`ensemble.GradientBoostingRegressor`.
:pr:`23410` by :user:`Loïc Estève <lesteve>`.

:mod:`sklearn.utils`
....................

Expand Down
132 changes: 122 additions & 10 deletions sklearn/tree/_splitter.pyx
Expand Up @@ -29,7 +29,6 @@ from ._utils cimport rand_int
from ._utils cimport rand_uniform
from ._utils cimport RAND_R_MAX
from ._utils cimport safe_realloc
from ..utils._sorting cimport simultaneous_sort

cdef double INFINITY = np.inf

Expand Down Expand Up @@ -359,7 +358,7 @@ cdef class BestSplitter(BaseDenseSplitter):
for i in range(start, end):
Xf[i] = self.X[samples[i], current.feature]

simultaneous_sort(Xf + start, samples + start, end - start)
sort(Xf + start, samples + start, end - start)

if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]
Expand Down Expand Up @@ -452,6 +451,120 @@ cdef class BestSplitter(BaseDenseSplitter):
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # Empty range: nothing to sort (and log(0) is undefined).
    if n == 0:
        return
    # Depth budget of 2 * floor(log(n)); once introsort exhausts it,
    # it falls back to heapsort to keep the worst case O(n log n).
    introsort(Xf, samples, n, 2 * <int>log(n))


cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples,
                      SIZE_t i, SIZE_t j) nogil:
    # Exchange positions i and j in both arrays (helper for sort),
    # keeping Xf and samples aligned element-for-element.
    cdef DTYPE_t tmp_value = Xf[i]
    cdef SIZE_t tmp_sample = samples[i]
    Xf[i] = Xf[j]
    Xf[j] = tmp_value
    samples[i] = samples[j]
    samples[j] = tmp_sample


cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Candidates are the first, middle and last elements of Xf[:n].
    cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        # a is not the largest: median is min(b, max(a, c)).
        if b < c:
            return b
        if a < c:
            return c
        return a
    # Here b <= a.
    if b < c:
        # b is the smallest: median is min(a, c).
        if a < c:
            return a
        return c
    # c <= b <= a: b is the median.
    return b


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples,
                    SIZE_t n, int maxd) nogil:
    # Sorts Xf[:n] in place (permuting samples[:n] identically).
    # Quicksorts until the depth budget `maxd` is exhausted, then falls
    # back to heapsort so the worst case stays O(n log n).
    cdef DTYPE_t pivot
    cdef SIZE_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition.
        # Loop invariant maintained below: Xf[:l] < pivot,
        # Xf[l:i] == pivot, Xf[i:r] unexamined, Xf[r:] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; the "== pivot" middle block is
        # already in final position. The "> pivot" suffix is handled by
        # advancing the pointers and looping (manual tail-call elimination).
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
                           SIZE_t start, SIZE_t end) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # samples is permuted identically so it stays aligned with Xf.
    cdef SIZE_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1   # left child of `root` in the implicit binary heap

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Root dominates both children: heap property restored.
            break
        else:
            # Swap the larger child up and keep sifting down from there.
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # In-place heapsort of Xf[:n], permuting samples[:n] identically.
    # Serves as the worst-case O(n log n) fallback invoked by introsort.
    # NOTE(review): the heapify start index below assumes n >= 2 — introsort
    # only calls this with n > 1, but verify before reusing elsewhere.
    cdef SIZE_t start, end

    # heapify: sift down each internal node, from the last parent up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1


cdef class RandomSplitter(BaseDenseSplitter):
"""Splitter for finding the best random split."""
def __reduce__(self):
Expand Down Expand Up @@ -1077,14 +1190,13 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
f_j += n_found_constants
# f_j in the interval [n_total_constants, f_i[

current.feature = features[f_j]
self.extract_nnz(current.feature,
&end_negative, &start_positive,
&is_samples_sorted)

# Sort the positive and negative parts of `Xf`
simultaneous_sort(Xf + start, samples + start, end_negative - start)
simultaneous_sort(Xf + start_positive, samples + start_positive,
current.feature = features[f_j]
self.extract_nnz(current.feature, &end_negative, &start_positive,
&is_samples_sorted)
# Sort the positive and negative parts of `Xf`
sort(Xf + start, samples + start, end_negative - start)
if start_positive < end:
sort(Xf + start_positive, samples + start_positive,
end - start_positive)

# Update index_to_samples to take into account the sort
Expand Down

0 comments on commit 007ea89

Please sign in to comment.