diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 6a66ba4024c6c..516d90cd9bc95 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -40,6 +40,18 @@ Changelog precision=class balance. :pr:`23214` by :user:`Stéphane Collot ` and :user:`Max Baak `. +:mod:`sklearn.tree` +................... + +- |Fix| Fixes performance regression with low cardinality features for + :class:`tree.DecisionTreeClassifier`, + :class:`tree.DecisionTreeRegressor`, + :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, + :class:`ensemble.GradientBoostingClassifier`, and + :class:`ensemble.GradientBoostingRegressor`. + :pr:`23410` by :user:`Loïc Estève ` + :mod:`sklearn.utils` .................... diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a14d0ce26ee92..76b502f98f144 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -26,7 +26,6 @@ from ._utils cimport log from ._utils cimport rand_int from ._utils cimport rand_uniform from ._utils cimport RAND_R_MAX -from ..utils._sorting cimport simultaneous_sort cdef double INFINITY = np.inf @@ -342,7 +341,7 @@ cdef class BestSplitter(BaseDenseSplitter): for i in range(start, end): Xf[i] = self.X[samples[i], current.feature] - simultaneous_sort(&Xf[start], &samples[start], end - start) + sort(&Xf[start], &samples[start], end - start) if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD: features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j] @@ -438,6 +437,120 @@ cdef class BestSplitter(BaseDenseSplitter): return 0 +# Sort n-element arrays pointed to by Xf and samples, simultaneously, +# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997). +cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + if n == 0: + return + cdef int maxd = 2 * log(n) + introsort(Xf, samples, n, maxd) + + +cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t i, SIZE_t j) nogil: + # Helper for sort + Xf[i], Xf[j] = Xf[j], Xf[i] + samples[i], samples[j] = samples[j], samples[i] + + +cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil: + # Median of three pivot selection, after Bentley and McIlroy (1993). + # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. + cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1] + if a < b: + if b < c: + return b + elif a < c: + return c + else: + return a + elif b < c: + if a < c: + return a + else: + return c + else: + return b + + +# Introsort with median of 3 pivot selection and 3-way partition function +# (robust to repeated elements, e.g. lots of zero features). +cdef void introsort(DTYPE_t* Xf, SIZE_t *samples, + SIZE_t n, int maxd) nogil: + cdef DTYPE_t pivot + cdef SIZE_t i, l, r + + while n > 1: + if maxd <= 0: # max depth limit exceeded ("gone quadratic") + heapsort(Xf, samples, n) + return + maxd -= 1 + + pivot = median3(Xf, n) + + # Three-way partition. + i = l = 0 + r = n + while i < r: + if Xf[i] < pivot: + swap(Xf, samples, i, l) + i += 1 + l += 1 + elif Xf[i] > pivot: + r -= 1 + swap(Xf, samples, i, r) + else: + i += 1 + + introsort(Xf, samples, l, maxd) + Xf += r + samples += r + n -= r + + +cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples, + SIZE_t start, SIZE_t end) nogil: + # Restore heap order in Xf[start:end] by moving the max element to start. + cdef SIZE_t child, maxind, root + + root = start + while True: + child = root * 2 + 1 + + # find max of root, left child, right child + maxind = root + if child < end and Xf[maxind] < Xf[child]: + maxind = child + if child + 1 < end and Xf[maxind] < Xf[child + 1]: + maxind = child + 1 + + if maxind == root: + break + else: + swap(Xf, samples, root, maxind) + root = maxind + + +cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil: + cdef SIZE_t start, end + + # heapify + start = (n - 2) / 2 + end = n + while True: + sift_down(Xf, samples, start, end) + if start == 0: + break + start -= 1 + + # sort by shrinking the heap, putting the max element immediately after it + end = n - 1 + while end > 0: + swap(Xf, samples, 0, end) + sift_down(Xf, samples, 0, end) + end = end - 1 + + cdef class RandomSplitter(BaseDenseSplitter): """Splitter for finding the best random split.""" def __reduce__(self): @@ -1047,11 +1160,11 @@ cdef class BestSparseSplitter(BaseSparseSplitter): current.feature = features[f_j] self.extract_nnz(current.feature, &end_negative, &start_positive, &is_samples_sorted) - # Sort the positive and negative parts of `Xf` - simultaneous_sort(&Xf[start], &samples[start], end_negative - start) + sort(&Xf[start], &samples[start], end_negative - start) if start_positive < end: - simultaneous_sort(&Xf[start_positive], &samples[start_positive], end - start_positive) + sort(&Xf[start_positive], &samples[start_positive], + end - start_positive) # Update index_to_samples to take into account the sort for p in range(start, end_negative):