Skip to content

Commit

Permalink
FIX fix performance regression in trees with low-cardinality features (
Browse files Browse the repository at this point in the history
…scikit-learn#23410)

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
  • Loading branch information
3 people committed May 19, 2022
1 parent a656d8a commit 007ea89
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 11 deletions.
14 changes: 13 additions & 1 deletion doc/whats_new/v1.1.rst
Expand Up @@ -39,7 +39,19 @@ Changelog
classifier that always predicts the positive class: recall=100% and
precision=class balance.
:pr:`23214` by :user:`Stéphane Collot <stephanecollot>` and :user:`Max Baak <mbaak>`.


:mod:`sklearn.tree`
...................

- |Fix| Fixes performance regression with low cardinality features for
:class:`tree.DecisionTreeClassifier`,
:class:`tree.DecisionTreeRegressor`,
:class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`,
:class:`ensemble.GradientBoostingClassifier`, and
:class:`ensemble.GradientBoostingRegressor`.
:pr:`23410` by :user:`Loïc Estève <lesteve>`.

:mod:`sklearn.utils`
....................

Expand Down
132 changes: 122 additions & 10 deletions sklearn/tree/_splitter.pyx
Expand Up @@ -29,7 +29,6 @@ from ._utils cimport rand_int
from ._utils cimport rand_uniform
from ._utils cimport RAND_R_MAX
from ._utils cimport safe_realloc
from ..utils._sorting cimport simultaneous_sort

cdef double INFINITY = np.inf

Expand Down Expand Up @@ -359,7 +358,7 @@ cdef class BestSplitter(BaseDenseSplitter):
for i in range(start, end):
Xf[i] = self.X[samples[i], current.feature]

simultaneous_sort(Xf + start, samples + start, end - start)
sort(Xf + start, samples + start, end - start)

if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]
Expand Down Expand Up @@ -452,6 +451,120 @@ cdef class BestSplitter(BaseDenseSplitter):
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # Empty range: nothing to sort (and log(0) is undefined).
    if n == 0:
        return
    # Depth budget of 2 * floor(log(n)); once introsort exhausts it,
    # it falls back to heapsort to keep the worst case O(n log n).
    introsort(Xf, samples, n, 2 * <int>log(n))


cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples,
                      SIZE_t i, SIZE_t j) nogil:
    # Exchange positions i and j in both arrays (helper for sort),
    # keeping Xf and samples aligned element-for-element.
    cdef DTYPE_t tmp_value = Xf[i]
    cdef SIZE_t tmp_sample = samples[i]
    Xf[i] = Xf[j]
    Xf[j] = tmp_value
    samples[i] = samples[j]
    samples[j] = tmp_sample


cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
    # Median of three pivot selection, after Bentley and McIlroy (1993).
    # Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
    # Candidates are the first, middle and last elements of Xf[:n].
    cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
    if a < b:
        # a is not the largest: median is min(b, max(a, c)).
        if b < c:
            return b
        if a < c:
            return c
        return a
    # Here b <= a.
    if b < c:
        # b is the smallest: median is min(a, c).
        if a < c:
            return a
        return c
    # c <= b <= a: b is the median.
    return b


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples,
                    SIZE_t n, int maxd) nogil:
    # Sorts Xf[:n] in place (permuting samples[:n] identically).
    # Quicksorts until the depth budget `maxd` is exhausted, then falls
    # back to heapsort so the worst case stays O(n log n).
    cdef DTYPE_t pivot
    cdef SIZE_t i, l, r

    while n > 1:
        if maxd <= 0:   # max depth limit exceeded ("gone quadratic")
            heapsort(Xf, samples, n)
            return
        maxd -= 1

        pivot = median3(Xf, n)

        # Three-way partition.
        # Loop invariant maintained below: Xf[:l] < pivot,
        # Xf[l:i] == pivot, Xf[i:r] unexamined, Xf[r:] > pivot.
        i = l = 0
        r = n
        while i < r:
            if Xf[i] < pivot:
                swap(Xf, samples, i, l)
                i += 1
                l += 1
            elif Xf[i] > pivot:
                r -= 1
                swap(Xf, samples, i, r)
            else:
                i += 1

        # Recurse on the "< pivot" prefix; the "== pivot" middle block is
        # already in final position. The "> pivot" suffix is handled by
        # advancing the pointers and looping (manual tail-call elimination).
        introsort(Xf, samples, l, maxd)
        Xf += r
        samples += r
        n -= r


cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
                           SIZE_t start, SIZE_t end) nogil:
    # Restore heap order in Xf[start:end] by moving the max element to start.
    # samples is permuted identically so it stays aligned with Xf.
    cdef SIZE_t child, maxind, root

    root = start
    while True:
        child = root * 2 + 1   # left child of `root` in the implicit binary heap

        # find max of root, left child, right child
        maxind = root
        if child < end and Xf[maxind] < Xf[child]:
            maxind = child
        if child + 1 < end and Xf[maxind] < Xf[child + 1]:
            maxind = child + 1

        if maxind == root:
            # Root dominates both children: heap property restored.
            break
        else:
            # Swap the larger child up and keep sifting down from there.
            swap(Xf, samples, root, maxind)
            root = maxind


cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
    # In-place heapsort of Xf[:n], permuting samples[:n] identically.
    # Serves as the worst-case O(n log n) fallback invoked by introsort.
    # NOTE(review): the heapify start index below assumes n >= 2 — introsort
    # only calls this with n > 1, but verify before reusing elsewhere.
    cdef SIZE_t start, end

    # heapify: sift down each internal node, from the last parent up to the root
    start = (n - 2) / 2
    end = n
    while True:
        sift_down(Xf, samples, start, end)
        if start == 0:
            break
        start -= 1

    # sort by shrinking the heap, putting the max element immediately after it
    end = n - 1
    while end > 0:
        swap(Xf, samples, 0, end)
        sift_down(Xf, samples, 0, end)
        end = end - 1


cdef class RandomSplitter(BaseDenseSplitter):
"""Splitter for finding the best random split."""
def __reduce__(self):
Expand Down Expand Up @@ -1077,14 +1190,13 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
f_j += n_found_constants
# f_j in the interval [n_total_constants, f_i[

current.feature = features[f_j]
self.extract_nnz(current.feature,
&end_negative, &start_positive,
&is_samples_sorted)

# Sort the positive and negative parts of `Xf`
simultaneous_sort(Xf + start, samples + start, end_negative - start)
simultaneous_sort(Xf + start_positive, samples + start_positive,
current.feature = features[f_j]
self.extract_nnz(current.feature, &end_negative, &start_positive,
&is_samples_sorted)
# Sort the positive and negative parts of `Xf`
sort(Xf + start, samples + start, end_negative - start)
if start_positive < end:
sort(Xf + start_positive, samples + start_positive,
end - start_positive)

# Update index_to_samples to take into account the sort
Expand Down

0 comments on commit 007ea89

Please sign in to comment.