Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX Fixes performance regression in trees #23404

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/whats_new/v1.1.rst
Expand Up @@ -33,6 +33,16 @@ Changelog
precision=class balance.
:pr:`23214` by :user:`Stéphane Collot <stephanecollot>` and :user:`Max Baak <mbaak>`.

:mod:`sklearn.tree`
...................

- |Fix| Fixes performance regression :class:`tree.DecisionTreeClassifier`,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- |Fix| Fixes performance regression :class:`tree.DecisionTreeClassifier`,
- |Fix| Fixes performance regression with low cardinality features for
:class:`tree.DecisionTreeClassifier`,

:class:`tree.DecisionTreeRegressor`,
:class:`ensemble.RandomForestClassifier`,
:class:`ensemble.RandomForestRegressor`,
:class:`ensemble.GradientBoostingClassifier`, and
:class:`ensemble.GradientBoostingRegressor`. :pr:`23404` by `Thomas Fan`_.

:mod:`sklearn.utils`
....................

Expand Down
8 changes: 5 additions & 3 deletions sklearn/tree/_splitter.pyx
Expand Up @@ -342,7 +342,7 @@ cdef class BestSplitter(BaseDenseSplitter):
for i in range(start, end):
Xf[i] = self.X[samples[i], current.feature]

simultaneous_sort(&Xf[start], &samples[start], end - start)
simultaneous_sort(&Xf[start], &samples[start], end - start, use_introsort=1)

if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]
Expand Down Expand Up @@ -1049,9 +1049,11 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
&is_samples_sorted)

# Sort the positive and negative parts of `Xf`
simultaneous_sort(&Xf[start], &samples[start], end_negative - start)
simultaneous_sort(&Xf[start], &samples[start], end_negative - start,
use_introsort=1)
if start_positive < end:
simultaneous_sort(&Xf[start_positive], &samples[start_positive], end - start_positive)
simultaneous_sort(&Xf[start_positive], &samples[start_positive],
end - start_positive, use_introsort=1)

# Update index_to_samples to take into account the sort
for p in range(start, end_negative):
Expand Down
1 change: 1 addition & 0 deletions sklearn/utils/_sorting.pxd
Expand Up @@ -6,4 +6,5 @@ cdef int simultaneous_sort(
floating *dist,
ITYPE_t *idx,
ITYPE_t size,
bint use_introsort=*,
) nogil
82 changes: 77 additions & 5 deletions sklearn/utils/_sorting.pyx
@@ -1,4 +1,5 @@
from cython cimport floating
from libc.math cimport log2

cdef inline void dual_swap(
floating* darr,
Expand All @@ -16,10 +17,62 @@ cdef inline void dual_swap(
iarr[b] = itmp


cdef int simultaneous_sort(
cdef inline void sift_down(
floating* values,
ITYPE_t* samples,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

samples => indices to be renamed here as well.

ITYPE_t start,
ITYPE_t end,
) nogil:
# Restore heap order in values[start:end] by moving the max element to start.
cdef ITYPE_t child, maxind, root

root = start
while True:
child = root * 2 + 1

# find max of root, left child, right child
maxind = root
if child < end and values[maxind] < values[child]:
maxind = child
if child + 1 < end and values[maxind] < values[child + 1]:
maxind = child + 1

if maxind == root:
break
else:
dual_swap(values, samples, root, maxind)
root = maxind


cdef inline void heapsort(
floating* values,
ITYPE_t* samples,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be renamed to indices to use a consistent notation with _simultaneous_sort.

ITYPE_t n
) nogil:
cdef:
ITYPE_t start = (n - 2) / 2
ITYPE_t end = n

# heapify
while True:
sift_down(values, samples, start, end)
if start == 0:
break
start -= 1

# sort by shrinking the heap, putting the max element immediately after it
end = n - 1
while end > 0:
dual_swap(values, samples, 0, end)
sift_down(values, samples, 0, end)
end = end - 1


cdef inline int simultaneous_sort(
floating* values,
ITYPE_t* indices,
ITYPE_t size,
bint use_introsort=0,
) nogil:
"""
Perform a recursive quicksort on the values array as to sort them ascendingly.
Expand All @@ -31,6 +84,10 @@ cdef int simultaneous_sort(
i = np.argsort(dist)
return dist[i], idx[i]

If use_introsort=1, then the introsort algorithm is used. This sorting algorithm
switches from quicksort to heapsort when the recursion depth based on
based on 2 * log2(size).

Notes
-----
Arrays are manipulated via a pointer to there first element and their size
Expand All @@ -41,6 +98,19 @@ cdef int simultaneous_sort(
# The best might be using a std::stable_sort and a Comparator which might need
# an Array of Structures (AoS) instead of the Structure of Arrays (SoA)
# currently used.
if use_introsort == 1:
_simultaneous_sort(values, indices, size, 2 * <int>log2(size), 1)
else:
_simultaneous_sort(values, indices, size, -1, 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type is int but we never return anything (0 is implicit in this case I guess).

Let's switch to void?



cdef inline int _simultaneous_sort(
floating* values,
ITYPE_t* indices,
ITYPE_t size,
int max_depth,
bint use_introsort,
) nogil:
cdef:
ITYPE_t pivot_idx, i, store_idx
floating pivot_val
Expand All @@ -58,6 +128,8 @@ cdef int simultaneous_sort(
dual_swap(values, indices, 1, 2)
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
elif use_introsort and max_depth <= 0:
heapsort(values, indices, size)
else:
# Determine the pivot using the median-of-three rule.
# The smallest of the three is moved to the beginning of the array,
Expand Down Expand Up @@ -85,9 +157,9 @@ cdef int simultaneous_sort(

# Recursively sort each side of the pivot
if pivot_idx > 1:
simultaneous_sort(values, indices, pivot_idx)
_simultaneous_sort(values, indices, pivot_idx, max_depth - 1, use_introsort)
if pivot_idx + 2 < size:
simultaneous_sort(values + pivot_idx + 1,
indices + pivot_idx + 1,
size - pivot_idx - 1)
_simultaneous_sort(values + pivot_idx + 1,
indices + pivot_idx + 1,
size - pivot_idx - 1, max_depth - 1, use_introsort)
return 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment for the private helper function: the return type should be void.