Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX fix performance regression in trees with low-cardinality features #23410

Merged
merged 4 commits into from May 19, 2022
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
123 changes: 118 additions & 5 deletions sklearn/tree/_splitter.pyx
Expand Up @@ -26,7 +26,6 @@ from ._utils cimport log
from ._utils cimport rand_int
from ._utils cimport rand_uniform
from ._utils cimport RAND_R_MAX
from ..utils._sorting cimport simultaneous_sort

cdef double INFINITY = np.inf

Expand Down Expand Up @@ -342,7 +341,7 @@ cdef class BestSplitter(BaseDenseSplitter):
for i in range(start, end):
Xf[i] = self.X[samples[i], current.feature]

simultaneous_sort(&Xf[start], &samples[start], end - start)
sort(&Xf[start], &samples[start], end - start)

if Xf[end - 1] <= Xf[start] + FEATURE_THRESHOLD:
features[f_j], features[n_total_constants] = features[n_total_constants], features[f_j]
Expand Down Expand Up @@ -438,6 +437,120 @@ cdef class BestSplitter(BaseDenseSplitter):
return 0


# Sort n-element arrays pointed to by Xf and samples, simultaneously,
# by the values in Xf. Algorithm: Introsort (Musser, SP&E, 1997).
cdef inline void sort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
if n == 0:
return
cdef int maxd = 2 * <int>log(n)
introsort(Xf, samples, n, maxd)


cdef inline void swap(DTYPE_t* Xf, SIZE_t* samples,
SIZE_t i, SIZE_t j) nogil:
# Helper for sort
Xf[i], Xf[j] = Xf[j], Xf[i]
samples[i], samples[j] = samples[j], samples[i]


cdef inline DTYPE_t median3(DTYPE_t* Xf, SIZE_t n) nogil:
# Median of three pivot selection, after Bentley and McIlroy (1993).
# Engineering a sort function. SP&E. Requires 8/3 comparisons on average.
cdef DTYPE_t a = Xf[0], b = Xf[n / 2], c = Xf[n - 1]
if a < b:
if b < c:
return b
elif a < c:
return c
else:
return a
elif b < c:
if a < c:
return a
else:
return c
else:
return b


# Introsort with median of 3 pivot selection and 3-way partition function
# (robust to repeated elements, e.g. lots of zero features).
cdef void introsort(DTYPE_t* Xf, SIZE_t *samples,
SIZE_t n, int maxd) nogil:
cdef DTYPE_t pivot
cdef SIZE_t i, l, r

while n > 1:
if maxd <= 0: # max depth limit exceeded ("gone quadratic")
heapsort(Xf, samples, n)
return
maxd -= 1

pivot = median3(Xf, n)

# Three-way partition.
i = l = 0
r = n
while i < r:
if Xf[i] < pivot:
swap(Xf, samples, i, l)
i += 1
l += 1
elif Xf[i] > pivot:
r -= 1
swap(Xf, samples, i, r)
else:
i += 1

introsort(Xf, samples, l, maxd)
Xf += r
samples += r
n -= r


cdef inline void sift_down(DTYPE_t* Xf, SIZE_t* samples,
SIZE_t start, SIZE_t end) nogil:
# Restore heap order in Xf[start:end] by moving the max element to start.
cdef SIZE_t child, maxind, root

root = start
while True:
child = root * 2 + 1

# find max of root, left child, right child
maxind = root
if child < end and Xf[maxind] < Xf[child]:
maxind = child
if child + 1 < end and Xf[maxind] < Xf[child + 1]:
maxind = child + 1

if maxind == root:
break
else:
swap(Xf, samples, root, maxind)
root = maxind


cdef void heapsort(DTYPE_t* Xf, SIZE_t* samples, SIZE_t n) nogil:
cdef SIZE_t start, end

# heapify
start = (n - 2) / 2
end = n
while True:
sift_down(Xf, samples, start, end)
if start == 0:
break
start -= 1

# sort by shrinking the heap, putting the max element immediately after it
end = n - 1
while end > 0:
swap(Xf, samples, 0, end)
sift_down(Xf, samples, 0, end)
end = end - 1


cdef class RandomSplitter(BaseDenseSplitter):
"""Splitter for finding the best random split."""
def __reduce__(self):
Expand Down Expand Up @@ -1047,11 +1160,11 @@ cdef class BestSparseSplitter(BaseSparseSplitter):
current.feature = features[f_j]
self.extract_nnz(current.feature, &end_negative, &start_positive,
&is_samples_sorted)

# Sort the positive and negative parts of `Xf`
simultaneous_sort(&Xf[start], &samples[start], end_negative - start)
sort(&Xf[start], &samples[start], end_negative - start)
if start_positive < end:
simultaneous_sort(&Xf[start_positive], &samples[start_positive], end - start_positive)
sort(&Xf[start_positive], &samples[start_positive],
end - start_positive)
Copy link
Member

@ogrisel ogrisel May 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might need to re-introduce the if start_positive < end: condition here.

lesteve marked this conversation as resolved.
Show resolved Hide resolved

# Update index_to_samples to take into account the sort
for p in range(start, end_negative):
Expand Down