forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 3
/
_sorting.pyx
165 lines (143 loc) · 5.09 KB
/
_sorting.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from cython cimport floating
from libc.math cimport log2
cdef inline void dual_swap(
floating* darr,
ITYPE_t *iarr,
ITYPE_t a,
ITYPE_t b,
) nogil:
"""Swap the values at index a and b of both darr and iarr"""
cdef floating dtmp = darr[a]
darr[a] = darr[b]
darr[b] = dtmp
cdef ITYPE_t itmp = iarr[a]
iarr[a] = iarr[b]
iarr[b] = itmp
cdef inline void sift_down(
floating* values,
ITYPE_t* samples,
ITYPE_t start,
ITYPE_t end,
) nogil:
# Restore heap order in values[start:end] by moving the max element to start.
cdef ITYPE_t child, maxind, root
root = start
while True:
child = root * 2 + 1
# find max of root, left child, right child
maxind = root
if child < end and values[maxind] < values[child]:
maxind = child
if child + 1 < end and values[maxind] < values[child + 1]:
maxind = child + 1
if maxind == root:
break
else:
dual_swap(values, samples, root, maxind)
root = maxind
cdef inline void heapsort(
floating* values,
ITYPE_t* samples,
ITYPE_t n
) nogil:
cdef:
ITYPE_t start = (n - 2) / 2
ITYPE_t end = n
# heapify
while True:
sift_down(values, samples, start, end)
if start == 0:
break
start -= 1
# sort by shrinking the heap, putting the max element immediately after it
end = n - 1
while end > 0:
dual_swap(values, samples, 0, end)
sift_down(values, samples, 0, end)
end = end - 1
cdef inline int simultaneous_sort(
floating* values,
ITYPE_t* indices,
ITYPE_t size,
bint use_introsort=0,
) nogil:
"""
Perform a recursive quicksort on the values array as to sort them ascendingly.
This simultaneously performs the swaps on both the values and the indices arrays.
The numpy equivalent is:
def simultaneous_sort(dist, idx):
i = np.argsort(dist)
return dist[i], idx[i]
If use_introsort=1, then the introsort algorithm is used. This sorting algorithm
switches from quicksort to heapsort when the recursion depth based on
based on 2 * log2(size).
Notes
-----
Arrays are manipulated via a pointer to there first element and their size
as to ease the processing of dynamically allocated buffers.
"""
# TODO: In order to support discrete distance metrics, we need to have a
# simultaneous sort which breaks ties on indices when distances are identical.
# The best might be using a std::stable_sort and a Comparator which might need
# an Array of Structures (AoS) instead of the Structure of Arrays (SoA)
# currently used.
if use_introsort == 1:
_simultaneous_sort(values, indices, size, 2 * <int>log2(size), 1)
else:
_simultaneous_sort(values, indices, size, -1, 0)
cdef inline int _simultaneous_sort(
floating* values,
ITYPE_t* indices,
ITYPE_t size,
int max_depth,
bint use_introsort,
) nogil:
cdef:
ITYPE_t pivot_idx, i, store_idx
floating pivot_val
# in the small-array case, do things efficiently
if size <= 1:
pass
elif size == 2:
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
elif size == 3:
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
if values[1] > values[2]:
dual_swap(values, indices, 1, 2)
if values[0] > values[1]:
dual_swap(values, indices, 0, 1)
elif use_introsort and max_depth <= 0:
heapsort(values, indices, size)
else:
# Determine the pivot using the median-of-three rule.
# The smallest of the three is moved to the beginning of the array,
# the middle (the pivot value) is moved to the end, and the largest
# is moved to the pivot index.
pivot_idx = size // 2
if values[0] > values[size - 1]:
dual_swap(values, indices, 0, size - 1)
if values[size - 1] > values[pivot_idx]:
dual_swap(values, indices, size - 1, pivot_idx)
if values[0] > values[size - 1]:
dual_swap(values, indices, 0, size - 1)
pivot_val = values[size - 1]
# Partition indices about pivot. At the end of this operation,
# pivot_idx will contain the pivot value, everything to the left
# will be smaller, and everything to the right will be larger.
store_idx = 0
for i in range(size - 1):
if values[i] < pivot_val:
dual_swap(values, indices, i, store_idx)
store_idx += 1
dual_swap(values, indices, store_idx, size - 1)
pivot_idx = store_idx
# Recursively sort each side of the pivot
if pivot_idx > 1:
_simultaneous_sort(values, indices, pivot_idx, max_depth - 1, use_introsort)
if pivot_idx + 2 < size:
_simultaneous_sort(values + pivot_idx + 1,
indices + pivot_idx + 1,
size - pivot_idx - 1, max_depth - 1, use_introsort)
return 0