forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 1
/
binary_tree.pxi
executable file
·2619 lines (2213 loc) · 105 KB
/
binary_tree.pxi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!python
# cython: language_level=3
# KD Tree and Ball Tree
# =====================
#
# Author: Jake Vanderplas <jakevdp@cs.washington.edu>, 2012-2013
# License: BSD
#
# This file is meant to be a literal include in a pyx file.
# See ball_tree.pyx and kd_tree.pyx
#
# The routines here are the core algorithms of the KDTree and BallTree
# structures. If Cython supported polymorphism, we would be able to
# create a subclass and derive KDTree and BallTree from it. Because
# polymorphism is not an option, we use this single BinaryTree class
# as a literal include to avoid duplicating the entire file.
#
# A series of functions are implemented in kd_tree.pyx and ball_tree.pyx
# which use the information here to calculate the lower and upper bounds
# between a node and a point, and between two nodes. These functions are
# used here, and are all that are needed to differentiate between the two
# tree types.
#
# Description of Binary Tree Algorithms
# -------------------------------------
# A binary tree can be thought of as a collection of nodes. The top node
# contains all the points. The next level consists of two nodes with half
# the points in each, and this continues recursively. Each node contains
# metadata which allow fast computation of distance bounds: in the case of
# a ball tree, the metadata is a center and a radius. In the case of a
# KD tree, the metadata is the minimum and maximum bound along each dimension.
#
# In a typical KD Tree or Ball Tree implementation, the nodes are implemented
# as dynamically allocated structures with pointers linking them. Here we
# take a different approach, storing all relevant data in a set of arrays
# so that the entire tree object can be saved in a pickle file. For efficiency,
# the data can be stored in such a way that explicit pointers are not
# necessary: for node data stored at index i, the two child nodes are at
# index (2 * i + 1) and (2 * i + 2); the parent node is (i - 1) // 2
# (where // indicates integer division).
#
# The data arrays used here are as follows:
# data : the [n_samples x n_features] array of data from which the tree
# is built
# idx_array : the length n_samples array used to keep track of the indices
# of data within each node. Each node has values idx_start and
# idx_end: the points within the node are given by (using numpy
# syntax) data[idx_array[idx_start:idx_end]].
# node_data : the length n_nodes array of structures which store the node
# indices, node radii, and leaf information for each node.
# node_bounds : the [* x n_nodes x n_features] array containing the node
# bound information. For ball tree, the first dimension is 1, and
# each row contains the centroid of the node. For kd tree, the first
# dimension is 2 and the rows for each point contain the arrays of
# lower bounds and upper bounds in each direction.
#
# The lack of dynamic allocation means the number of nodes must be computed
# before the building of the tree. This can be done assuming the points are
# divided equally between child nodes at each step; although this removes
# some flexibility in tree creation, it ensures a balanced tree and ensures
# that the number of nodes required can be computed beforehand. Given a
# specified leaf_size (the minimum number of points in any node), it is
# possible to show that a balanced tree will have
#
# n_levels = 1 + max(0, floor(log2((n_samples - 1) / leaf_size)))
#
# in order to satisfy
#
# leaf_size <= min(n_points) <= 2 * leaf_size
#
# with the exception of the special case where n_samples < leaf_size.
# for a given number of levels, the number of nodes in the tree is given by
#
# n_nodes = 2 ** n_levels - 1
#
# both these results can be straightforwardly shown by induction. The
# following code uses these values in the construction of the tree.
#
# Distance Metrics
# ----------------
# For flexibility, the trees can be built using a variety of distance metrics.
# The metrics are described in the DistanceMetric class: the standard
# Euclidean distance is the default, and is inlined to be faster than other
# metrics. In addition, each metric defines both a distance and a
# "reduced distance", which is often faster to compute, and is therefore
# used in the query architecture whenever possible. (For example, in the
# case of the standard Euclidean distance, the reduced distance is the
# squared-distance).
#
# Implementation Notes
# --------------------
# This implementation uses the common object-oriented approach of having an
# abstract base class which is extended by the KDTree and BallTree
# specializations.
#
# The BinaryTree "base class" is defined here and then subclassed in the BallTree
# and KDTree pyx files. These files include implementations of the
# "abstract" methods.
# Necessary Helper Functions
# --------------------------
# These are the names and descriptions of the "abstract" functions which are
# defined in kd_tree.pyx and ball_tree.pyx:
# cdef int allocate_data(BinaryTree tree, ITYPE_t n_nodes, ITYPE_t n_features):
# """Allocate arrays needed for the KD Tree"""
# cdef int init_node(BinaryTree tree, ITYPE_t i_node,
# ITYPE_t idx_start, ITYPE_t idx_end):
# """Initialize the node for the dataset stored in tree.data"""
# cdef DTYPE_t min_rdist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt):
# """Compute the minimum reduced-distance between a point and a node"""
# cdef DTYPE_t min_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt):
# """Compute the minimum distance between a point and a node"""
# cdef DTYPE_t max_rdist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt):
# """Compute the maximum reduced-distance between a point and a node"""
# cdef DTYPE_t max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt):
# """Compute the maximum distance between a point and a node"""
# cdef inline int min_max_dist(BinaryTree tree, ITYPE_t i_node, DTYPE_t* pt,
# DTYPE_t* min_dist, DTYPE_t* max_dist):
# """Compute the minimum and maximum distance between a point and a node"""
# cdef inline DTYPE_t min_rdist_dual(BinaryTree tree1, ITYPE_t i_node1,
# BinaryTree tree2, ITYPE_t i_node2):
# """Compute the minimum reduced distance between two nodes"""
# cdef inline DTYPE_t min_dist_dual(BinaryTree tree1, ITYPE_t i_node1,
# BinaryTree tree2, ITYPE_t i_node2):
# """Compute the minimum distance between two nodes"""
# cdef inline DTYPE_t max_rdist_dual(BinaryTree tree1, ITYPE_t i_node1,
# BinaryTree tree2, ITYPE_t i_node2):
# """Compute the maximum reduced distance between two nodes"""
# cdef inline DTYPE_t max_dist_dual(BinaryTree tree1, ITYPE_t i_node1,
# BinaryTree tree2, ITYPE_t i_node2):
# """Compute the maximum distance between two nodes"""
cimport cython
cimport numpy as np
from libc.math cimport fabs, sqrt, exp, cos, pow, log
from libc.stdlib cimport calloc, malloc, free
from libc.string cimport memcpy
from sklearn.utils.lgamma cimport lgamma
import numpy as np
import warnings
from ..utils import check_array
from .typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
from .typedefs import DTYPE, ITYPE
from .dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist,
euclidean_dist_to_rdist, euclidean_rdist_to_dist)
# Direct access to the NumPy C API: PyArray_ENABLEFLAGS is used elsewhere in
# the including .pyx files to hand ownership of malloc'd buffers to ndarrays.
cdef extern from "numpy/arrayobject.h":
    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

# Required before any use of the NumPy C API.
np.import_array()

# some handy constants
cdef DTYPE_t INF = np.inf
cdef DTYPE_t NEG_INF = -np.inf
cdef DTYPE_t PI = np.pi
cdef DTYPE_t ROOT_2PI = sqrt(2 * PI)   # sqrt(2*pi), Gaussian normalization
cdef DTYPE_t LOG_PI = log(PI)          # log(pi), used in logVn
cdef DTYPE_t LOG_2PI = log(2 * PI)     # log(2*pi), used in logSn / kernel norms
# Some compound datatypes used below:
# Heap entry used by NodeHeap during breadth-first queries:
# ``val`` is the sort key (a distance bound), ``i1``/``i2`` are payload
# indices (their meaning is assigned by the query routines that push them).
cdef struct NodeHeapData_t:
    DTYPE_t val
    ITYPE_t i1
    ITYPE_t i2

# build the corresponding numpy dtype for NodeHeapData
# There is no offsetof() function in cython, so we hack it:
# measure each field's offset by pointer arithmetic on a stack temporary.
# If we can ensure numpy 1.5 or greater, a cleaner way is to do
#     cdef NodeHeapData_t nhd_tmp
#     NodeHeapData = np.asarray(<NodeHeapData_t[:1]>(&nhd_tmp)).dtype
cdef NodeHeapData_t nhd_tmp
offsets = [<np.intp_t>&(nhd_tmp.val) - <np.intp_t>&nhd_tmp,
           <np.intp_t>&(nhd_tmp.i1) - <np.intp_t>&nhd_tmp,
           <np.intp_t>&(nhd_tmp.i2) - <np.intp_t>&nhd_tmp]
# itemsize is set explicitly so the dtype matches the C struct layout,
# including any padding the compiler inserts.
NodeHeapData = np.dtype({'names': ['val', 'i1', 'i2'],
                         'formats': [DTYPE, ITYPE, ITYPE],
                         'offsets': offsets,
                         'itemsize': sizeof(NodeHeapData_t)})
# Per-node metadata: the slice [idx_start, idx_end) into idx_array giving the
# points in the node, a leaf flag, and the node radius.
cdef struct NodeData_t:
    ITYPE_t idx_start
    ITYPE_t idx_end
    ITYPE_t is_leaf
    DTYPE_t radius

# build the corresponding numpy dtype for NodeData
# There is no offsetof() function in cython, so we hack it:
# measure each field's offset by pointer arithmetic on a stack temporary.
# If we can ensure numpy 1.5 or greater, a cleaner way is to do
#     cdef NodeData_t nd_tmp
#     NodeData = np.asarray(<NodeData_t[:1]>(&nd_tmp)).dtype
cdef NodeData_t nd_tmp
offsets = [<np.intp_t>&(nd_tmp.idx_start) - <np.intp_t>&nd_tmp,
           <np.intp_t>&(nd_tmp.idx_end) - <np.intp_t>&nd_tmp,
           <np.intp_t>&(nd_tmp.is_leaf) - <np.intp_t>&nd_tmp,
           <np.intp_t>&(nd_tmp.radius) - <np.intp_t>&nd_tmp]
# itemsize is set explicitly so the dtype matches the C struct layout,
# including any padding the compiler inserts.
NodeData = np.dtype({'names': ['idx_start', 'idx_end', 'is_leaf', 'radius'],
                     'formats': [ITYPE, ITYPE, ITYPE, DTYPE],
                     'offsets': offsets,
                     'itemsize': sizeof(NodeData_t)})
######################################################################
# Numpy 1.3-1.4 compatibility utilities
# Each helper below wraps a C-contiguous ndarray's data pointer in a typed
# memoryview of the matching shape.  The cast does not copy; the caller must
# keep the ndarray alive for as long as the returned view is used.
cdef DTYPE_t[::1] get_memview_DTYPE_1D(
                               np.ndarray[DTYPE_t, ndim=1, mode='c'] X):
    return <DTYPE_t[:X.shape[0]:1]> (<DTYPE_t*> X.data)


cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
    return <DTYPE_t[:X.shape[0], :X.shape[1]:1]> (<DTYPE_t*> X.data)


cdef DTYPE_t[:, :, ::1] get_memview_DTYPE_3D(
                               np.ndarray[DTYPE_t, ndim=3, mode='c'] X):
    return <DTYPE_t[:X.shape[0], :X.shape[1], :X.shape[2]:1]>\
                                       (<DTYPE_t*> X.data)


cdef ITYPE_t[::1] get_memview_ITYPE_1D(
                               np.ndarray[ITYPE_t, ndim=1, mode='c'] X):
    return <ITYPE_t[:X.shape[0]:1]> (<ITYPE_t*> X.data)


cdef ITYPE_t[:, ::1] get_memview_ITYPE_2D(
                               np.ndarray[ITYPE_t, ndim=2, mode='c'] X):
    return <ITYPE_t[:X.shape[0], :X.shape[1]:1]> (<ITYPE_t*> X.data)


cdef NodeHeapData_t[::1] get_memview_NodeHeapData_1D(
                    np.ndarray[NodeHeapData_t, ndim=1, mode='c'] X):
    return <NodeHeapData_t[:X.shape[0]:1]> (<NodeHeapData_t*> X.data)


cdef NodeData_t[::1] get_memview_NodeData_1D(
                    np.ndarray[NodeData_t, ndim=1, mode='c'] X):
    return <NodeData_t[:X.shape[0]:1]> (<NodeData_t*> X.data)
######################################################################
######################################################################
# Define doc strings, substituting the appropriate class name using
# the DOC_DICT variable defined in the pyx files.
# Runtime string (not a comment): this template is later formatted with the
# DOC_DICT mapping defined in kd_tree.pyx / ball_tree.pyx to produce the
# class docstring of KDTree and BallTree.  Do not edit the placeholders.
CLASS_DOC = \
"""{BinaryTree} for fast generalized N-point problems
{BinaryTree}(X, leaf_size=40, metric='minkowski', \\**kwargs)
Parameters
----------
X : array-like, shape = [n_samples, n_features]
n_samples is the number of points in the data set, and
n_features is the dimension of the parameter space.
Note: if X is a C-contiguous array of doubles then data will
not be copied. Otherwise, an internal copy will be made.
leaf_size : positive integer (default = 40)
Number of points at which to switch to brute-force. Changing
leaf_size will not affect the results of a query, but can
significantly impact the speed of a query and the memory required
to store the constructed tree. The amount of memory needed to
store the tree scales as approximately n_samples / leaf_size.
For a specified ``leaf_size``, a leaf node is guaranteed to
satisfy ``leaf_size <= n_points <= 2 * leaf_size``, except in
the case that ``n_samples < leaf_size``.
metric : string or DistanceMetric object
the distance metric to use for the tree. Default='minkowski'
with p=2 (that is, a euclidean metric). See the documentation
of the DistanceMetric class for a list of available metrics.
{binary_tree}.valid_metrics gives a list of the metrics which
are valid for {BinaryTree}.
Additional keywords are passed to the distance metric class.
Attributes
----------
data : memory view
The training data
Examples
--------
Query for k-nearest neighbors
>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
>>> tree = {BinaryTree}(X, leaf_size=2)              # doctest: +SKIP
>>> dist, ind = tree.query(X[:1], k=3)                # doctest: +SKIP
>>> print(ind)  # indices of 3 closest neighbors
[0 3 1]
>>> print(dist)  # distances to 3 closest neighbors
[ 0.          0.19662693  0.29473397]
Pickle and Unpickle a tree. Note that the state of the tree is saved in the
pickle operation: the tree needs not be rebuilt upon unpickling.
>>> import numpy as np
>>> import pickle
>>> rng = np.random.RandomState(0)
>>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
>>> tree = {BinaryTree}(X, leaf_size=2)        # doctest: +SKIP
>>> s = pickle.dumps(tree)                     # doctest: +SKIP
>>> tree_copy = pickle.loads(s)                # doctest: +SKIP
>>> dist, ind = tree_copy.query(X[:1], k=3)     # doctest: +SKIP
>>> print(ind)  # indices of 3 closest neighbors
[0 3 1]
>>> print(dist)  # distances to 3 closest neighbors
[ 0.          0.19662693  0.29473397]
Query for neighbors within a given radius
>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> X = rng.random_sample((10, 3))  # 10 points in 3 dimensions
>>> tree = {BinaryTree}(X, leaf_size=2)     # doctest: +SKIP
>>> print(tree.query_radius(X[:1], r=0.3, count_only=True))
3
>>> ind = tree.query_radius(X[:1], r=0.3)  # doctest: +SKIP
>>> print(ind)  # indices of neighbors within distance 0.3
[3 0 1]
Compute a gaussian kernel density estimate:
>>> import numpy as np
>>> rng = np.random.RandomState(42)
>>> X = rng.random_sample((100, 3))
>>> tree = {BinaryTree}(X)                # doctest: +SKIP
>>> tree.kernel_density(X[:3], h=0.1, kernel='gaussian')
array([ 6.94114649,  7.83281226,  7.2071716 ])
Compute a two-point auto-correlation function
>>> import numpy as np
>>> rng = np.random.RandomState(0)
>>> X = rng.random_sample((30, 3))
>>> r = np.linspace(0, 1, 5)
>>> tree = {BinaryTree}(X)                # doctest: +SKIP
>>> tree.two_point_correlation(X, r)
array([ 30,  62, 278, 580, 820])
"""
######################################################################
# Utility functions
cdef DTYPE_t logaddexp(DTYPE_t x1, DTYPE_t x2):
    """Return log(exp(x1) + exp(x2)), computed stably.

    The larger argument is factored out so neither exp() can overflow.
    """
    cdef DTYPE_t m = fmax(x1, x2)
    # Both inputs are -inf: exp(x - m) would be NaN, so short-circuit.
    if m == NEG_INF:
        return NEG_INF
    return m + log(exp(x1 - m) + exp(x2 - m))
cdef DTYPE_t logsubexp(DTYPE_t x1, DTYPE_t x2):
    """Return log(exp(x1) - exp(x2)), or -inf when the difference is <= 0."""
    # Guard clause: a non-positive difference has no (real) logarithm.
    if x1 <= x2:
        return NEG_INF
    return x1 + log(1 - exp(x2 - x1))
######################################################################
# Kernel functions
#
# Note: Kernels assume dist is non-negative and h is positive
# All kernel functions are normalized such that K(0, h) = 1.
# The fully normalized kernel is:
# K = exp[kernel_norm(h, d, kernel) + compute_kernel(dist, h, kernel)]
# The code only works with non-negative kernels: i.e. K(d, h) >= 0
# for all valid d and h. Note that for precision, the log of both
# the kernel and kernel norm is returned.
# Integer codes for the supported kernels; compute_log_kernel and
# _log_kernel_norm dispatch on these values.  kernel_norm() maps the
# user-facing string names onto these codes.
cdef enum KernelType:
    GAUSSIAN_KERNEL = 1
    TOPHAT_KERNEL = 2
    EPANECHNIKOV_KERNEL = 3
    EXPONENTIAL_KERNEL = 4
    LINEAR_KERNEL = 5
    COSINE_KERNEL = 6
cdef inline DTYPE_t log_gaussian_kernel(DTYPE_t dist, DTYPE_t h):
    """log of the gaussian kernel for bandwidth h (unnormalized):
    -0.5 * (dist / h)^2; normalization is added by _log_kernel_norm."""
    return -0.5 * (dist * dist) / (h * h)
cdef inline DTYPE_t log_tophat_kernel(DTYPE_t dist, DTYPE_t h):
    """Unnormalized log of the top-hat kernel for bandwidth h:
    log(1) = 0 inside the bandwidth, -inf (kernel value 0) outside."""
    if dist < h:
        return 0.0
    return NEG_INF
cdef inline DTYPE_t log_epanechnikov_kernel(DTYPE_t dist, DTYPE_t h):
    """Unnormalized log of the Epanechnikov kernel for bandwidth h:
    log(1 - (dist/h)^2) inside the bandwidth, -inf outside."""
    if dist < h:
        return log(1.0 - (dist * dist) / (h * h))
    return NEG_INF
cdef inline DTYPE_t log_exponential_kernel(DTYPE_t dist, DTYPE_t h):
    """log of the exponential kernel for bandwidth h (unnormalized):
    simply -dist / h."""
    return -dist / h
cdef inline DTYPE_t log_linear_kernel(DTYPE_t dist, DTYPE_t h):
    """Unnormalized log of the linear (triangular) kernel for bandwidth h:
    log(1 - dist/h) inside the bandwidth, -inf outside."""
    if dist < h:
        return log(1 - dist / h)
    return NEG_INF
cdef inline DTYPE_t log_cosine_kernel(DTYPE_t dist, DTYPE_t h):
    """Unnormalized log of the cosine kernel for bandwidth h:
    log(cos(pi/2 * dist/h)) inside the bandwidth, -inf outside."""
    if dist < h:
        return log(cos(0.5 * PI * dist / h))
    return NEG_INF
cdef inline DTYPE_t compute_log_kernel(DTYPE_t dist, DTYPE_t h,
                                       KernelType kernel):
    """Given a KernelType enumeration, compute the appropriate log-kernel.

    NOTE(review): there is no final ``else`` — for an unrecognized kernel
    code the function falls off the end, which in a ``cdef DTYPE_t``
    function yields 0 (i.e. a kernel value of 1).  Callers such as
    kernel_norm() validate the kernel name before reaching this point —
    confirm all call sites do the same.
    """
    if kernel == GAUSSIAN_KERNEL:
        return log_gaussian_kernel(dist, h)
    elif kernel == TOPHAT_KERNEL:
        return log_tophat_kernel(dist, h)
    elif kernel == EPANECHNIKOV_KERNEL:
        return log_epanechnikov_kernel(dist, h)
    elif kernel == EXPONENTIAL_KERNEL:
        return log_exponential_kernel(dist, h)
    elif kernel == LINEAR_KERNEL:
        return log_linear_kernel(dist, h)
    elif kernel == COSINE_KERNEL:
        return log_cosine_kernel(dist, h)
#------------------------------------------------------------
# Kernel norms are defined via the volume element V_n
# and surface element S_(n-1) of an n-sphere.
cdef DTYPE_t logVn(ITYPE_t n):
    """log of the volume of the unit n-ball: V_n = pi^(n/2) / gamma(n/2 + 1).

    (The previous docstring said gamma(n/2 - 1); the code — correctly —
    uses lgamma(0.5 * n + 1).)
    """
    return 0.5 * n * LOG_PI - lgamma(0.5 * n + 1)
cdef DTYPE_t logSn(ITYPE_t n):
    """log of the surface area of the unit n-sphere: S_n = 2 * pi * V_(n-1),
    which follows from the relation V_(n+1) = int_0^1 S_n r^n dr."""
    return LOG_2PI + logVn(n - 1)
cdef DTYPE_t _log_kernel_norm(DTYPE_t h, ITYPE_t d,
                              KernelType kernel) except -1:
    """Given a KernelType enumeration, compute the kernel normalization.

    h is the bandwidth, d is the dimension.  Returns the log of the
    normalization constant, i.e. -log(integral of the unnormalized kernel
    over R^d), so that exp(norm + log_kernel) integrates to 1.

    Raises ValueError for an unrecognized kernel code.
    """
    cdef DTYPE_t tmp, factor = 0
    cdef ITYPE_t k
    if kernel == GAUSSIAN_KERNEL:
        # log of (2*pi)^(d/2)
        factor = 0.5 * d * LOG_2PI
    elif kernel == TOPHAT_KERNEL:
        # volume of the unit d-ball
        factor = logVn(d)
    elif kernel == EPANECHNIKOV_KERNEL:
        factor = logVn(d) + log(2. / (d + 2.))
    elif kernel == EXPONENTIAL_KERNEL:
        factor = logSn(d - 1) + lgamma(d)
    elif kernel == LINEAR_KERNEL:
        factor = logVn(d) - log(d + 1.)
    elif kernel == COSINE_KERNEL:
        # this is derived from a chain rule integration:
        # alternating finite series accumulated in ``factor`` (note the
        # sign flips and descending coefficients — the loop order matters).
        factor = 0
        tmp = 2. / PI
        for k in range(1, d + 1, 2):
            factor += tmp
            tmp *= -(d - k) * (d - k - 1) * (2. / PI) ** 2
        factor = log(factor) + logSn(d - 1)
    else:
        raise ValueError("Kernel code not recognized")
    # scale the d-dimensional volume by the bandwidth: h^d
    return -factor - d * log(h)
def kernel_norm(h, d, kernel, return_log=False):
    """Given a string specification of a kernel, compute the normalization.

    Parameters
    ----------
    h : float
        the bandwidth of the kernel
    d : int
        the dimension of the space in which the kernel norm is computed
    kernel : string
        The kernel identifier.  Must be one of
        ['gaussian'|'tophat'|'epanechnikov'|
         'exponential'|'linear'|'cosine']
    return_log : boolean
        if True, return the log of the kernel norm.  Otherwise, return the
        kernel norm.

    Returns
    -------
    knorm or log_knorm : float
        the kernel norm or logarithm of the kernel norm.
    """
    # Dispatch table from string identifiers to KernelType codes.
    kernel_code = {'gaussian': GAUSSIAN_KERNEL,
                   'tophat': TOPHAT_KERNEL,
                   'epanechnikov': EPANECHNIKOV_KERNEL,
                   'exponential': EXPONENTIAL_KERNEL,
                   'linear': LINEAR_KERNEL,
                   'cosine': COSINE_KERNEL}.get(kernel)
    if kernel_code is None:
        raise ValueError('kernel not recognized')
    log_knorm = _log_kernel_norm(h, d, kernel_code)
    if return_log:
        return log_knorm
    return np.exp(log_knorm)
######################################################################
# Tree Utility Routines
cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2):
    """Exchange the elements of arr at positions i1 and i2."""
    cdef DITYPE_t hold = arr[i2]
    arr[i2] = arr[i1]
    arr[i1] = hold
cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr,
                           ITYPE_t i1, ITYPE_t i2) nogil:
    """Exchange positions i1 and i2 in both darr and iarr simultaneously."""
    cdef DTYPE_t dhold = darr[i2]
    cdef ITYPE_t ihold = iarr[i2]
    darr[i2] = darr[i1]
    iarr[i2] = iarr[i1]
    darr[i1] = dhold
    iarr[i1] = ihold
cdef class NeighborsHeap:
    """A max-heap structure to keep track of distances/indices of neighbors

    This implements an efficient pre-allocated set of fixed-size heaps
    for chasing neighbors, holding both an index and a distance.
    When any row of the heap is full, adding an additional point will push
    the furthest point off the heap.

    Parameters
    ----------
    n_pts : int
        the number of heaps to use
    n_nbrs : int
        the size of each heap.
    """
    # The ndarrays own the memory; the typed memoryviews alias it for
    # fast nogil access.  Row i of each array is the heap for point i,
    # with the largest distance stored at column 0 (max-heap root).
    cdef np.ndarray distances_arr
    cdef np.ndarray indices_arr
    cdef DTYPE_t[:, ::1] distances
    cdef ITYPE_t[:, ::1] indices

    def __cinit__(self):
        # Allocate 1x1 placeholders so the memoryviews are always valid,
        # even if __init__ never runs (e.g. during unpickling).
        self.distances_arr = np.zeros((1, 1), dtype=DTYPE, order='C')
        self.indices_arr = np.zeros((1, 1), dtype=ITYPE, order='C')
        self.distances = get_memview_DTYPE_2D(self.distances_arr)
        self.indices = get_memview_ITYPE_2D(self.indices_arr)

    def __init__(self, n_pts, n_nbrs):
        # Distances start at +inf so any real distance is accepted by _push.
        self.distances_arr = np.full((n_pts, n_nbrs), np.inf, dtype=DTYPE,
                                     order='C')
        self.indices_arr = np.zeros((n_pts, n_nbrs), dtype=ITYPE, order='C')
        self.distances = get_memview_DTYPE_2D(self.distances_arr)
        self.indices = get_memview_ITYPE_2D(self.indices_arr)

    def get_arrays(self, sort=True):
        """Get the arrays of distances and indices within the heap.

        If sort=True, then simultaneously sort the indices and distances,
        so the closer points are listed first.
        """
        if sort:
            self._sort()
        return self.distances_arr, self.indices_arr

    cdef inline DTYPE_t largest(self, ITYPE_t row) nogil except -1:
        """Return the largest distance in the given row (the max-heap root)."""
        return self.distances[row, 0]

    def push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val):
        # Python-accessible wrapper around the C-level _push.
        return self._push(row, val, i_val)

    cdef int _push(self, ITYPE_t row, DTYPE_t val,
                   ITYPE_t i_val) nogil except -1:
        """push (val, i_val) into the given row"""
        cdef ITYPE_t i, ic1, ic2, i_swap
        cdef ITYPE_t size = self.distances.shape[1]
        cdef DTYPE_t* dist_arr = &self.distances[row, 0]
        cdef ITYPE_t* ind_arr = &self.indices[row, 0]

        # check if val should be in heap: anything not smaller than the
        # current maximum (root) is rejected.
        if val > dist_arr[0]:
            return 0

        # insert val at position zero, overwriting the evicted maximum
        dist_arr[0] = val
        ind_arr[0] = i_val

        # descend the heap, swapping values until the max heap criterion is
        # met.  Children of node i live at 2*i+1 and 2*i+2.
        i = 0
        while True:
            ic1 = 2 * i + 1
            ic2 = ic1 + 1

            if ic1 >= size:
                # no children: done
                break
            elif ic2 >= size:
                # only the left child exists
                if dist_arr[ic1] > val:
                    i_swap = ic1
                else:
                    break
            elif dist_arr[ic1] >= dist_arr[ic2]:
                # left child is the larger: candidate for swap
                if val < dist_arr[ic1]:
                    i_swap = ic1
                else:
                    break
            else:
                # right child is the larger: candidate for swap
                if val < dist_arr[ic2]:
                    i_swap = ic2
                else:
                    break

            dist_arr[i] = dist_arr[i_swap]
            ind_arr[i] = ind_arr[i_swap]
            i = i_swap

        dist_arr[i] = val
        ind_arr[i] = i_val
        return 0

    cdef int _sort(self) except -1:
        """simultaneously sort the distances and indices, row by row"""
        cdef DTYPE_t[:, ::1] distances = self.distances
        cdef ITYPE_t[:, ::1] indices = self.indices
        cdef ITYPE_t row
        for row in range(distances.shape[0]):
            _simultaneous_sort(&distances[row, 0],
                               &indices[row, 0],
                               distances.shape[1])
        return 0
cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx,
                            ITYPE_t size) nogil except -1:
    """
    Perform a recursive quicksort on the dist array, simultaneously
    performing the same swaps on the idx array.  The equivalent in
    numpy (though quite a bit slower) is

        def simultaneous_sort(dist, idx):
            i = np.argsort(dist)
            return dist[i], idx[i]
    """
    cdef ITYPE_t pivot_idx, i, store_idx
    cdef DTYPE_t pivot_val

    # in the small-array case, do things efficiently: sizes 1-3 are
    # handled with explicit compare-and-swap networks.
    if size <= 1:
        pass
    elif size == 2:
        if dist[0] > dist[1]:
            dual_swap(dist, idx, 0, 1)
    elif size == 3:
        if dist[0] > dist[1]:
            dual_swap(dist, idx, 0, 1)
        if dist[1] > dist[2]:
            dual_swap(dist, idx, 1, 2)
            if dist[0] > dist[1]:
                dual_swap(dist, idx, 0, 1)
    else:
        # Determine the pivot using the median-of-three rule.
        # The smallest of the three is moved to the beginning of the array,
        # the middle (the pivot value) is moved to the end, and the largest
        # is moved to the pivot index.
        # (On C integer operands Cython's ``/`` is integer division.)
        pivot_idx = size / 2
        if dist[0] > dist[size - 1]:
            dual_swap(dist, idx, 0, size - 1)
        if dist[size - 1] > dist[pivot_idx]:
            dual_swap(dist, idx, size - 1, pivot_idx)
            if dist[0] > dist[size - 1]:
                dual_swap(dist, idx, 0, size - 1)
        pivot_val = dist[size - 1]

        # partition indices about pivot.  At the end of this operation,
        # pivot_idx will contain the pivot value, everything to the left
        # will be smaller, and everything to the right will be larger.
        store_idx = 0
        for i in range(size - 1):
            if dist[i] < pivot_val:
                dual_swap(dist, idx, i, store_idx)
                store_idx += 1
        dual_swap(dist, idx, store_idx, size - 1)
        pivot_idx = store_idx

        # recursively sort each side of the pivot
        if pivot_idx > 1:
            _simultaneous_sort(dist, idx, pivot_idx)
        if pivot_idx + 2 < size:
            _simultaneous_sort(dist + pivot_idx + 1,
                               idx + pivot_idx + 1,
                               size - pivot_idx - 1)
    return 0
#------------------------------------------------------------
# find_node_split_dim:
#  this computes the equivalent of
#  j_max = np.argmax(np.max(data, 0) - np.min(data, 0))
cdef ITYPE_t find_node_split_dim(DTYPE_t* data,
                                 ITYPE_t* node_indices,
                                 ITYPE_t n_features,
                                 ITYPE_t n_points) except -1:
    """Find the dimension with the largest spread.

    Parameters
    ----------
    data : double pointer
        Pointer to a 2D array of the training data, of shape [N, n_features].
        N must be greater than any of the values in node_indices.
    node_indices : int pointer
        Pointer to a 1D array of length n_points.  This lists the indices of
        each of the points within the current node.
    n_features : int
        the number of features (columns) in ``data``
    n_points : int
        the number of entries in ``node_indices``

    Returns
    -------
    i_max : int
        The index of the feature (dimension) within the node that has the
        largest spread.

    Notes
    -----
    In numpy, this operation is equivalent to

        def find_node_split_dim(data, node_indices):
            return np.argmax(data[node_indices].max(0)
                             - data[node_indices].min(0))

    The cython version is much more efficient in both computation and memory.
    """
    cdef DTYPE_t min_val, max_val, val, spread, max_spread
    cdef ITYPE_t i, j, j_max

    j_max = 0
    max_spread = 0

    for j in range(n_features):
        # running min/max of column j over the node's points
        # NOTE(review): fmax/fmin are not cimported in this chunk; they are
        # assumed to be provided by the including .pyx — confirm.
        max_val = data[node_indices[0] * n_features + j]
        min_val = max_val
        for i in range(1, n_points):
            val = data[node_indices[i] * n_features + j]
            max_val = fmax(max_val, val)
            min_val = fmin(min_val, val)
        spread = max_val - min_val
        if spread > max_spread:
            max_spread = spread
            j_max = j
    return j_max
cdef int partition_node_indices(DTYPE_t* data,
                                ITYPE_t* node_indices,
                                ITYPE_t split_dim,
                                ITYPE_t split_index,
                                ITYPE_t n_features,
                                ITYPE_t n_points) except -1:
    """Partition points in the node into two equal-sized groups.

    Upon return, the values in node_indices will be rearranged such that
    (assuming numpy-style indexing):

        data[node_indices[0:split_index], split_dim]
          <= data[node_indices[split_index], split_dim]

    and

        data[node_indices[split_index], split_dim]
          <= data[node_indices[split_index:n_points], split_dim]

    The algorithm is essentially a partial in-place quicksort around a
    set pivot.

    Parameters
    ----------
    data : double pointer
        Pointer to a 2D array of the training data, of shape [N, n_features].
        N must be greater than any of the values in node_indices.
    node_indices : int pointer
        Pointer to a 1D array of length n_points.  This lists the indices of
        each of the points within the current node.  This will be modified
        in-place.
    split_dim : int
        the dimension on which to split.  This will usually be computed via
        the routine ``find_node_split_dim``
    split_index : int
        the index within node_indices around which to split the points.
    n_features : int
        the number of features (columns) in ``data``
    n_points : int
        the number of entries in ``node_indices``

    Returns
    -------
    status : int
        integer exit status.  On return, the contents of node_indices are
        modified as noted above.
    """
    cdef ITYPE_t left, right, midindex, i
    cdef DTYPE_t d1, d2
    left = 0
    right = n_points - 1

    while True:
        midindex = left
        # The pivot value (at position ``right``) is invariant over the
        # partitioning pass below — the inner swaps only touch positions
        # strictly left of ``right`` — so read it once instead of
        # re-reading it from memory on every iteration.
        d2 = data[node_indices[right] * n_features + split_dim]
        for i in range(left, right):
            d1 = data[node_indices[i] * n_features + split_dim]
            if d1 < d2:
                swap(node_indices, i, midindex)
                midindex += 1
        # move the pivot into its final position
        swap(node_indices, midindex, right)
        if midindex == split_index:
            break
        elif midindex < split_index:
            # the desired split is in the right partition
            left = midindex + 1
        else:
            # the desired split is in the left partition
            right = midindex - 1

    return 0
######################################################################
# NodeHeap : min-heap used to keep track of nodes during
# breadth-first query
cdef inline void swap_nodes(NodeHeapData_t* arr, ITYPE_t i1, ITYPE_t i2):
    """Exchange the heap entries of arr at positions i1 and i2."""
    cdef NodeHeapData_t hold = arr[i2]
    arr[i2] = arr[i1]
    arr[i1] = hold
cdef class NodeHeap:
    """NodeHeap

    This is a min-heap implementation for keeping track of nodes
    during a breadth-first search.  Unlike the NeighborsHeap above,
    the NodeHeap does not have a fixed size and must be able to grow
    as elements are added.

    Internally, the data is stored in a simple binary heap which meets
    the min heap condition:

        heap[i].val < min(heap[2 * i + 1].val, heap[2 * i + 2].val)
    """
    # data_arr owns the memory; ``data`` is a typed memoryview aliasing it.
    # ``n`` is the number of valid entries (may be < allocated capacity).
    cdef np.ndarray data_arr
    cdef NodeHeapData_t[::1] data
    cdef ITYPE_t n

    def __cinit__(self):
        # minimal placeholder so ``data`` is always a valid view
        self.data_arr = np.zeros(1, dtype=NodeHeapData, order='C')
        self.data = get_memview_NodeHeapData_1D(self.data_arr)

    def __init__(self, size_guess=100):
        size_guess = max(size_guess, 1)  # need space for at least one item
        self.data_arr = np.zeros(size_guess, dtype=NodeHeapData, order='C')
        self.data = get_memview_NodeHeapData_1D(self.data_arr)
        self.n = size_guess
        self.clear()

    cdef int resize(self, ITYPE_t new_size) except -1:
        """Resize the heap to be either larger or smaller"""
        cdef NodeHeapData_t *data_ptr
        cdef NodeHeapData_t *new_data_ptr
        cdef ITYPE_t i
        cdef ITYPE_t size = self.data.shape[0]
        cdef np.ndarray new_data_arr = np.zeros(new_size,
                                                dtype=NodeHeapData)
        cdef NodeHeapData_t[::1] new_data =\
                                    get_memview_NodeHeapData_1D(new_data_arr)

        # copy the surviving prefix into the new buffer
        if size > 0 and new_size > 0:
            data_ptr = &self.data[0]
            new_data_ptr = &new_data[0]
            for i in range(min(size, new_size)):
                new_data_ptr[i] = data_ptr[i]

        # NOTE(review): on shrink this sets n = new_size even if fewer than
        # new_size entries were valid; in this file resize() is only called
        # from push() to grow, so the shrink path looks unexercised — confirm.
        if new_size < size:
            self.n = new_size

        self.data = new_data
        self.data_arr = new_data_arr
        return 0

    cdef int push(self, NodeHeapData_t data) except -1:
        """Push a new item onto the heap"""
        cdef ITYPE_t i, i_parent
        cdef NodeHeapData_t* data_arr
        self.n += 1
        if self.n > self.data.shape[0]:
            # grow geometrically to amortize the copies
            self.resize(2 * self.n)

        # put the new element at the end,
        # and then perform swaps until the heap is in order
        # (sift-up: parent of i is (i - 1) // 2)
        data_arr = &self.data[0]
        i = self.n - 1
        data_arr[i] = data

        while i > 0:
            i_parent = (i - 1) // 2
            if data_arr[i_parent].val <= data_arr[i].val:
                break
            else:
                swap_nodes(data_arr, i, i_parent)
                i = i_parent
        return 0

    cdef NodeHeapData_t peek(self):
        """Peek at the root of the heap, without removing it"""
        return self.data[0]

    cdef NodeHeapData_t pop(self):
        """Remove the root of the heap, and update the remaining nodes"""
        if self.n == 0:
            raise ValueError('cannot pop on empty heap')

        cdef ITYPE_t i, i_child1, i_child2, i_swap
        cdef NodeHeapData_t* data_arr = &self.data[0]
        cdef NodeHeapData_t popped_element = data_arr[0]

        # pop off the first element, move the last element to the front,
        # and then perform swaps until the heap is back in order
        # (sift-down: always descend toward the smaller child)
        data_arr[0] = data_arr[self.n - 1]
        self.n -= 1

        i = 0
        while (i < self.n):
            i_child1 = 2 * i + 1
            i_child2 = 2 * i + 2
            i_swap = 0
            if i_child2 < self.n:
                if data_arr[i_child1].val <= data_arr[i_child2].val:
                    i_swap = i_child1
                else:
                    i_swap = i_child2
            elif i_child1 < self.n:
                i_swap = i_child1
            else:
                break

            # ``<=`` also swaps equal values; that is a harmless no-op
            # for heap order and the loop still terminates (i increases).
            if (i_swap > 0) and (data_arr[i_swap].val <= data_arr[i].val):
                swap_nodes(data_arr, i, i_swap)
                i = i_swap
            else:
                break

        return popped_element

    cdef void clear(self):
        """Clear the heap (capacity is retained; only the count is reset)"""
        self.n = 0
######################################################################
# newObj function
# this is a helper function for pickling
def newObj(obj):
    """Pickling helper: return a bare, uninitialized instance of *obj*.

    ``__init__`` is deliberately bypassed; unpickling restores state
    separately via __setstate__.
    """
    cls = obj
    return cls.__new__(cls)