From 8a06c3f515c51ef351f43222a1050d83a066727d Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 09:34:57 +0200
Subject: [PATCH 01/26] Rectify test

---
 sklearn/neighbors/tests/test_neighbors.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index ee9e92b0347ee..ca16f373212bc 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1801,15 +1801,11 @@ def test_pairwise_deprecated(NearestNeighbors):
 @pytest.mark.parametrize("d", [5, 10, 100])
 @pytest.mark.parametrize("ratio_train_test", [10, 2, 1, 0.5])
 @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000])
-@pytest.mark.parametrize("chunk_size", [2 ** i for i in range(8, 11)])
-@pytest.mark.parametrize("strategy", ["chunk_on_train", "chunk_on_test"])
 def test_fast_sqeuclidean_correctness(
     n,
     d,
     ratio_train_test,
     n_neighbors,
-    chunk_size,
-    strategy,
     dtype=np.float64,
 ):
     # The fast squared euclidean strategy must return results

From 41bd644d2364246f5220d830e72e07fbe48e759b Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 09:40:24 +0200
Subject: [PATCH 02/26] [WIP] Adapting to use class hierarchy

---
 sklearn/metrics/_argkmin_fast.pyx | 750 +++++++++++++++---------------
 sklearn/metrics/pairwise.py       |   6 +-
 sklearn/neighbors/_base.py        |   4 +-
 3 files changed, 381 insertions(+), 379 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index 46549816d3b1b..e1367012bdffa 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -38,286 +38,6 @@ from ..utils._typedefs cimport ITYPE_t
 from ..utils._typedefs import ITYPE
 
 
-### argkmin helpers
-
-cdef void _argkmin_on_chunk(
-    floating[:, ::1] X_c,                  # IN
-    floating[:, ::1] Y_c,                  # IN
-    floating[::1] Y_sq_norms,              # IN
-    floating *dist_middle_terms,           # IN
-    floating *heaps_red_distances,         # IN/OUT
-    ITYPE_t *heaps_indices,                # IN/OUT
-    ITYPE_t k,                             # IN
-    # ID of the first element of Y_c
-    ITYPE_t Y_idx_offset,
-) nogil:
-    """
-    Critical part of the computation of pairwise distances.
-
-    "Fast Squared Euclidean" distances strategy relying
-    on the gemm-trick.
-    """
-    cdef:
-        ITYPE_t i, j
-    # Instead of computing the full pairwise squared distances matrix,
-    # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
-    # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
-    # term since the argmin for a given sample X_c^{i} does not depend on
-    # ||X_c^{i}||²
-
-    # Careful: LDA, LDB and LDC are given for F-ordered arrays.
-    # Here, we use their counterpart values as indicated in the documentation.
-    # See the documentation of parameters here:
-    # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
-    #
-    # dist_middle_terms = -2 * X_c.dot(Y_c.T)
-    _gemm(RowMajor, NoTrans, Trans,
-          X_c.shape[0], Y_c.shape[0], X_c.shape[1],
-          -2.0,
-          &X_c[0, 0], X_c.shape[1],
-          &Y_c[0, 0], X_c.shape[1], 0.0,
-          dist_middle_terms, Y_c.shape[0])
-
-    # Computing argmins here
-    for i in range(X_c.shape[0]):
-        for j in range(Y_c.shape[0]):
-            _push(heaps_red_distances + i * k,
-                  heaps_indices + i * k,
-                  k,
-                  # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                  dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j],
-                  j + Y_idx_offset)
-
-
-
-cdef int _argkmin_on_X(
-    floating[:, ::1] X,                       # IN
-    floating[:, ::1] Y,                       # IN
-    floating[::1] Y_sq_norms,                 # IN
-    ITYPE_t chunk_size,                       # IN
-    ITYPE_t effective_n_threads,              # IN
-    ITYPE_t[:, ::1] argkmin_indices,          # OUT
-    floating[:, ::1] argkmin_red_distances,   # OUT
-) nogil:
-    """Computes the argkmin of each vector (row) of X on Y
-    by parallelising computation on chunks of X.
-    """
-    cdef:
-        ITYPE_t k = argkmin_indices.shape[1]
-        ITYPE_t d = X.shape[1]
-        ITYPE_t sf = sizeof(floating)
-        ITYPE_t si = sizeof(ITYPE_t)
-        ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
-
-        ITYPE_t n_train = Y.shape[0]
-        ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
-        ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
-        ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
-
-        ITYPE_t n_test = X.shape[0]
-        ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
-        ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
-        ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
-
-        # Counting remainder chunk in total number of chunks
-        ITYPE_t Y_n_chunks = Y_n_full_chunks + (
-            n_train != (Y_n_full_chunks * Y_n_samples_chunk)
-        )
-
-        ITYPE_t X_n_chunks = X_n_full_chunks + (
-            n_test != (X_n_full_chunks * X_n_samples_chunk)
-        )
-
-        ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
-
-        ITYPE_t Y_start, Y_end, X_start, X_end
-        ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
-
-        floating *dist_middle_terms_chunks
-        floating *heaps_red_distances_chunks
-
-
-    with nogil, parallel(num_threads=num_threads):
-        # Thread local buffers
-
-        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-        dist_middle_terms_chunks = <floating*> malloc(Y_n_samples_chunk * X_n_samples_chunk * sf)
-        heaps_red_distances_chunks = <floating*> malloc(X_n_samples_chunk * k * sf)
-
-        for X_chunk_idx in prange(X_n_chunks, schedule='static'):
-            # We reset the heap between X chunks (memset isn't suitable here)
-            for idx in range(X_n_samples_chunk * k):
-                heaps_red_distances_chunks[idx] = FLOAT_INF
-
-            X_start = X_chunk_idx * X_n_samples_chunk
-            if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
-                X_end = X_start + X_n_samples_rem
-            else:
-                X_end = X_start + X_n_samples_chunk
-
-            for Y_chunk_idx in range(Y_n_chunks):
-                Y_start = Y_chunk_idx * Y_n_samples_chunk
-                if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0:
-                    Y_end = Y_start + Y_n_samples_rem
-                else:
-                    Y_end = Y_start + Y_n_samples_chunk
-
-                _argkmin_on_chunk(
-                    X[X_start:X_end, :],
-                    Y[Y_start:Y_end, :],
-                    Y_sq_norms[Y_start:Y_end],
-                    dist_middle_terms_chunks,
-                    heaps_red_distances_chunks,
-                    &argkmin_indices[X_start, 0],
-                    k,
-                    Y_start
-                )
-
-            # Sorting indices so that the closests' come first.
-            for idx in range(X_end - X_start):
-                _simultaneous_sort(
-                    heaps_red_distances_chunks + idx * k,
-                    &argkmin_indices[X_start + idx, 0],
-                    k
-                )
-
-        # end: for X_chunk_idx
-        free(dist_middle_terms_chunks)
-        free(heaps_red_distances_chunks)
-
-    # end: with nogil, parallel
-    return X_n_chunks
-
-
-cdef int _argkmin_on_Y(
-    floating[:, ::1] X,                       # IN
-    floating[:, ::1] Y,                       # IN
-    floating[::1] Y_sq_norms,                 # IN
-    ITYPE_t chunk_size,                       # IN
-    ITYPE_t effective_n_threads,              # IN
-    ITYPE_t[:, ::1] argkmin_indices,          # OUT
-    floating[:, ::1] argkmin_red_distances,   # OUT
-) nogil:
-    """Computes the argkmin of each vector (row) of X on Y
-    by parallelising computation on chunks of Y.
-
-    This parallelisation strategy is more costly (as we need
-    extra heaps and synchronisation), yet it is useful in
-    most contexts.
-    """
-    cdef:
-        ITYPE_t k = argkmin_indices.shape[1]
-        ITYPE_t d = X.shape[1]
-        ITYPE_t sf = sizeof(floating)
-        ITYPE_t si = sizeof(ITYPE_t)
-        ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
-
-        ITYPE_t n_train = Y.shape[0]
-        ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
-        ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
-        ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
-
-        ITYPE_t n_test = X.shape[0]
-        ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
-        ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
-        ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
-
-        # Counting remainder chunk in total number of chunks
-        ITYPE_t Y_n_chunks = Y_n_full_chunks + (
-            n_train != (Y_n_full_chunks * Y_n_samples_chunk)
-        )
-
-        ITYPE_t X_n_chunks = X_n_full_chunks + (
-            n_test != (X_n_full_chunks * X_n_samples_chunk)
-        )
-
-        ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
-
-        ITYPE_t Y_start, Y_end, X_start, X_end
-        ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
-
-        floating *dist_middle_terms_chunks
-        floating *heaps_red_distances_chunks
-
-        # As chunks of X are shared across threads, so must their
-        # heaps. To solve this, each thread has its own locals
-        # heaps which are then synchronised back in the main ones.
-        ITYPE_t *heaps_indices_chunks
-
-    for X_chunk_idx in range(X_n_chunks):
-        X_start = X_chunk_idx * X_n_samples_chunk
-        if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
-            X_end = X_start + X_n_samples_rem
-        else:
-            X_end = X_start + X_n_samples_chunk
-
-        with nogil, parallel(num_threads=num_threads):
-            # Thread local buffers
-
-            # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            dist_middle_terms_chunks = <floating*> malloc(
-                Y_n_samples_chunk * X_n_samples_chunk * sf)
-            heaps_red_distances_chunks = <floating*> malloc(
-                X_n_samples_chunk * k * sf)
-            heaps_indices_chunks = <ITYPE_t*> malloc(
-                X_n_samples_chunk * k * sf)
-
-            # Initialising heaps (memset can't be used here)
-            for idx in range(X_n_samples_chunk * k):
-                heaps_red_distances_chunks[idx] = FLOAT_INF
-                heaps_indices_chunks[idx] = -1
-
-            for Y_chunk_idx in prange(Y_n_chunks, schedule='static'):
-                Y_start = Y_chunk_idx * Y_n_samples_chunk
-                if Y_chunk_idx == Y_n_chunks - 1 \
-                    and Y_n_samples_rem > 0:
-                    Y_end = Y_start + Y_n_samples_rem
-                else:
-                    Y_end = Y_start + Y_n_samples_chunk
-
-                _argkmin_on_chunk(
-                    X[X_start:X_end, :],
-                    Y[Y_start:Y_end, :],
-                    Y_sq_norms[Y_start:Y_end],
-                    dist_middle_terms_chunks,
-                    heaps_red_distances_chunks,
-                    heaps_indices_chunks,
-                    k,
-                    Y_start,
-                )
-
-            # end: for Y_chunk_idx
-            with gil:
-                # Synchronising the thread local heaps
-                # with the main heaps
-                for idx in range(X_end - X_start):
-                    for jdx in range(k):
-                        _push(
-                            &argkmin_red_distances[X_start + idx, 0],
-                            &argkmin_indices[X_start + idx, 0],
-                            k,
-                            heaps_red_distances_chunks[idx * k + jdx],
-                            heaps_indices_chunks[idx * k + jdx],
-                        )
-
-            free(dist_middle_terms_chunks)
-            free(heaps_red_distances_chunks)
-            free(heaps_indices_chunks)
-
-        # end: with nogil, parallel
-        # Sorting indices of the argkmin for each query vector of X
-        for idx in prange(n_test,schedule='static',
-                          nogil=True, num_threads=num_threads):
-            _simultaneous_sort(
-                &argkmin_red_distances[idx, 0],
-                &argkmin_indices[idx, 0],
-                k,
-            )
-        # end: prange
-
-    # end: for X_chunk_idx
-    return Y_n_chunks
-
 cdef inline floating _euclidean_dist(
     floating[:, ::1] X,
     floating[:, ::1] Y,
@@ -368,100 +88,380 @@ cdef int _exact_euclidean_dist(
                                               Y_indices[i, k])
 
 
-# Python interface
+cdef class ArgKmin:
+
+    cdef void _argkmin_on_chunk(self,
+        floating[:, ::1] X_c,                  # IN
+        floating[:, ::1] Y_c,                  # IN
+        floating[::1] Y_sq_norms,              # IN
+        floating *dist_middle_terms,           # IN
+        floating *heaps_red_distances,         # IN/OUT
+        ITYPE_t *heaps_indices,                # IN/OUT
+        ITYPE_t k,                             # IN
+        # ID of the first element of Y_c
+        ITYPE_t Y_idx_offset,
+    ) nogil:
+        """
+        Critical part of the computation of pairwise distances.
+
+        "Fast Squared Euclidean" distances strategy relying
+        on the gemm-trick.
+        """
+        cdef:
+            ITYPE_t i, j
+        # Instead of computing the full pairwise squared distances matrix,
+        # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
+        # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
+        # term since the argmin for a given sample X_c^{i} does not depend on
+        # ||X_c^{i}||²
+
+        # Careful: LDA, LDB and LDC are given for F-ordered arrays.
+        # Here, we use their counterpart values as indicated in the documentation.
+        # See the documentation of parameters here:
+        # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
+        #
+        # dist_middle_terms = -2 * X_c.dot(Y_c.T)
+        _gemm(RowMajor, NoTrans, Trans,
+              X_c.shape[0], Y_c.shape[0], X_c.shape[1],
+              -2.0,
+              &X_c[0, 0], X_c.shape[1],
+              &Y_c[0, 0], X_c.shape[1], 0.0,
+              dist_middle_terms, Y_c.shape[0])
+
+        # Computing argmins here
+        for i in range(X_c.shape[0]):
+            for j in range(Y_c.shape[0]):
+                _push(heaps_red_distances + i * k,
+                      heaps_indices + i * k,
+                      k,
+                      # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                      dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j],
+                      j + Y_idx_offset)
+
+
+
+    cdef int _argkmin_on_X(self,
+        floating[:, ::1] X,                       # IN
+        floating[:, ::1] Y,                       # IN
+        floating[::1] Y_sq_norms,                 # IN
+        ITYPE_t chunk_size,                       # IN
+        ITYPE_t effective_n_threads,              # IN
+        ITYPE_t[:, ::1] argkmin_indices,          # OUT
+        floating[:, ::1] argkmin_red_distances,   # OUT
+    ) nogil:
+        """Computes the argkmin of each vector (row) of X on Y
+        by parallelising computation on chunks of X.
+        """
+        cdef:
+            ITYPE_t k = argkmin_indices.shape[1]
+            ITYPE_t d = X.shape[1]
+            ITYPE_t sf = sizeof(floating)
+            ITYPE_t si = sizeof(ITYPE_t)
+            ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
+
+            ITYPE_t n_train = Y.shape[0]
+            ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
+            ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
+            ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
+
+            ITYPE_t n_test = X.shape[0]
+            ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
+            ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
+            ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
+
+            # Counting remainder chunk in total number of chunks
+            ITYPE_t Y_n_chunks = Y_n_full_chunks + (
+                n_train != (Y_n_full_chunks * Y_n_samples_chunk)
+            )
+
+            ITYPE_t X_n_chunks = X_n_full_chunks + (
+                n_test != (X_n_full_chunks * X_n_samples_chunk)
+            )
 
-def _argkmin(
-    floating[:, ::1] X,
-    floating[:, ::1] Y,
-    ITYPE_t k,
-    ITYPE_t chunk_size = CHUNK_SIZE,
-    str strategy = "auto",
-    bint return_distance = False,
-):
-    """Computes the argkmin of vectors (rows) of X on Y for
-    the euclidean distance.
-
-    The implementation is parallelised on chunks whose size can
-    be set using ``chunk_size``.
-
-    Parameters
-    ----------
-    X: ndarray of shape (n, d)
-        Rows represent vectors
-
-    Y: ndarray of shape (m, d)
-        Rows represent vectors
-
-    chunk_size: int
-        The number of vectors per chunk.
-
-    strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'}
-        The chunking strategy defining which dataset
-        parallelisation are made on.
-
-         - 'chunk_on_X' is embarassingly parallel but
-        is less used in practice.
-         - 'chunk_on_Y' comes with synchronisation but
-        is more useful in practice.
-         -'auto' relies on a simple heuristic to choose
-        between 'chunk_on_X' and 'chunk_on_Y'.
-
-    return_distance: boolean
-        Return distances between each X vectory and its
-        argkmin if set to True.
-
-    Returns
-    -------
-    distances: ndarray of shape (n, k)
-        Distances between each X vector and its argkmin
-        in Y. Only returned if ``return_distance=True``.
-
-    indices: ndarray of shape (n, k)
-        Indices of each X vector argkmin in Y.
-    """
-    int_dtype = np.intp
-    float_dtype = np.float32 if floating is float else np.float64
-    cdef:
-        ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0,
-                                               dtype=ITYPE)
-        floating[:, ::1] argkmin_distances = np.full((X.shape[0], k),
-                                                  FLOAT_INF,
-                                                  dtype=float_dtype)
-        floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
-        ITYPE_t effective_n_threads = _openmp_effective_n_threads()
-
-    if strategy == 'auto':
-        # This is a simple heuristic whose constant for the
-        # comparison has been chosen based on experiments.
-        if 4 * chunk_size * effective_n_threads < X.shape[0]:
-            strategy = 'chunk_on_X'
+            ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
+
+            ITYPE_t Y_start, Y_end, X_start, X_end
+            ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
+
+            floating *dist_middle_terms_chunks
+            floating *heaps_red_distances_chunks
+
+
+        with nogil, parallel(num_threads=num_threads):
+            # Thread local buffers
+
+            # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+            dist_middle_terms_chunks = <floating*> malloc(Y_n_samples_chunk * X_n_samples_chunk * sf)
+            heaps_red_distances_chunks = <floating*> malloc(X_n_samples_chunk * k * sf)
+
+            for X_chunk_idx in prange(X_n_chunks, schedule='static'):
+                # We reset the heap between X chunks (memset isn't suitable here)
+                for idx in range(X_n_samples_chunk * k):
+                    heaps_red_distances_chunks[idx] = FLOAT_INF
+
+                X_start = X_chunk_idx * X_n_samples_chunk
+                if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
+                    X_end = X_start + X_n_samples_rem
+                else:
+                    X_end = X_start + X_n_samples_chunk
+
+                for Y_chunk_idx in range(Y_n_chunks):
+                    Y_start = Y_chunk_idx * Y_n_samples_chunk
+                    if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0:
+                        Y_end = Y_start + Y_n_samples_rem
+                    else:
+                        Y_end = Y_start + Y_n_samples_chunk
+
+                    self._argkmin_on_chunk(
+                        X[X_start:X_end, :],
+                        Y[Y_start:Y_end, :],
+                        Y_sq_norms[Y_start:Y_end],
+                        dist_middle_terms_chunks,
+                        heaps_red_distances_chunks,
+                        &argkmin_indices[X_start, 0],
+                        k,
+                        Y_start
+                    )
+
+                # Sorting indices so that the closests' come first.
+                for idx in range(X_end - X_start):
+                    _simultaneous_sort(
+                        heaps_red_distances_chunks + idx * k,
+                        &argkmin_indices[X_start + idx, 0],
+                        k
+                    )
+
+            # end: for X_chunk_idx
+            free(dist_middle_terms_chunks)
+            free(heaps_red_distances_chunks)
+
+        # end: with nogil, parallel
+        return X_n_chunks
+
+
+    cdef int _argkmin_on_Y(self,
+        floating[:, ::1] X,                       # IN
+        floating[:, ::1] Y,                       # IN
+        floating[::1] Y_sq_norms,                 # IN
+        ITYPE_t chunk_size,                       # IN
+        ITYPE_t effective_n_threads,              # IN
+        ITYPE_t[:, ::1] argkmin_indices,          # OUT
+        floating[:, ::1] argkmin_red_distances,   # OUT
+    ) nogil:
+        """Computes the argkmin of each vector (row) of X on Y
+        by parallelising computation on chunks of Y.
+
+        This parallelisation strategy is more costly (as we need
+        extra heaps and synchronisation), yet it is useful in
+        most contexts.
+        """
+        cdef:
+            ITYPE_t k = argkmin_indices.shape[1]
+            ITYPE_t d = X.shape[1]
+            ITYPE_t sf = sizeof(floating)
+            ITYPE_t si = sizeof(ITYPE_t)
+            ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
+
+            ITYPE_t n_train = Y.shape[0]
+            ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
+            ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
+            ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
+
+            ITYPE_t n_test = X.shape[0]
+            ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
+            ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
+            ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
+
+            # Counting remainder chunk in total number of chunks
+            ITYPE_t Y_n_chunks = Y_n_full_chunks + (
+                n_train != (Y_n_full_chunks * Y_n_samples_chunk)
+            )
+
+            ITYPE_t X_n_chunks = X_n_full_chunks + (
+                n_test != (X_n_full_chunks * X_n_samples_chunk)
+            )
+
+            ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
+
+            ITYPE_t Y_start, Y_end, X_start, X_end
+            ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
+
+            floating *dist_middle_terms_chunks
+            floating *heaps_red_distances_chunks
+
+            # As chunks of X are shared across threads, so must their
+            # heaps. To solve this, each thread has its own locals
+            # heaps which are then synchronised back in the main ones.
+            ITYPE_t *heaps_indices_chunks
+
+        for X_chunk_idx in range(X_n_chunks):
+            X_start = X_chunk_idx * X_n_samples_chunk
+            if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
+                X_end = X_start + X_n_samples_rem
+            else:
+                X_end = X_start + X_n_samples_chunk
+
+            with nogil, parallel(num_threads=num_threads):
+                # Thread local buffers
+
+                # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+                dist_middle_terms_chunks = <floating*> malloc(
+                    Y_n_samples_chunk * X_n_samples_chunk * sf)
+                heaps_red_distances_chunks = <floating*> malloc(
+                    X_n_samples_chunk * k * sf)
+                heaps_indices_chunks = <ITYPE_t*> malloc(
+                    X_n_samples_chunk * k * sf)
+
+                # Initialising heaps (memset can't be used here)
+                for idx in range(X_n_samples_chunk * k):
+                    heaps_red_distances_chunks[idx] = FLOAT_INF
+                    heaps_indices_chunks[idx] = -1
+
+                for Y_chunk_idx in prange(Y_n_chunks, schedule='static'):
+                    Y_start = Y_chunk_idx * Y_n_samples_chunk
+                    if Y_chunk_idx == Y_n_chunks - 1 \
+                        and Y_n_samples_rem > 0:
+                        Y_end = Y_start + Y_n_samples_rem
+                    else:
+                        Y_end = Y_start + Y_n_samples_chunk
+
+                    self._argkmin_on_chunk(
+                        X[X_start:X_end, :],
+                        Y[Y_start:Y_end, :],
+                        Y_sq_norms[Y_start:Y_end],
+                        dist_middle_terms_chunks,
+                        heaps_red_distances_chunks,
+                        heaps_indices_chunks,
+                        k,
+                        Y_start,
+                    )
+
+                # end: for Y_chunk_idx
+                with gil:
+                    # Synchronising the thread local heaps
+                    # with the main heaps
+                    for idx in range(X_end - X_start):
+                        for jdx in range(k):
+                            _push(
+                                &argkmin_red_distances[X_start + idx, 0],
+                                &argkmin_indices[X_start + idx, 0],
+                                k,
+                                heaps_red_distances_chunks[idx * k + jdx],
+                                heaps_indices_chunks[idx * k + jdx],
+                            )
+
+                free(dist_middle_terms_chunks)
+                free(heaps_red_distances_chunks)
+                free(heaps_indices_chunks)
+
+            # end: with nogil, parallel
+            # Sorting indices of the argkmin for each query vector of X
+            for idx in prange(n_test,schedule='static',
+                              nogil=True, num_threads=num_threads):
+                _simultaneous_sort(
+                    &argkmin_red_distances[idx, 0],
+                    &argkmin_indices[idx, 0],
+                    k,
+                )
+            # end: prange
+
+        # end: for X_chunk_idx
+        return Y_n_chunks
+
+    # Python interface
+
+    def _argkmin(self,
+        floating[:, ::1] X,
+        floating[:, ::1] Y,
+        ITYPE_t k,
+        ITYPE_t chunk_size = CHUNK_SIZE,
+        str strategy = "auto",
+        bint return_distance = False,
+    ):
+        """Computes the argkmin of vectors (rows) of X on Y for
+        the euclidean distance.
+
+        The implementation is parallelised on chunks whose size can
+        be set using ``chunk_size``.
+
+        Parameters
+        ----------
+        X: ndarray of shape (n, d)
+            Rows represent vectors
+
+        Y: ndarray of shape (m, d)
+            Rows represent vectors
+
+        chunk_size: int
+            The number of vectors per chunk.
+
+        strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'}
+            The chunking strategy defining which dataset
+            parallelisation are made on.
+
+             - 'chunk_on_X' is embarassingly parallel but
+            is less used in practice.
+             - 'chunk_on_Y' comes with synchronisation but
+            is more useful in practice.
+             -'auto' relies on a simple heuristic to choose
+            between 'chunk_on_X' and 'chunk_on_Y'.
+
+        return_distance: boolean
+            Return distances between each X vectory and its
+            argkmin if set to True.
+
+        Returns
+        -------
+        distances: ndarray of shape (n, k)
+            Distances between each X vector and its argkmin
+            in Y. Only returned if ``return_distance=True``.
+
+        indices: ndarray of shape (n, k)
+            Indices of each X vector argkmin in Y.
+        """
+        int_dtype = np.intp
+        float_dtype = np.float32 if floating is float else np.float64
+        cdef:
+            ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0,
+                                                   dtype=ITYPE)
+            floating[:, ::1] argkmin_distances = np.full((X.shape[0], k),
+                                                      FLOAT_INF,
+                                                      dtype=float_dtype)
+            floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
+            ITYPE_t effective_n_threads = _openmp_effective_n_threads()
+
+        if strategy == 'auto':
+            # This is a simple heuristic whose constant for the
+            # comparison has been chosen based on experiments.
+            if 4 * chunk_size * effective_n_threads < X.shape[0]:
+                strategy = 'chunk_on_X'
+            else:
+                strategy = 'chunk_on_Y'
+
+        if strategy == 'chunk_on_Y':
+            self._argkmin_on_Y(
+                X, Y, Y_sq_norms,
+                chunk_size, effective_n_threads,
+                argkmin_indices, argkmin_distances
+            )
+        elif strategy == 'chunk_on_X':
+            self._argkmin_on_X(
+                X, Y, Y_sq_norms,
+                chunk_size, effective_n_threads,
+                argkmin_indices, argkmin_distances
+            )
         else:
-            strategy = 'chunk_on_Y'
-
-    if strategy == 'chunk_on_Y':
-        _argkmin_on_Y(
-            X, Y, Y_sq_norms,
-            chunk_size, effective_n_threads,
-            argkmin_indices, argkmin_distances
-        )
-    elif strategy == 'chunk_on_X':
-        _argkmin_on_X(
-            X, Y, Y_sq_norms,
-            chunk_size, effective_n_threads,
-            argkmin_indices, argkmin_distances
-        )
-    else:
-        raise RuntimeError(f"strategy '{strategy}' not supported.")
-
-    if return_distance:
-        # We need to recompute distances because we relied on
-        # reduced distances using _gemm, which are missing a
-        # term for squarred norms and which are not the most
-        # precise (catastrophic cancellation might have happened).
-        _exact_euclidean_dist(X, Y, argkmin_indices,
-                              effective_n_threads,
-                              argkmin_distances)
-        return (np.asarray(argkmin_distances),
-                np.asarray(argkmin_indices))
-
-    return np.asarray(argkmin_indices)
+            raise RuntimeError(f"strategy '{strategy}' not supported.")
+
+        if return_distance:
+            # We need to recompute distances because we relied on
+            # reduced distances using _gemm, which are missing a
+            # term for squarred norms and which are not the most
+            # precise (catastrophic cancellation might have happened).
+            _exact_euclidean_dist(X, Y, argkmin_indices,
+                                  effective_n_threads,
+                                  argkmin_distances)
+            return (np.asarray(argkmin_distances),
+                    np.asarray(argkmin_indices))
+
+        return np.asarray(argkmin_indices)
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index a20c49a20346c..ba04685c07ab9 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -31,7 +31,7 @@
 from ..utils.fixes import delayed
 from ..utils.fixes import sp_version, parse_version
 
-from ._argkmin_fast import _argkmin
+from ._argkmin_fast import ArgKmin
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
 
@@ -648,7 +648,9 @@ def pairwise_distances_argmin_min(
 
     if metric == "fast_sqeuclidean":
         # TODO: generalise this simple plug here
-        values, indices = _argkmin(X, Y, k=1, strategy="auto", return_distance=True)
+        values, indices = ArgKmin()._argkmin(
+            X, Y, k=1, strategy="auto", return_distance=True
+        )
         values = np.ndarray.flatten(values)
         indices = np.ndarray.flatten(indices)
     else:
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index be561e0bd3f64..9f79be031b0a9 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -23,7 +23,7 @@
 from ..base import is_classifier
 from ..metrics import pairwise_distances_chunked
 from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
-from ..metrics._argkmin_fast import _argkmin
+from ..metrics._argkmin_fast import ArgKmin
 from ..utils import (
     check_array,
     gen_even_slices,
@@ -740,7 +740,7 @@ class from an array representing our data set and ask who's
             self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean"
         ):
             # TODO: generalise this simple plug here
-            results = _argkmin(
+            results = ArgKmin()._argkmin(
                 X,
                 Y=self._fit_X,
                 k=n_neighbors,

From 568ed2a40b78b55a853c5dbd6a9a4bc182997470 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 10:04:41 +0200
Subject: [PATCH 03/26] [WIP] Adapting to use class hierarchy

---
 sklearn/metrics/_argkmin_fast.pyx | 191 ++++++++++++++----------------
 sklearn/metrics/pairwise.py       |   4 +-
 sklearn/neighbors/_base.py        |   5 +-
 3 files changed, 94 insertions(+), 106 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index e1367012bdffa..b85aa1b94c388 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -4,16 +4,16 @@
 # cython: wraparound=False
 # cython: profile=False
 # cython: linetrace=False
+# cython: initializedcheck=False
 # cython: binding=False
 # distutils: define_macros=CYTHON_TRACE_NOGIL=0
 
 import numpy as np
 cimport numpy as np
 
-from libc.math cimport floor, sqrt
+from libc.math cimport sqrt
 from libc.stdlib cimport free, malloc
 
-from cython cimport floating
 from cython.parallel cimport parallel, prange
 
 DEF CHUNK_SIZE = 256  # number of vectors
@@ -34,18 +34,18 @@ from ..utils._cython_blas cimport (
 
 from ..utils._heap cimport _simultaneous_sort, _push
 from ..utils._openmp_helpers import _openmp_effective_n_threads
-from ..utils._typedefs cimport ITYPE_t
-from ..utils._typedefs import ITYPE
+from ..utils._typedefs cimport ITYPE_t, DTYPE_t
+from ..utils._typedefs import ITYPE, DTYPE
 
 
-cdef inline floating _euclidean_dist(
-    floating[:, ::1] X,
-    floating[:, ::1] Y,
+cdef inline DTYPE_t _euclidean_dist(
+    DTYPE_t[:, ::1] X,
+    DTYPE_t[:, ::1] Y,
     ITYPE_t i,
     ITYPE_t j,
 ) nogil:
     cdef:
-        floating dist = 0
+        DTYPE_t dist = 0
         ITYPE_t k
         ITYPE_t upper_unrolled_idx = (X.shape[1] // 4) * 4
 
@@ -62,11 +62,11 @@ cdef inline floating _euclidean_dist(
     return sqrt(dist)
 
 cdef int _exact_euclidean_dist(
-    floating[:, ::1] X,                  # IN
-    floating[:, ::1] Y,                  # IN
+    DTYPE_t[:, ::1] X,                  # IN
+    DTYPE_t[:, ::1] Y,                  # IN
     ITYPE_t[:, ::1] Y_indices,           # IN
     ITYPE_t effective_n_threads,         # IN
-    floating[:, ::1] distances,          # OUT
+    DTYPE_t[:, ::1] distances,          # OUT
 ) nogil:
     """
     Compute exact pairwise euclidean distances in parallel.
@@ -90,12 +90,33 @@ cdef int _exact_euclidean_dist(
 
 cdef class ArgKmin:
 
-    cdef void _argkmin_on_chunk(self,
-        floating[:, ::1] X_c,                  # IN
-        floating[:, ::1] Y_c,                  # IN
-        floating[::1] Y_sq_norms,              # IN
-        floating *dist_middle_terms,           # IN
-        floating *heaps_red_distances,         # IN/OUT
+    cdef:
+        ITYPE_t k
+        ITYPE_t chunk_size
+
+        DTYPE_t[:, ::1] X
+        DTYPE_t[:, ::1] Y
+
+        DTYPE_t[::1] Y_sq_norms
+
+    def __init__(self,
+                  DTYPE_t[:, ::1] X,
+                  DTYPE_t[:, ::1] Y,
+                  ITYPE_t k,
+                  ITYPE_t chunk_size = CHUNK_SIZE,
+    ):
+        self.X = X
+        self.Y = Y
+        self.k = k
+        self.chunk_size = chunk_size
+        self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
+
+    cdef void _reduce_on_chunks(self,
+        DTYPE_t[:, ::1] X_c,                  # IN
+        DTYPE_t[:, ::1] Y_c,                  # IN
+        DTYPE_t[::1] Y_sq_norms,              # IN
+        DTYPE_t *dist_middle_terms,           # IN
+        DTYPE_t *heaps_red_distances,         # IN/OUT
         ITYPE_t *heaps_indices,                # IN/OUT
         ITYPE_t k,                             # IN
         # ID of the first element of Y_c
@@ -139,32 +160,28 @@ cdef class ArgKmin:
                       j + Y_idx_offset)
 
 
-
-    cdef int _argkmin_on_X(self,
-        floating[:, ::1] X,                       # IN
-        floating[:, ::1] Y,                       # IN
-        floating[::1] Y_sq_norms,                 # IN
+    cdef int _parallel_on_X(self,
         ITYPE_t chunk_size,                       # IN
         ITYPE_t effective_n_threads,              # IN
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
-        floating[:, ::1] argkmin_red_distances,   # OUT
+        DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
         by parallelising computation on chunks of X.
         """
         cdef:
             ITYPE_t k = argkmin_indices.shape[1]
-            ITYPE_t d = X.shape[1]
-            ITYPE_t sf = sizeof(floating)
+            ITYPE_t d = self.X.shape[1]
+            ITYPE_t sf = sizeof(DTYPE_t)
             ITYPE_t si = sizeof(ITYPE_t)
             ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
 
-            ITYPE_t n_train = Y.shape[0]
+            ITYPE_t n_train = self.Y.shape[0]
             ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
             ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
             ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
 
-            ITYPE_t n_test = X.shape[0]
+            ITYPE_t n_test = self.X.shape[0]
             ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
             ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
             ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
@@ -183,16 +200,16 @@ cdef class ArgKmin:
             ITYPE_t Y_start, Y_end, X_start, X_end
             ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
 
-            floating *dist_middle_terms_chunks
-            floating *heaps_red_distances_chunks
+            DTYPE_t *dist_middle_terms_chunks
+            DTYPE_t *heaps_red_distances_chunks
 
 
         with nogil, parallel(num_threads=num_threads):
             # Thread local buffers
 
             # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            dist_middle_terms_chunks = <floating*> malloc(Y_n_samples_chunk * X_n_samples_chunk * sf)
-            heaps_red_distances_chunks = <floating*> malloc(X_n_samples_chunk * k * sf)
+            dist_middle_terms_chunks = <DTYPE_t*> malloc(Y_n_samples_chunk * X_n_samples_chunk * sf)
+            heaps_red_distances_chunks = <DTYPE_t*> malloc(X_n_samples_chunk * k * sf)
 
             for X_chunk_idx in prange(X_n_chunks, schedule='static'):
                 # We reset the heap between X chunks (memset isn't suitable here)
@@ -212,10 +229,10 @@ cdef class ArgKmin:
                     else:
                         Y_end = Y_start + Y_n_samples_chunk
 
-                    self._argkmin_on_chunk(
-                        X[X_start:X_end, :],
-                        Y[Y_start:Y_end, :],
-                        Y_sq_norms[Y_start:Y_end],
+                    self._reduce_on_chunks(
+                        self.X[X_start:X_end, :],
+                        self.Y[Y_start:Y_end, :],
+                        self.Y_sq_norms[Y_start:Y_end],
                         dist_middle_terms_chunks,
                         heaps_red_distances_chunks,
                         &argkmin_indices[X_start, 0],
@@ -239,14 +256,11 @@ cdef class ArgKmin:
         return X_n_chunks
 
 
-    cdef int _argkmin_on_Y(self,
-        floating[:, ::1] X,                       # IN
-        floating[:, ::1] Y,                       # IN
-        floating[::1] Y_sq_norms,                 # IN
+    cdef int _parallel_on_Y(self,
         ITYPE_t chunk_size,                       # IN
         ITYPE_t effective_n_threads,              # IN
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
-        floating[:, ::1] argkmin_red_distances,   # OUT
+        DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
         by parallelising computation on chunks of Y.
@@ -257,17 +271,17 @@ cdef class ArgKmin:
         """
         cdef:
             ITYPE_t k = argkmin_indices.shape[1]
-            ITYPE_t d = X.shape[1]
-            ITYPE_t sf = sizeof(floating)
+            ITYPE_t d = self.X.shape[1]
+            ITYPE_t sf = sizeof(DTYPE_t)
             ITYPE_t si = sizeof(ITYPE_t)
             ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
 
-            ITYPE_t n_train = Y.shape[0]
+            ITYPE_t n_train = self.Y.shape[0]
             ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
             ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
             ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
 
-            ITYPE_t n_test = X.shape[0]
+            ITYPE_t n_test = self.X.shape[0]
             ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
             ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
             ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
@@ -286,8 +300,8 @@ cdef class ArgKmin:
             ITYPE_t Y_start, Y_end, X_start, X_end
             ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
 
-            floating *dist_middle_terms_chunks
-            floating *heaps_red_distances_chunks
+            DTYPE_t *dist_middle_terms_chunks
+            DTYPE_t *heaps_red_distances_chunks
 
             # As chunks of X are shared across threads, so must their
             # heaps. To solve this, each thread has its own locals
@@ -305,9 +319,9 @@ cdef class ArgKmin:
                 # Thread local buffers
 
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-                dist_middle_terms_chunks = <floating*> malloc(
+                dist_middle_terms_chunks = <DTYPE_t*> malloc(
                     Y_n_samples_chunk * X_n_samples_chunk * sf)
-                heaps_red_distances_chunks = <floating*> malloc(
+                heaps_red_distances_chunks = <DTYPE_t*> malloc(
                     X_n_samples_chunk * k * sf)
                 heaps_indices_chunks = <ITYPE_t*> malloc(
                     X_n_samples_chunk * k * sf)
@@ -325,10 +339,10 @@ cdef class ArgKmin:
                     else:
                         Y_end = Y_start + Y_n_samples_chunk
 
-                    self._argkmin_on_chunk(
-                        X[X_start:X_end, :],
-                        Y[Y_start:Y_end, :],
-                        Y_sq_norms[Y_start:Y_end],
+                    self._reduce_on_chunks(
+                        self.X[X_start:X_end, :],
+                        self.Y[Y_start:Y_end, :],
+                        self.Y_sq_norms[Y_start:Y_end],
                         dist_middle_terms_chunks,
                         heaps_red_distances_chunks,
                         heaps_indices_chunks,
@@ -369,47 +383,28 @@ cdef class ArgKmin:
         return Y_n_chunks
 
     # Python interface
+    def compute(self,
+           str strategy = "auto",
+           bint return_distance = False
+           ):
+        """Computes the argkmin of vectors (rows) of X on Y.
 
-    def _argkmin(self,
-        floating[:, ::1] X,
-        floating[:, ::1] Y,
-        ITYPE_t k,
-        ITYPE_t chunk_size = CHUNK_SIZE,
-        str strategy = "auto",
-        bint return_distance = False,
-    ):
-        """Computes the argkmin of vectors (rows) of X on Y for
-        the euclidean distance.
-
-        The implementation is parallelised on chunks whose size can
-        be set using ``chunk_size``.
-
-        Parameters
-        ----------
-        X: ndarray of shape (n, d)
-            Rows represent vectors
-
-        Y: ndarray of shape (m, d)
-            Rows represent vectors
-
-        chunk_size: int
-            The number of vectors per chunk.
-
-        strategy: str, {'auto', 'chunk_on_X', 'chunk_on_Y'}
+        strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}
             The chunking strategy defining which dataset
             parallelisation are made on.
 
-             - 'chunk_on_X' is embarassingly parallel but
+             - 'parallel_on_X' is embarassingly parallel but
             is less used in practice.
-             - 'chunk_on_Y' comes with synchronisation but
+             - 'parallel_on_Y' comes with synchronisation but
             is more useful in practice.
              -'auto' relies on a simple heuristic to choose
-            between 'chunk_on_X' and 'chunk_on_Y'.
+            between 'parallel_on_X' and 'parallel_on_Y'.
 
         return_distance: boolean
             Return distances between each X vectory and its
             argkmin if set to True.
 
+
         Returns
         -------
         distances: ndarray of shape (n, k)
@@ -419,35 +414,31 @@ cdef class ArgKmin:
         indices: ndarray of shape (n, k)
             Indices of each X vector argkmin in Y.
         """
-        int_dtype = np.intp
-        float_dtype = np.float32 if floating is float else np.float64
         cdef:
-            ITYPE_t[:, ::1] argkmin_indices = np.full((X.shape[0], k), 0,
+            ITYPE_t n_X = self.X.shape[0]
+            ITYPE_t[:, ::1] argkmin_indices = np.full((n_X, self.k), 0,
                                                    dtype=ITYPE)
-            floating[:, ::1] argkmin_distances = np.full((X.shape[0], k),
+            DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k),
                                                       FLOAT_INF,
-                                                      dtype=float_dtype)
-            floating[::1] Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
+                                                      dtype=DTYPE)
             ITYPE_t effective_n_threads = _openmp_effective_n_threads()
 
         if strategy == 'auto':
             # This is a simple heuristic whose constant for the
             # comparison has been chosen based on experiments.
-            if 4 * chunk_size * effective_n_threads < X.shape[0]:
-                strategy = 'chunk_on_X'
+            if 4 * self.chunk_size * effective_n_threads < n_X:
+                strategy = 'parallel_on_X'
             else:
-                strategy = 'chunk_on_Y'
+                strategy = 'parallel_on_Y'
 
-        if strategy == 'chunk_on_Y':
-            self._argkmin_on_Y(
-                X, Y, Y_sq_norms,
-                chunk_size, effective_n_threads,
+        if strategy == 'parallel_on_Y':
+            self._parallel_on_Y(
+                self.chunk_size, effective_n_threads,
                 argkmin_indices, argkmin_distances
             )
-        elif strategy == 'chunk_on_X':
-            self._argkmin_on_X(
-                X, Y, Y_sq_norms,
-                chunk_size, effective_n_threads,
+        elif strategy == 'parallel_on_X':
+            self._parallel_on_X(
+                self.chunk_size, effective_n_threads,
                 argkmin_indices, argkmin_distances
             )
         else:
@@ -456,9 +447,9 @@ cdef class ArgKmin:
         if return_distance:
             # We need to recompute distances because we relied on
             # reduced distances using _gemm, which are missing a
-            # term for squarred norms and which are not the most
+            # term for squared norms and which are not the most
             # precise (catastrophic cancellation might have happened).
-            _exact_euclidean_dist(X, Y, argkmin_indices,
+            _exact_euclidean_dist(self.X, self.Y, argkmin_indices,
                                   effective_n_threads,
                                   argkmin_distances)
             return (np.asarray(argkmin_distances),
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index ba04685c07ab9..c78ed7bc23e17 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -648,8 +648,8 @@ def pairwise_distances_argmin_min(
 
     if metric == "fast_sqeuclidean":
         # TODO: generalise this simple plug here
-        values, indices = ArgKmin()._argkmin(
-            X, Y, k=1, strategy="auto", return_distance=True
+        values, indices = ArgKmin(X, Y, k=1).compute(
+            k=1, strategy="auto", return_distance=True
         )
         values = np.ndarray.flatten(values)
         indices = np.ndarray.flatten(indices)
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 9f79be031b0a9..53774a3b78510 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -740,10 +740,7 @@ class from an array representing our data set and ask who's
             self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean"
         ):
             # TODO: generalise this simple plug here
-            results = ArgKmin()._argkmin(
-                X,
-                Y=self._fit_X,
-                k=n_neighbors,
+            results = ArgKmin(X=X, Y=self._fit_X, k=n_neighbors).compute(
                 strategy="auto",
                 return_distance=return_distance,
             )

From 2bf34aae7b2cdf4296fdcaccc8de7fcbe72e5dc7 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 10:22:35 +0200
Subject: [PATCH 04/26] [WIP] Adapting to use class hierarchy

---
 sklearn/metrics/_argkmin_fast.pyx | 184 ++++++++++++++----------------
 1 file changed, 85 insertions(+), 99 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index b85aa1b94c388..192b8782d2f6a 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -91,24 +91,63 @@ cdef int _exact_euclidean_dist(
 cdef class ArgKmin:
 
     cdef:
-        ITYPE_t k
-        ITYPE_t chunk_size
+        ITYPE_t k, d, sf, si
+        ITYPE_t n_samples_chunk, chunk_size
+
+        ITYPE_t n_Y, Y_n_samples_chunk, Y_n_samples_rem
+        ITYPE_t n_X, X_n_samples_chunk, X_n_samples_rem
+
+        # Counting remainder chunk in total number of chunks
+        ITYPE_t Y_n_chunks, X_n_chunks, num_threads
 
         DTYPE_t[:, ::1] X
         DTYPE_t[:, ::1] Y
 
         DTYPE_t[::1] Y_sq_norms
 
+    def __cinit__(self):
+        # Initializing memory view to prevent memory errors and seg-faults
+        # in rare cases where __init__ is not called
+        self.X = np.empty((1, 1), dtype=DTYPE, order='c')
+        self.Y = np.empty((1, 1), dtype=DTYPE, order='c')
+
     def __init__(self,
                   DTYPE_t[:, ::1] X,
                   DTYPE_t[:, ::1] Y,
                   ITYPE_t k,
                   ITYPE_t chunk_size = CHUNK_SIZE,
     ):
+        cdef:
+            ITYPE_t X_n_full_chunks, Y_n_full_chunks
         self.X = X
         self.Y = Y
+
         self.k = k
+        self.d = X.shape[1]
+        self.sf = sizeof(DTYPE_t)
+        self.si = sizeof(ITYPE_t)
         self.chunk_size = chunk_size
+        self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
+
+        self.n_Y = Y.shape[0]
+        self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk)
+        Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk
+        self.Y_n_samples_rem = self.n_Y % self.Y_n_samples_chunk
+
+        self.n_X = X.shape[0]
+        self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk)
+        X_n_full_chunks = self.n_X // self.X_n_samples_chunk
+        self.X_n_samples_rem = self.n_X % self.X_n_samples_chunk
+
+        # Counting remainder chunk in total number of chunks
+        self.Y_n_chunks = Y_n_full_chunks + (
+            self.n_Y != (Y_n_full_chunks * self.Y_n_samples_chunk)
+        )
+
+        self.X_n_chunks = X_n_full_chunks + (
+            self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
+        )
+
         self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
 
     cdef void _reduce_on_chunks(self,
@@ -170,35 +209,9 @@ cdef class ArgKmin:
         by parallelising computation on chunks of X.
         """
         cdef:
-            ITYPE_t k = argkmin_indices.shape[1]
-            ITYPE_t d = self.X.shape[1]
-            ITYPE_t sf = sizeof(DTYPE_t)
-            ITYPE_t si = sizeof(ITYPE_t)
-            ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
-
-            ITYPE_t n_train = self.Y.shape[0]
-            ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
-            ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
-            ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
-
-            ITYPE_t n_test = self.X.shape[0]
-            ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
-            ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
-            ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
-
-            # Counting remainder chunk in total number of chunks
-            ITYPE_t Y_n_chunks = Y_n_full_chunks + (
-                n_train != (Y_n_full_chunks * Y_n_samples_chunk)
-            )
-
-            ITYPE_t X_n_chunks = X_n_full_chunks + (
-                n_test != (X_n_full_chunks * X_n_samples_chunk)
-            )
-
-            ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
+            ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads)
 
-            ITYPE_t Y_start, Y_end, X_start, X_end
-            ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
+            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
 
             DTYPE_t *dist_middle_terms_chunks
             DTYPE_t *heaps_red_distances_chunks
@@ -208,26 +221,26 @@ cdef class ArgKmin:
             # Thread local buffers
 
             # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            dist_middle_terms_chunks = <DTYPE_t*> malloc(Y_n_samples_chunk * X_n_samples_chunk * sf)
-            heaps_red_distances_chunks = <DTYPE_t*> malloc(X_n_samples_chunk * k * sf)
+            dist_middle_terms_chunks = <DTYPE_t*> malloc(self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
+            heaps_red_distances_chunks = <DTYPE_t*> malloc(self.X_n_samples_chunk * self.k * self.sf)
 
-            for X_chunk_idx in prange(X_n_chunks, schedule='static'):
+            for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                 # We reset the heap between X chunks (memset isn't suitable here)
-                for idx in range(X_n_samples_chunk * k):
+                for idx in range(self.X_n_samples_chunk * self.k):
                     heaps_red_distances_chunks[idx] = FLOAT_INF
 
-                X_start = X_chunk_idx * X_n_samples_chunk
-                if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
-                    X_end = X_start + X_n_samples_rem
+                X_start = X_chunk_idx * self.X_n_samples_chunk
+                if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
+                    X_end = X_start + self.X_n_samples_rem
                 else:
-                    X_end = X_start + X_n_samples_chunk
+                    X_end = X_start + self.X_n_samples_chunk
 
-                for Y_chunk_idx in range(Y_n_chunks):
-                    Y_start = Y_chunk_idx * Y_n_samples_chunk
-                    if Y_chunk_idx == Y_n_chunks - 1 and Y_n_samples_rem > 0:
-                        Y_end = Y_start + Y_n_samples_rem
+                for Y_chunk_idx in range(self.Y_n_chunks):
+                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
+                    if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0:
+                        Y_end = Y_start + self.Y_n_samples_rem
                     else:
-                        Y_end = Y_start + Y_n_samples_chunk
+                        Y_end = Y_start + self.Y_n_samples_chunk
 
                     self._reduce_on_chunks(
                         self.X[X_start:X_end, :],
@@ -236,16 +249,16 @@ cdef class ArgKmin:
                         dist_middle_terms_chunks,
                         heaps_red_distances_chunks,
                         &argkmin_indices[X_start, 0],
-                        k,
+                        self.k,
                         Y_start
                     )
 
                 # Sorting indices so that the closests' come first.
                 for idx in range(X_end - X_start):
                     _simultaneous_sort(
-                        heaps_red_distances_chunks + idx * k,
+                        heaps_red_distances_chunks + idx * self.k,
                         &argkmin_indices[X_start + idx, 0],
-                        k
+                        self.k
                     )
 
             # end: for X_chunk_idx
@@ -253,8 +266,7 @@ cdef class ArgKmin:
             free(heaps_red_distances_chunks)
 
         # end: with nogil, parallel
-        return X_n_chunks
-
+        return self.X_n_chunks
 
     cdef int _parallel_on_Y(self,
         ITYPE_t chunk_size,                       # IN
@@ -270,35 +282,9 @@ cdef class ArgKmin:
         most contexts.
         """
         cdef:
-            ITYPE_t k = argkmin_indices.shape[1]
-            ITYPE_t d = self.X.shape[1]
-            ITYPE_t sf = sizeof(DTYPE_t)
-            ITYPE_t si = sizeof(ITYPE_t)
-            ITYPE_t n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
-
-            ITYPE_t n_train = self.Y.shape[0]
-            ITYPE_t Y_n_samples_chunk = min(n_train, n_samples_chunk)
-            ITYPE_t Y_n_full_chunks = n_train / Y_n_samples_chunk
-            ITYPE_t Y_n_samples_rem = n_train % Y_n_samples_chunk
-
-            ITYPE_t n_test = self.X.shape[0]
-            ITYPE_t X_n_samples_chunk = min(n_test, n_samples_chunk)
-            ITYPE_t X_n_full_chunks = n_test // X_n_samples_chunk
-            ITYPE_t X_n_samples_rem = n_test % X_n_samples_chunk
-
-            # Counting remainder chunk in total number of chunks
-            ITYPE_t Y_n_chunks = Y_n_full_chunks + (
-                n_train != (Y_n_full_chunks * Y_n_samples_chunk)
-            )
-
-            ITYPE_t X_n_chunks = X_n_full_chunks + (
-                n_test != (X_n_full_chunks * X_n_samples_chunk)
-            )
-
-            ITYPE_t num_threads = min(Y_n_chunks, effective_n_threads)
+            ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads)
 
-            ITYPE_t Y_start, Y_end, X_start, X_end
-            ITYPE_t X_chunk_idx, Y_chunk_idx, idx, jdx
+            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
 
             DTYPE_t *dist_middle_terms_chunks
             DTYPE_t *heaps_red_distances_chunks
@@ -308,36 +294,36 @@ cdef class ArgKmin:
             # heaps which are then synchronised back in the main ones.
             ITYPE_t *heaps_indices_chunks
 
-        for X_chunk_idx in range(X_n_chunks):
-            X_start = X_chunk_idx * X_n_samples_chunk
-            if X_chunk_idx == X_n_chunks - 1 and X_n_samples_rem > 0:
-                X_end = X_start + X_n_samples_rem
+        for X_chunk_idx in range(self.X_n_chunks):
+            X_start = X_chunk_idx * self.X_n_samples_chunk
+            if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
+                X_end = X_start + self.X_n_samples_rem
             else:
-                X_end = X_start + X_n_samples_chunk
+                X_end = X_start + self.X_n_samples_chunk
 
             with nogil, parallel(num_threads=num_threads):
                 # Thread local buffers
 
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
                 dist_middle_terms_chunks = <DTYPE_t*> malloc(
-                    Y_n_samples_chunk * X_n_samples_chunk * sf)
+                    self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
                 heaps_red_distances_chunks = <DTYPE_t*> malloc(
-                    X_n_samples_chunk * k * sf)
+                    self.X_n_samples_chunk * self.k * self.sf)
                 heaps_indices_chunks = <ITYPE_t*> malloc(
-                    X_n_samples_chunk * k * sf)
+                    self.X_n_samples_chunk * self.k * self.sf)
 
                 # Initialising heaps (memset can't be used here)
-                for idx in range(X_n_samples_chunk * k):
+                for idx in range(self.X_n_samples_chunk * self.k):
                     heaps_red_distances_chunks[idx] = FLOAT_INF
                     heaps_indices_chunks[idx] = -1
 
-                for Y_chunk_idx in prange(Y_n_chunks, schedule='static'):
-                    Y_start = Y_chunk_idx * Y_n_samples_chunk
-                    if Y_chunk_idx == Y_n_chunks - 1 \
-                        and Y_n_samples_rem > 0:
-                        Y_end = Y_start + Y_n_samples_rem
+                for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
+                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
+                    if Y_chunk_idx == self.Y_n_chunks - 1 \
+                        and self.Y_n_samples_rem > 0:
+                        Y_end = Y_start + self.Y_n_samples_rem
                     else:
-                        Y_end = Y_start + Y_n_samples_chunk
+                        Y_end = Y_start + self.Y_n_samples_chunk
 
                     self._reduce_on_chunks(
                         self.X[X_start:X_end, :],
@@ -346,7 +332,7 @@ cdef class ArgKmin:
                         dist_middle_terms_chunks,
                         heaps_red_distances_chunks,
                         heaps_indices_chunks,
-                        k,
+                        self.k,
                         Y_start,
                     )
 
@@ -355,13 +341,13 @@ cdef class ArgKmin:
                     # Synchronising the thread local heaps
                     # with the main heaps
                     for idx in range(X_end - X_start):
-                        for jdx in range(k):
+                        for jdx in range(self.k):
                             _push(
                                 &argkmin_red_distances[X_start + idx, 0],
                                 &argkmin_indices[X_start + idx, 0],
-                                k,
-                                heaps_red_distances_chunks[idx * k + jdx],
-                                heaps_indices_chunks[idx * k + jdx],
+                                self.k,
+                                heaps_red_distances_chunks[idx * self.k + jdx],
+                                heaps_indices_chunks[idx * self.k + jdx],
                             )
 
                 free(dist_middle_terms_chunks)
@@ -370,17 +356,17 @@ cdef class ArgKmin:
 
             # end: with nogil, parallel
             # Sorting indices of the argkmin for each query vector of X
-            for idx in prange(n_test,schedule='static',
+            for idx in prange(self.n_X, schedule='static',
                               nogil=True, num_threads=num_threads):
                 _simultaneous_sort(
                     &argkmin_red_distances[idx, 0],
                     &argkmin_indices[idx, 0],
-                    k,
+                    self.k,
                 )
             # end: prange
 
         # end: for X_chunk_idx
-        return Y_n_chunks
+        return self.Y_n_chunks
 
     # Python interface
     def compute(self,

From c1415d693beaa67f9657d0f27706070920c3886e Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 11:05:43 +0200
Subject: [PATCH 05/26] [WIP] Adapting to use class hierarchy

This test segfaults:

test_neighbors.py::test_fast_sqeuclidean_correctness[1-10-5-1000]
---
 sklearn/metrics/_argkmin_fast.pyx | 74 ++++++++++++++++---------------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index 192b8782d2f6a..dcdfca80c9c22 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -105,6 +105,17 @@ cdef class ArgKmin:
 
         DTYPE_t[::1] Y_sq_norms
 
+        # ArgKmin
+        DTYPE_t * dist_middle_terms_chunks
+        DTYPE_t * heaps_red_distances_chunks
+
+        # Used for parallel_on_Y:
+
+        # As chunks of X are shared across threads, so must their
+        # heaps. To solve this, each thread has its own locals
+        # heaps which are then synchronised back in the main ones.
+        ITYPE_t * heaps_indices_chunks
+
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
         # in rare cases where __init__ is not called
@@ -210,24 +221,23 @@ cdef class ArgKmin:
         """
         cdef:
             ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads)
-
             ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
 
-            DTYPE_t *dist_middle_terms_chunks
-            DTYPE_t *heaps_red_distances_chunks
-
+            # in bytes
+            ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
+            ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf
 
         with nogil, parallel(num_threads=num_threads):
             # Thread local buffers
 
             # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            dist_middle_terms_chunks = <DTYPE_t*> malloc(self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
-            heaps_red_distances_chunks = <DTYPE_t*> malloc(self.X_n_samples_chunk * self.k * self.sf)
+            self.dist_middle_terms_chunks = <DTYPE_t*> malloc(size_dist_middle_terms)
+            self.heaps_red_distances_chunks = <DTYPE_t*> malloc(heap_size)
 
             for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                 # We reset the heap between X chunks (memset isn't suitable here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    heaps_red_distances_chunks[idx] = FLOAT_INF
+                    self.heaps_red_distances_chunks[idx] = FLOAT_INF
 
                 X_start = X_chunk_idx * self.X_n_samples_chunk
                 if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
@@ -246,8 +256,8 @@ cdef class ArgKmin:
                         self.X[X_start:X_end, :],
                         self.Y[Y_start:Y_end, :],
                         self.Y_sq_norms[Y_start:Y_end],
-                        dist_middle_terms_chunks,
-                        heaps_red_distances_chunks,
+                        self.dist_middle_terms_chunks,
+                        self.heaps_red_distances_chunks,
                         &argkmin_indices[X_start, 0],
                         self.k,
                         Y_start
@@ -256,14 +266,14 @@ cdef class ArgKmin:
                 # Sorting indices so that the closests' come first.
                 for idx in range(X_end - X_start):
                     _simultaneous_sort(
-                        heaps_red_distances_chunks + idx * self.k,
+                        self.heaps_red_distances_chunks + idx * self.k,
                         &argkmin_indices[X_start + idx, 0],
                         self.k
                     )
 
             # end: for X_chunk_idx
-            free(dist_middle_terms_chunks)
-            free(heaps_red_distances_chunks)
+            free(self.dist_middle_terms_chunks)
+            free(self.heaps_red_distances_chunks)
 
         # end: with nogil, parallel
         return self.X_n_chunks
@@ -286,13 +296,10 @@ cdef class ArgKmin:
 
             ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
 
-            DTYPE_t *dist_middle_terms_chunks
-            DTYPE_t *heaps_red_distances_chunks
-
-            # As chunks of X are shared across threads, so must their
-            # heaps. To solve this, each thread has its own locals
-            # heaps which are then synchronised back in the main ones.
-            ITYPE_t *heaps_indices_chunks
+            # in bytes
+            ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
+            ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si
+            ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf
 
         for X_chunk_idx in range(self.X_n_chunks):
             X_start = X_chunk_idx * self.X_n_samples_chunk
@@ -305,17 +312,14 @@ cdef class ArgKmin:
                 # Thread local buffers
 
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-                dist_middle_terms_chunks = <DTYPE_t*> malloc(
-                    self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
-                heaps_red_distances_chunks = <DTYPE_t*> malloc(
-                    self.X_n_samples_chunk * self.k * self.sf)
-                heaps_indices_chunks = <ITYPE_t*> malloc(
-                    self.X_n_samples_chunk * self.k * self.sf)
+                self.dist_middle_terms_chunks = <DTYPE_t*> malloc(size_dist_middle_terms)
+                self.heaps_red_distances_chunks = <DTYPE_t*> malloc(float_heap_size)
+                self.heaps_indices_chunks = <ITYPE_t*> malloc(int_heap_size)
 
                 # Initialising heaps (memset can't be used here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    heaps_red_distances_chunks[idx] = FLOAT_INF
-                    heaps_indices_chunks[idx] = -1
+                    self.heaps_red_distances_chunks[idx] = FLOAT_INF
+                    self.heaps_indices_chunks[idx] = -1
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -329,9 +333,9 @@ cdef class ArgKmin:
                         self.X[X_start:X_end, :],
                         self.Y[Y_start:Y_end, :],
                         self.Y_sq_norms[Y_start:Y_end],
-                        dist_middle_terms_chunks,
-                        heaps_red_distances_chunks,
-                        heaps_indices_chunks,
+                        self.dist_middle_terms_chunks,
+                        self.heaps_red_distances_chunks,
+                        self.heaps_indices_chunks,
                         self.k,
                         Y_start,
                     )
@@ -346,13 +350,13 @@ cdef class ArgKmin:
                                 &argkmin_red_distances[X_start + idx, 0],
                                 &argkmin_indices[X_start + idx, 0],
                                 self.k,
-                                heaps_red_distances_chunks[idx * self.k + jdx],
-                                heaps_indices_chunks[idx * self.k + jdx],
+                                self.heaps_red_distances_chunks[idx * self.k + jdx],
+                                self.heaps_indices_chunks[idx * self.k + jdx],
                             )
 
-                free(dist_middle_terms_chunks)
-                free(heaps_red_distances_chunks)
-                free(heaps_indices_chunks)
+                free(self.dist_middle_terms_chunks)
+                free(self.heaps_red_distances_chunks)
+                free(self.heaps_indices_chunks)
 
             # end: with nogil, parallel
             # Sorting indices of the argkmin for each query vector of X

From 80aaf0bcb13cf31f5b2a251d615410c6b680db8f Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 12:05:57 +0200
Subject: [PATCH 06/26] [WIP] Adapting to use class hierarchy

The segfaults was due to reallocation
of on the same pointers, causing multiple
freeing on the same reference and memory leaks.

To resolve this, arrays of pointers for local
datastructures are allocated at the initialisation
of the interface so that they can be handled
separately in threads with proper allocation
and deallocation.

The memory management will be wrapped in
subsequent private template method for
each types of reduction and parallelisation
strategy. This is one of the next iteration.
---
 sklearn/metrics/_argkmin_fast.pyx | 97 +++++++++++++++++--------------
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index dcdfca80c9c22..d1b374dcae299 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -10,6 +10,7 @@
 
 import numpy as np
 cimport numpy as np
+cimport openmp
 
 from libc.math cimport sqrt
 from libc.stdlib cimport free, malloc
@@ -64,8 +65,8 @@ cdef inline DTYPE_t _euclidean_dist(
 cdef int _exact_euclidean_dist(
     DTYPE_t[:, ::1] X,                  # IN
     DTYPE_t[:, ::1] Y,                  # IN
-    ITYPE_t[:, ::1] Y_indices,           # IN
-    ITYPE_t effective_n_threads,         # IN
+    ITYPE_t[:, ::1] Y_indices,          # IN
+    ITYPE_t n_threads,                  # IN
     DTYPE_t[:, ::1] distances,          # OUT
 ) nogil:
     """
@@ -82,7 +83,7 @@ cdef int _exact_euclidean_dist(
         ITYPE_t i, k
 
     for i in prange(X.shape[0], schedule='static',
-                    nogil=True, num_threads=effective_n_threads):
+                    nogil=True, num_threads=n_threads):
         for k in range(Y_indices.shape[1]):
             distances[i, k] = _euclidean_dist(X, Y, i,
                                               Y_indices[i, k])
@@ -91,6 +92,8 @@ cdef int _exact_euclidean_dist(
 cdef class ArgKmin:
 
     cdef:
+        ITYPE_t effective_omp_n_thread
+
         ITYPE_t k, d, sf, si
         ITYPE_t n_samples_chunk, chunk_size
 
@@ -103,18 +106,16 @@ cdef class ArgKmin:
         DTYPE_t[:, ::1] X
         DTYPE_t[:, ::1] Y
 
-        DTYPE_t[::1] Y_sq_norms
-
         # ArgKmin
-        DTYPE_t * dist_middle_terms_chunks
-        DTYPE_t * heaps_red_distances_chunks
+        DTYPE_t[::1] Y_sq_norms
+        DTYPE_t ** dist_middle_terms_chunks
+        DTYPE_t ** heaps_red_distances_chunks
 
         # Used for parallel_on_Y:
-
         # As chunks of X are shared across threads, so must their
         # heaps. To solve this, each thread has its own locals
         # heaps which are then synchronised back in the main ones.
-        ITYPE_t * heaps_indices_chunks
+        ITYPE_t ** heaps_indices_chunks
 
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
@@ -159,8 +160,23 @@ cdef class ArgKmin:
             self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
         )
 
+        self.effective_omp_n_thread = _openmp_effective_n_threads()
+
+        # ArgKmin
         self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
 
+        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
+        self.heaps_red_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
+        self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
+
+    def __dealloc__(self):
+        if self.dist_middle_terms_chunks is not NULL:
+            free(self.dist_middle_terms_chunks)
+        if self.heaps_red_distances_chunks is not NULL:
+            free(self.heaps_red_distances_chunks)
+        if self.heaps_indices_chunks is not NULL:
+            free(self.heaps_indices_chunks)
+
     cdef void _reduce_on_chunks(self,
         DTYPE_t[:, ::1] X_c,                  # IN
         DTYPE_t[:, ::1] Y_c,                  # IN
@@ -211,8 +227,6 @@ cdef class ArgKmin:
 
 
     cdef int _parallel_on_X(self,
-        ITYPE_t chunk_size,                       # IN
-        ITYPE_t effective_n_threads,              # IN
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
         DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
     ) nogil:
@@ -220,8 +234,9 @@ cdef class ArgKmin:
         by parallelising computation on chunks of X.
         """
         cdef:
-            ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads)
             ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
+            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t thread_num
 
             # in bytes
             ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
@@ -229,15 +244,15 @@ cdef class ArgKmin:
 
         with nogil, parallel(num_threads=num_threads):
             # Thread local buffers
-
+            thread_num = openmp.omp_get_thread_num()
             # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            self.dist_middle_terms_chunks = <DTYPE_t*> malloc(size_dist_middle_terms)
-            self.heaps_red_distances_chunks = <DTYPE_t*> malloc(heap_size)
+            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
+            self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(heap_size)
 
             for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                 # We reset the heap between X chunks (memset isn't suitable here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_red_distances_chunks[idx] = FLOAT_INF
+                    self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF
 
                 X_start = X_chunk_idx * self.X_n_samples_chunk
                 if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
@@ -256,8 +271,8 @@ cdef class ArgKmin:
                         self.X[X_start:X_end, :],
                         self.Y[Y_start:Y_end, :],
                         self.Y_sq_norms[Y_start:Y_end],
-                        self.dist_middle_terms_chunks,
-                        self.heaps_red_distances_chunks,
+                        self.dist_middle_terms_chunks[thread_num],
+                        self.heaps_red_distances_chunks[thread_num],
                         &argkmin_indices[X_start, 0],
                         self.k,
                         Y_start
@@ -266,21 +281,19 @@ cdef class ArgKmin:
                 # Sorting indices so that the closests' come first.
                 for idx in range(X_end - X_start):
                     _simultaneous_sort(
-                        self.heaps_red_distances_chunks + idx * self.k,
+                        self.heaps_red_distances_chunks[thread_num] + idx * self.k,
                         &argkmin_indices[X_start + idx, 0],
                         self.k
                     )
 
             # end: for X_chunk_idx
-            free(self.dist_middle_terms_chunks)
-            free(self.heaps_red_distances_chunks)
+            free(self.dist_middle_terms_chunks[thread_num])
+            free(self.heaps_red_distances_chunks[thread_num])
 
         # end: with nogil, parallel
         return self.X_n_chunks
 
     cdef int _parallel_on_Y(self,
-        ITYPE_t chunk_size,                       # IN
-        ITYPE_t effective_n_threads,              # IN
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
         DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
     ) nogil:
@@ -292,9 +305,9 @@ cdef class ArgKmin:
         most contexts.
         """
         cdef:
-            ITYPE_t num_threads = min(self.Y_n_chunks, effective_n_threads)
-
             ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
+            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t thread_num
 
             # in bytes
             ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
@@ -310,16 +323,17 @@ cdef class ArgKmin:
 
             with nogil, parallel(num_threads=num_threads):
                 # Thread local buffers
+                thread_num = openmp.omp_get_thread_num()
 
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-                self.dist_middle_terms_chunks = <DTYPE_t*> malloc(size_dist_middle_terms)
-                self.heaps_red_distances_chunks = <DTYPE_t*> malloc(float_heap_size)
-                self.heaps_indices_chunks = <ITYPE_t*> malloc(int_heap_size)
+                self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
+                self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(float_heap_size)
+                self.heaps_indices_chunks[thread_num] = <ITYPE_t*> malloc(int_heap_size)
 
                 # Initialising heaps (memset can't be used here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_red_distances_chunks[idx] = FLOAT_INF
-                    self.heaps_indices_chunks[idx] = -1
+                    self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF
+                    self.heaps_indices_chunks[thread_num][idx] = -1
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
@@ -329,13 +343,14 @@ cdef class ArgKmin:
                     else:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
+
                     self._reduce_on_chunks(
                         self.X[X_start:X_end, :],
                         self.Y[Y_start:Y_end, :],
                         self.Y_sq_norms[Y_start:Y_end],
-                        self.dist_middle_terms_chunks,
-                        self.heaps_red_distances_chunks,
-                        self.heaps_indices_chunks,
+                        self.dist_middle_terms_chunks[thread_num],
+                        self.heaps_red_distances_chunks[thread_num],
+                        self.heaps_indices_chunks[thread_num],
                         self.k,
                         Y_start,
                     )
@@ -350,15 +365,12 @@ cdef class ArgKmin:
                                 &argkmin_red_distances[X_start + idx, 0],
                                 &argkmin_indices[X_start + idx, 0],
                                 self.k,
-                                self.heaps_red_distances_chunks[idx * self.k + jdx],
-                                self.heaps_indices_chunks[idx * self.k + jdx],
+                                self.heaps_red_distances_chunks[thread_num][idx * self.k + jdx],
+                                self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                             )
 
-                free(self.dist_middle_terms_chunks)
-                free(self.heaps_red_distances_chunks)
-                free(self.heaps_indices_chunks)
-
             # end: with nogil, parallel
+
             # Sorting indices of the argkmin for each query vector of X
             for idx in prange(self.n_X, schedule='static',
                               nogil=True, num_threads=num_threads):
@@ -411,24 +423,21 @@ cdef class ArgKmin:
             DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k),
                                                       FLOAT_INF,
                                                       dtype=DTYPE)
-            ITYPE_t effective_n_threads = _openmp_effective_n_threads()
 
         if strategy == 'auto':
             # This is a simple heuristic whose constant for the
             # comparison has been chosen based on experiments.
-            if 4 * self.chunk_size * effective_n_threads < n_X:
+            if 4 * self.chunk_size * self.effective_omp_n_thread < n_X:
                 strategy = 'parallel_on_X'
             else:
                 strategy = 'parallel_on_Y'
 
         if strategy == 'parallel_on_Y':
             self._parallel_on_Y(
-                self.chunk_size, effective_n_threads,
                 argkmin_indices, argkmin_distances
             )
         elif strategy == 'parallel_on_X':
             self._parallel_on_X(
-                self.chunk_size, effective_n_threads,
                 argkmin_indices, argkmin_distances
             )
         else:
@@ -440,7 +449,7 @@ cdef class ArgKmin:
             # term for squared norms and which are not the most
             # precise (catastrophic cancellation might have happened).
             _exact_euclidean_dist(self.X, self.Y, argkmin_indices,
-                                  effective_n_threads,
+                                  self.effective_omp_n_thread,
                                   argkmin_distances)
             return (np.asarray(argkmin_distances),
                     np.asarray(argkmin_indices))

From 49e247d80b1b1b23f4a619983afd478123921a5e Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 12:05:57 +0200
Subject: [PATCH 07/26] [WIP] Adapting to use class hierarchy

Refactor ArgKmin._reduce_on_chunks to pave the
way to general interface for reductions.

Private datastructures will have to be accessed via
the implementation of this private method.
---
 sklearn/metrics/_argkmin_fast.pyx | 64 +++++++++++++++----------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index d1b374dcae299..96042432a80a5 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -110,11 +110,6 @@ cdef class ArgKmin:
         DTYPE_t[::1] Y_sq_norms
         DTYPE_t ** dist_middle_terms_chunks
         DTYPE_t ** heaps_red_distances_chunks
-
-        # Used for parallel_on_Y:
-        # As chunks of X are shared across threads, so must their
-        # heaps. To solve this, each thread has its own locals
-        # heaps which are then synchronised back in the main ones.
         ITYPE_t ** heaps_indices_chunks
 
     def __cinit__(self):
@@ -178,15 +173,13 @@ cdef class ArgKmin:
             free(self.heaps_indices_chunks)
 
     cdef void _reduce_on_chunks(self,
-        DTYPE_t[:, ::1] X_c,                  # IN
-        DTYPE_t[:, ::1] Y_c,                  # IN
-        DTYPE_t[::1] Y_sq_norms,              # IN
-        DTYPE_t *dist_middle_terms,           # IN
-        DTYPE_t *heaps_red_distances,         # IN/OUT
-        ITYPE_t *heaps_indices,                # IN/OUT
-        ITYPE_t k,                             # IN
-        # ID of the first element of Y_c
-        ITYPE_t Y_idx_offset,
+        DTYPE_t[:, ::1] X,                  # IN
+        DTYPE_t[:, ::1] Y,                  # IN
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
     ) nogil:
         """
         Critical part of the computation of pairwise distances.
@@ -196,6 +189,13 @@ cdef class ArgKmin:
         """
         cdef:
             ITYPE_t i, j
+            DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
+            DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            ITYPE_t k = self.k
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
+            DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num]
+            ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
+
         # Instead of computing the full pairwise squared distances matrix,
         # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
         # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
@@ -222,8 +222,8 @@ cdef class ArgKmin:
                       heaps_indices + i * k,
                       k,
                       # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                      dist_middle_terms[i * Y_c.shape[0] + j] + Y_sq_norms[j],
-                      j + Y_idx_offset)
+                      dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start],
+                      j + Y_start)
 
 
     cdef int _parallel_on_X(self,
@@ -260,6 +260,8 @@ cdef class ArgKmin:
                 else:
                     X_end = X_start + self.X_n_samples_chunk
 
+                self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0]
+
                 for Y_chunk_idx in range(self.Y_n_chunks):
                     Y_start = Y_chunk_idx * self.Y_n_samples_chunk
                     if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0:
@@ -268,14 +270,11 @@ cdef class ArgKmin:
                         Y_end = Y_start + self.Y_n_samples_chunk
 
                     self._reduce_on_chunks(
-                        self.X[X_start:X_end, :],
-                        self.Y[Y_start:Y_end, :],
-                        self.Y_sq_norms[Y_start:Y_end],
-                        self.dist_middle_terms_chunks[thread_num],
-                        self.heaps_red_distances_chunks[thread_num],
-                        &argkmin_indices[X_start, 0],
-                        self.k,
-                        Y_start
+                        self.X,
+                        self.Y,
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
                     )
 
                 # Sorting indices so that the closests' come first.
@@ -328,6 +327,10 @@ cdef class ArgKmin:
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
                 self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
                 self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(float_heap_size)
+
+                # As chunks of X are shared across threads, so must their
+                # heaps. To solve this, each thread has its own locals
+                # heaps which are then synchronised back in the main ones.
                 self.heaps_indices_chunks[thread_num] = <ITYPE_t*> malloc(int_heap_size)
 
                 # Initialising heaps (memset can't be used here)
@@ -345,14 +348,11 @@ cdef class ArgKmin:
 
 
                     self._reduce_on_chunks(
-                        self.X[X_start:X_end, :],
-                        self.Y[Y_start:Y_end, :],
-                        self.Y_sq_norms[Y_start:Y_end],
-                        self.dist_middle_terms_chunks[thread_num],
-                        self.heaps_red_distances_chunks[thread_num],
-                        self.heaps_indices_chunks[thread_num],
-                        self.k,
-                        Y_start,
+                        self.X,
+                        self.Y,
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
                     )
 
                 # end: for Y_chunk_idx

From 25c9a2c90b280e15c7c7d7943bb93345ff7ad8af Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 18:25:29 +0200
Subject: [PATCH 08/26] fixup! [WIP] Adapting to use class hierarchy

---
 sklearn/metrics/pairwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c78ed7bc23e17..f657ad5ccb49b 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -649,7 +649,7 @@ def pairwise_distances_argmin_min(
     if metric == "fast_sqeuclidean":
         # TODO: generalise this simple plug here
         values, indices = ArgKmin(X, Y, k=1).compute(
-            k=1, strategy="auto", return_distance=True
+            strategy="auto", return_distance=True
         )
         values = np.ndarray.flatten(values)
         indices = np.ndarray.flatten(indices)

From eb8b9313ac6ace0db051f727931aa8cd54d2c64b Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 12:05:57 +0200
Subject: [PATCH 09/26] [WIP] Adapting to use class hierarchy

Introduce ParallelReduction as a abstract class,
and extend it using ArgKmin.

FastSquaredEuclideanArgKmin extends ArgKmin
for the "fast_sqeuclidean" strategy.
---
 sklearn/metrics/_argkmin_fast.pyx | 217 ++++++++++++++++++++++--------
 sklearn/metrics/pairwise.py       |   4 +-
 sklearn/neighbors/_base.py        |   6 +-
 3 files changed, 170 insertions(+), 57 deletions(-)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_argkmin_fast.pyx
index 96042432a80a5..62dce34bceb1c 100644
--- a/sklearn/metrics/_argkmin_fast.pyx
+++ b/sklearn/metrics/_argkmin_fast.pyx
@@ -17,6 +17,8 @@ from libc.stdlib cimport free, malloc
 
 from cython.parallel cimport parallel, prange
 
+# from ..neighbors._dist_metrics cimport DistanceMetric
+
 DEF CHUNK_SIZE = 256  # number of vectors
 
 DEF MIN_CHUNK_SAMPLES = 20
@@ -89,7 +91,21 @@ cdef int _exact_euclidean_dist(
                                               Y_indices[i, k])
 
 
-cdef class ArgKmin:
+cdef class ParallelReduction:
+    """Abstract class to computes a reduction of a set of
+    vectors (rows) of X on another set of vectors (rows) of Y
+
+    The implementation of the reduction is done parallelised
+    on chunks whose size can be set using ``chunk_size``.
+    Parameters
+    ----------
+    X: ndarray of shape (n, d)
+        Rows represent vectors
+    Y: ndarray of shape (m, d)
+        Rows represent vectors
+    chunk_size: int
+        The number of vectors per chunk.
+    """
 
     cdef:
         ITYPE_t effective_omp_n_thread
@@ -106,11 +122,10 @@ cdef class ArgKmin:
         DTYPE_t[:, ::1] X
         DTYPE_t[:, ::1] Y
 
-        # ArgKmin
-        DTYPE_t[::1] Y_sq_norms
-        DTYPE_t ** dist_middle_terms_chunks
-        DTYPE_t ** heaps_red_distances_chunks
-        ITYPE_t ** heaps_indices_chunks
+        # TODO: needs to move DistanceMetric
+        # from neighbors to be able to use them
+        # some adaptation
+        # DistanceMetric distance_metric
 
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
@@ -129,6 +144,13 @@ cdef class ArgKmin:
         self.X = X
         self.Y = Y
 
+        # TODO: use proper internals checks of scikit-learn
+        assert X.shape[1] == Y.shape[1], (
+            f"Vectors of X and Y must have the same "
+            f"number of dimensions but are respectively "
+            f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional."
+        )
+
         self.k = k
         self.d = X.shape[1]
         self.sf = sizeof(DTYPE_t)
@@ -157,22 +179,56 @@ cdef class ArgKmin:
 
         self.effective_omp_n_thread = _openmp_effective_n_threads()
 
-        # ArgKmin
-        self.Y_sq_norms = np.einsum('ij,ij->i', Y, Y)
+
+    cdef int _reduce_on_chunks(self,
+        DTYPE_t[:, ::1] X,                  # IN
+        DTYPE_t[:, ::1] Y,                  # IN
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil except -1:
+        """ Abstract method: Sub-classes implemented the reduction
+        on a pair of chunks"""
+        return -1
+
+cdef class ArgKmin(ParallelReduction):
+
+    cdef:
+        DTYPE_t ** dist_middle_terms_chunks
+        DTYPE_t ** heaps_red_distances_chunks
+        ITYPE_t ** heaps_indices_chunks
+
+    def __init__(self,
+                  DTYPE_t[:, ::1] X,
+                  DTYPE_t[:, ::1] Y,
+                  ITYPE_t k,
+                  ITYPE_t chunk_size = CHUNK_SIZE,
+    ):
+        ParallelReduction.__init__(self, X, Y, k)
 
         self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_red_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
 
     def __dealloc__(self):
-        if self.dist_middle_terms_chunks is not NULL:
-            free(self.dist_middle_terms_chunks)
-        if self.heaps_red_distances_chunks is not NULL:
-            free(self.heaps_red_distances_chunks)
         if self.heaps_indices_chunks is not NULL:
             free(self.heaps_indices_chunks)
+        else:
+            raise RuntimeError("Trying to free heaps_indices_chunks which is NULL")
+
+        if self.heaps_red_distances_chunks is not NULL:
+            free(self.heaps_red_distances_chunks)
+        else:
+            raise RuntimeError("Trying to free heaps_red_distances_chunks which is NULL")
 
-    cdef void _reduce_on_chunks(self,
+        if self.dist_middle_terms_chunks is not NULL:
+            free(self.dist_middle_terms_chunks)
+        else:
+            raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL")
+
+    cdef int _reduce_on_chunks(self,
         DTYPE_t[:, ::1] X,                  # IN
         DTYPE_t[:, ::1] Y,                  # IN
         ITYPE_t X_start,
@@ -180,55 +236,38 @@ cdef class ArgKmin:
         ITYPE_t Y_start,
         ITYPE_t Y_end,
         ITYPE_t thread_num,
-    ) nogil:
-        """
-        Critical part of the computation of pairwise distances.
-
-        "Fast Squared Euclidean" distances strategy relying
-        on the gemm-trick.
-        """
+    ) nogil except -1:
         cdef:
             ITYPE_t i, j
-            DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :]
-            DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :]
+            DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
+            DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
-        # Instead of computing the full pairwise squared distances matrix,
-        # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
-        # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
-        # term since the argmin for a given sample X_c^{i} does not depend on
-        # ||X_c^{i}||²
+            ITYPE_t n_x = X_end - X_start
+            ITYPE_t n_y = Y_end - Y_start
 
-        # Careful: LDA, LDB and LDC are given for F-ordered arrays.
-        # Here, we use their counterpart values as indicated in the documentation.
-        # See the documentation of parameters here:
-        # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
-        #
-        # dist_middle_terms = -2 * X_c.dot(Y_c.T)
-        _gemm(RowMajor, NoTrans, Trans,
-              X_c.shape[0], Y_c.shape[0], X_c.shape[1],
-              -2.0,
-              &X_c[0, 0], X_c.shape[1],
-              &Y_c[0, 0], X_c.shape[1], 0.0,
-              dist_middle_terms, Y_c.shape[0])
-
-        # Computing argmins here
         for i in range(X_c.shape[0]):
             for j in range(Y_c.shape[0]):
-                _push(heaps_red_distances + i * k,
-                      heaps_indices + i * k,
+                _push(heaps_red_distances + i * self.k,
+                      heaps_indices + i * self.k,
                       k,
-                      # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
-                      dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start],
-                      j + Y_start)
+                      0,
+                      # TODO: needs to move DistanceMetric
+                      # from neighbors to be able to use them
+                      # some adaptation
+                      # self.distance_metric.rdist(&X_c[i, 0],
+                      #                           &Y_c[j, 0],
+                      #                           self.d),
+                      Y_start + j)
 
+        return 0
 
     cdef int _parallel_on_X(self,
-        ITYPE_t[:, ::1] argkmin_indices,          # OUT
-        DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
+        ITYPE_t[:, ::1] argkmin_indices,
+        DTYPE_t[:, ::1] argkmin_red_distances,
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
         by parallelising computation on chunks of X.
@@ -250,7 +289,7 @@ cdef class ArgKmin:
             self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(heap_size)
 
             for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
-                # We reset the heap between X chunks (memset isn't suitable here)
+                # We reset the heap between X chunks (memset can't be used here)
                 for idx in range(self.X_n_samples_chunk * self.k):
                     self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF
 
@@ -260,6 +299,8 @@ cdef class ArgKmin:
                 else:
                     X_end = X_start + self.X_n_samples_chunk
 
+                # Referencing the thread-local heaps via the thread-scope pointer
+                # of pointers attached to the instance
                 self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0]
 
                 for Y_chunk_idx in range(self.Y_n_chunks):
@@ -292,6 +333,7 @@ cdef class ArgKmin:
         # end: with nogil, parallel
         return self.X_n_chunks
 
+
     cdef int _parallel_on_Y(self,
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
         DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
@@ -384,12 +426,13 @@ cdef class ArgKmin:
         # end: for X_chunk_idx
         return self.Y_n_chunks
 
+
     # Python interface
     def compute(self,
            str strategy = "auto",
            bint return_distance = False
-           ):
-        """Computes the argkmin of vectors (rows) of X on Y.
+    ):
+        """Computes the reduction of vectors (rows) of X on Y.
 
         strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}
             The chunking strategy defining which dataset
@@ -403,10 +446,9 @@ cdef class ArgKmin:
             between 'parallel_on_X' and 'parallel_on_Y'.
 
         return_distance: boolean
-            Return distances between each X vectory and its
+            Return distances between each X vector and its
             argkmin if set to True.
 
-
         Returns
         -------
         distances: ndarray of shape (n, k)
@@ -455,3 +497,72 @@ cdef class ArgKmin:
                     np.asarray(argkmin_indices))
 
         return np.asarray(argkmin_indices)
+
+cdef class FastSquaredEuclideanArgKmin(ArgKmin):
+
+    cdef:
+        DTYPE_t[::1] Y_sq_norms
+
+    def __init__(self,
+                  DTYPE_t[:, ::1] X,
+                  DTYPE_t[:, ::1] Y,
+                  ITYPE_t k,
+                  ITYPE_t chunk_size = CHUNK_SIZE,
+    ):
+        ArgKmin.__init__(self, X, Y, k)
+        self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
+
+
+    cdef int _reduce_on_chunks(self,
+        DTYPE_t[:, ::1] X,                  # IN
+        DTYPE_t[:, ::1] Y,                  # IN
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+        ITYPE_t Y_start,
+        ITYPE_t Y_end,
+        ITYPE_t thread_num,
+    ) nogil except -1:
+        """
+        Critical part of the computation of pairwise distances.
+
+        "Fast Squared Euclidean" distances strategy relying
+        on the gemm-trick.
+        """
+        cdef:
+            ITYPE_t i, j
+            DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
+            DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            ITYPE_t k = self.k
+            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
+            DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num]
+            ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
+
+        # Instead of computing the full pairwise squared distances matrix,
+        # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
+        # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
+        # term since the argmin for a given sample X_c^{i} does not depend on
+        # ||X_c^{i}||²
+
+        # Careful: LDA, LDB and LDC are given for F-ordered arrays.
+        # Here, we use their counterpart values as indicated in the documentation.
+        # See the documentation of parameters here:
+        # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
+        #
+        # dist_middle_terms = -2 * X_c.dot(Y_c.T)
+        _gemm(RowMajor, NoTrans, Trans,
+              X_c.shape[0], Y_c.shape[0], X_c.shape[1],
+              -2.0,
+              &X_c[0, 0], X_c.shape[1],
+              &Y_c[0, 0], X_c.shape[1], 0.0,
+              dist_middle_terms, Y_c.shape[0])
+
+        # Computing argmins here
+        for i in range(X_c.shape[0]):
+            for j in range(Y_c.shape[0]):
+                _push(heaps_red_distances + i * k,
+                      heaps_indices + i * k,
+                      k,
+                      # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                      dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start],
+                      j + Y_start)
+        return 0
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index f657ad5ccb49b..c800412677b09 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -31,7 +31,7 @@
 from ..utils.fixes import delayed
 from ..utils.fixes import sp_version, parse_version
 
-from ._argkmin_fast import ArgKmin
+from ._argkmin_fast import FastSquaredEuclideanArgKmin
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
 
@@ -648,7 +648,7 @@ def pairwise_distances_argmin_min(
 
     if metric == "fast_sqeuclidean":
         # TODO: generalise this simple plug here
-        values, indices = ArgKmin(X, Y, k=1).compute(
+        values, indices = FastSquaredEuclideanArgKmin(X, Y, k=1).compute(
             strategy="auto", return_distance=True
         )
         values = np.ndarray.flatten(values)
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 53774a3b78510..d3c886fc17e40 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -23,7 +23,7 @@
 from ..base import is_classifier
 from ..metrics import pairwise_distances_chunked
 from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
-from ..metrics._argkmin_fast import ArgKmin
+from ..metrics._argkmin_fast import FastSquaredEuclideanArgKmin
 from ..utils import (
     check_array,
     gen_even_slices,
@@ -740,7 +740,9 @@ class from an array representing our data set and ask who's
             self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean"
         ):
             # TODO: generalise this simple plug here
-            results = ArgKmin(X=X, Y=self._fit_X, k=n_neighbors).compute(
+            results = FastSquaredEuclideanArgKmin(
+                X=X, Y=self._fit_X, k=n_neighbors
+            ).compute(
                 strategy="auto",
                 return_distance=return_distance,
             )

From e0d1c99f69fb0061574c1db0c20fc3054c410c8a Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Fri, 25 Jun 2021 12:08:37 +0200
Subject: [PATCH 10/26] Move neighbors.DistanceMetric to metrics

The associated _typedefs.pyx file has been
moved to utils to avoid circular dependencies
has it is being used in neighbors.
---
 sklearn/cluster/_agglomerative.py                 |  4 ++--
 sklearn/cluster/_hierarchical_fast.pyx            | 15 +++++++--------
 sklearn/cluster/tests/test_hierarchical.py        |  5 +++--
 sklearn/metrics/__init__.py                       |  3 +++
 sklearn/{neighbors => metrics}/_dist_metrics.pxd  |  0
 sklearn/{neighbors => metrics}/_dist_metrics.pyx  |  4 ++--
 sklearn/metrics/pairwise.py                       |  2 +-
 sklearn/metrics/setup.py                          |  8 ++++++++
 .../tests/test_dist_metrics.py                    | 13 +------------
 sklearn/neighbors/__init__.py                     |  2 --
 sklearn/neighbors/_binary_tree.pxi                |  6 ++----
 sklearn/neighbors/_classification.py              |  8 ++++----
 sklearn/neighbors/_graph.py                       | 14 ++++++++------
 sklearn/neighbors/_partition_nodes.pxd            |  2 +-
 sklearn/neighbors/_regression.py                  |  8 ++++----
 sklearn/neighbors/_unsupervised.py                |  4 ++--
 sklearn/neighbors/setup.py                        |  7 -------
 sklearn/neighbors/tests/test_ball_tree.py         | 13 ++++++++++++-
 sklearn/neighbors/tests/test_neighbors_tree.py    |  2 +-
 19 files changed, 61 insertions(+), 59 deletions(-)
 rename sklearn/{neighbors => metrics}/_dist_metrics.pxd (100%)
 rename sklearn/{neighbors => metrics}/_dist_metrics.pyx (99%)
 rename sklearn/{neighbors => metrics}/tests/test_dist_metrics.py (95%)

diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index 48e2d38ebf32b..2c259e0287065 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -16,8 +16,8 @@
 
 from ..base import BaseEstimator, ClusterMixin
 from ..metrics.pairwise import paired_distances, pairwise_distances
-from ..neighbors import DistanceMetric
-from ..neighbors._dist_metrics import METRIC_MAPPING
+from ..metrics import DistanceMetric
+from ..metrics._dist_metrics import METRIC_MAPPING
 from ..utils import check_array
 from ..utils._fast_dict import IntFloatDict
 from ..utils.fixes import _astype_copy_false
diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx
index 2a58757ce327d..11ea3294c086a 100644
--- a/sklearn/cluster/_hierarchical_fast.pyx
+++ b/sklearn/cluster/_hierarchical_fast.pyx
@@ -13,7 +13,7 @@ ctypedef np.int8_t INT8
 
 np.import_array()
 
-from ..neighbors._dist_metrics cimport DistanceMetric
+from ..metrics._dist_metrics cimport DistanceMetric
 from ..utils._fast_dict cimport IntFloatDict
 
 # C++
@@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b,
 def average_merge(IntFloatDict a, IntFloatDict b,
               np.ndarray[ITYPE_t, ndim=1] mask,
               ITYPE_t n_a, ITYPE_t n_b):
-    """Merge two IntFloatDicts with the average strategy: when the 
-    same key is present in the two dicts, the weighted average of the two 
+    """Merge two IntFloatDicts with the average strategy: when the
+    same key is present in the two dicts, the weighted average of the two
     values is used.
 
     Parameters
@@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b,
 
 
 ###############################################################################
-# An edge object for fast comparisons 
+# An edge object for fast comparisons
 
 cdef class WeightedEdge:
     cdef public ITYPE_t a
     cdef public ITYPE_t b
     cdef public DTYPE_t weight
-    
+
     def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b):
         self.weight = weight
         self.a = a
@@ -326,7 +326,7 @@ cdef class WeightedEdge:
             return self.weight > other.weight
         elif op == 5:
             return self.weight >= other.weight
-        
+
     def __repr__(self):
         return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
                                               self.weight,
@@ -475,7 +475,7 @@ def mst_linkage_core(
 
     dist_metric: DistanceMetric
         A DistanceMetric object conforming to the API from
-        ``sklearn.neighbors._dist_metrics.pxd`` that will be
+        ``sklearn.metrics._dist_metrics.pxd`` that will be
         used to compute distances.
 
     Returns
@@ -534,4 +534,3 @@ def mst_linkage_core(
         current_node = new_node
 
     return np.array(result)
-
diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py
index 8aff7136c574f..73fee94b1b016 100644
--- a/sklearn/cluster/tests/test_hierarchical.py
+++ b/sklearn/cluster/tests/test_hierarchical.py
@@ -16,7 +16,7 @@
 from scipy.cluster import hierarchy
 
 from sklearn.metrics.cluster import adjusted_rand_score
-from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
+from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
 from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import ignore_warnings
@@ -30,6 +30,7 @@
     _fix_connectivity,
 )
 from sklearn.feature_extraction.image import grid_to_graph
+from sklearn.metrics import DistanceMetric
 from sklearn.metrics.pairwise import (
     PAIRED_DISTANCES,
     cosine_distances,
@@ -37,7 +38,7 @@
     pairwise_distances,
 )
 from sklearn.metrics.cluster import normalized_mutual_info_score
-from sklearn.neighbors import kneighbors_graph, DistanceMetric
+from sklearn.neighbors import kneighbors_graph
 from sklearn.cluster._hierarchical_fast import (
     average_merge,
     max_merge,
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index a0b06a02ad6d1..68409a7f85d35 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -36,6 +36,8 @@
 from ._classification import brier_score_loss
 from ._classification import multilabel_confusion_matrix
 
+from ._dist_metrics import DistanceMetric
+
 from . import cluster
 from .cluster import adjusted_mutual_info_score
 from .cluster import adjusted_rand_score
@@ -113,6 +115,7 @@
     "davies_bouldin_score",
     "DetCurveDisplay",
     "det_curve",
+    "DistanceMetric",
     "euclidean_distances",
     "explained_variance_score",
     "f1_score",
diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd
similarity index 100%
rename from sklearn/neighbors/_dist_metrics.pxd
rename to sklearn/metrics/_dist_metrics.pxd
diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx
similarity index 99%
rename from sklearn/neighbors/_dist_metrics.pyx
rename to sklearn/metrics/_dist_metrics.pyx
index c9941cab0fc60..8d28773821127 100755
--- a/sklearn/neighbors/_dist_metrics.pyx
+++ b/sklearn/metrics/_dist_metrics.pyx
@@ -108,7 +108,7 @@ cdef class DistanceMetric:
 
     Examples
     --------
-    >>> from sklearn.neighbors import DistanceMetric
+    >>> from sklearn.metrics import DistanceMetric
     >>> dist = DistanceMetric.get_metric('euclidean')
     >>> X = [[0, 1, 2],
              [3, 4, 5]]
@@ -513,7 +513,7 @@ cdef class ChebyshevDistance(DistanceMetric):
 
     Examples
     --------
-    >>> from sklearn.neighbors.dist_metrics import DistanceMetric
+    >>> from sklearn.metrics import DistanceMetric
     >>> dist = DistanceMetric.get_metric('chebyshev')
     >>> X = [[0, 1, 2],
     ...      [3, 4, 5]]
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c800412677b09..c7fc93034f85c 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -788,7 +788,7 @@ def haversine_distances(X, Y=None):
     array([[    0.        , 11099.54035582],
            [11099.54035582,     0.        ]])
     """
-    from ..neighbors import DistanceMetric
+    from ..metrics import DistanceMetric
 
     return DistanceMetric.get_metric("haversine").pairwise(X, Y)
 
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index 1edd6fe368d5e..346898eb60a94 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -1,4 +1,5 @@
 import os
+import numpy as np
 
 from numpy.distutils.misc_util import Configuration
 
@@ -22,6 +23,13 @@ def configuration(parent_package="", top_path=None):
         "_argkmin_fast", sources=["_argkmin_fast.pyx"], libraries=libraries
     )
 
+    config.add_extension(
+        "_dist_metrics",
+        sources=["_dist_metrics.pyx"],
+        include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")],
+        libraries=libraries,
+    )
+
     config.add_subpackage("tests")
 
     return config
diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
similarity index 95%
rename from sklearn/neighbors/tests/test_dist_metrics.py
rename to sklearn/metrics/tests/test_dist_metrics.py
index 0703819536916..efa8031c53935 100644
--- a/sklearn/neighbors/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -7,8 +7,7 @@
 import pytest
 
 from scipy.spatial.distance import cdist
-from sklearn.neighbors import DistanceMetric
-from sklearn.neighbors import BallTree
+from sklearn.metrics import DistanceMetric
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import create_memmap_backed_data
 from sklearn.utils.fixes import sp_version, parse_version
@@ -230,16 +229,6 @@ def test_pyfunc_metric():
     assert_array_almost_equal(D1_pkl, D2_pkl)
 
 
-def test_bad_pyfunc_metric():
-    def wrong_distance(x, y):
-        return "1"
-
-    X = np.ones((5, 2))
-    msg = "Custom distance function must accept two vectors"
-    with pytest.raises(TypeError, match=msg):
-        BallTree(X, metric=wrong_distance)
-
-
 def test_input_data_size():
     # Regression test for #6288
     # Previously, a metric requiring a particular input dimension would fail
diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py
index 8a0934eecf142..3cd1d7925acf6 100644
--- a/sklearn/neighbors/__init__.py
+++ b/sklearn/neighbors/__init__.py
@@ -5,7 +5,6 @@
 
 from ._ball_tree import BallTree
 from ._kd_tree import KDTree
-from ._dist_metrics import DistanceMetric
 from ._graph import kneighbors_graph, radius_neighbors_graph
 from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer
 from ._unsupervised import NearestNeighbors
@@ -19,7 +18,6 @@
 
 __all__ = [
     "BallTree",
-    "DistanceMetric",
     "KDTree",
     "KNeighborsClassifier",
     "KNeighborsRegressor",
diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi
index 37aa13b0a4f30..b64dbecac9e24 100755
--- a/sklearn/neighbors/_binary_tree.pxi
+++ b/sklearn/neighbors/_binary_tree.pxi
@@ -142,7 +142,6 @@
 #                                   BinaryTree tree2, ITYPE_t i_node2):
 #     """Compute the maximum distance between two nodes"""
 
-cimport cython
 cimport numpy as np
 from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma
 from libc.math cimport fmin, fmax
@@ -152,8 +151,7 @@ from libc.string cimport memcpy
 import numpy as np
 import warnings
 
-from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist,
-                             euclidean_dist_to_rdist, euclidean_rdist_to_dist)
+from ..metrics._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist)
 
 from ._partition_nodes cimport partition_node_indices
 
@@ -796,7 +794,7 @@ def newObj(obj):
 
 ######################################################################
 # define the reverse mapping of VALID_METRICS
-from ._dist_metrics import get_valid_metric_ids
+from sklearn.metrics._dist_metrics import get_valid_metric_ids
 VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS)
 
 
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py
index 1e47e1b8020f2..bf433fea30aea 100644
--- a/sklearn/neighbors/_classification.py
+++ b/sklearn/neighbors/_classification.py
@@ -67,8 +67,8 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
@@ -339,8 +339,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py
index d5bcaf9408c72..1fcb568e5dff4 100644
--- a/sklearn/neighbors/_graph.py
+++ b/sklearn/neighbors/_graph.py
@@ -65,10 +65,11 @@ def kneighbors_graph(
         between neighbors according to the given metric.
 
     metric : str, default='minkowski'
-        The distance metric used to calculate the k-Neighbors for each sample
-        point. The DistanceMetric class gives a list of available metrics.
-        The default distance is 'euclidean' ('minkowski' metric with the p
-        param equal to 2.)
+        The distance metric used to calculate the neighbors within a
+        given radius for each sample point. The default distance is
+        'euclidean' ('minkowski' metric with the param equal to 2.)
+        See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
 
     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
@@ -158,9 +159,10 @@ def radius_neighbors_graph(
 
     metric : str, default='minkowski'
         The distance metric used to calculate the neighbors within a
-        given radius for each sample point. The DistanceMetric class
-        gives a list of available metrics. The default distance is
+        given radius for each sample point. The default distance is
         'euclidean' ('minkowski' metric with the param equal to 2.)
+        See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
 
     p : int, default=2
         Power parameter for the Minkowski metric. When p = 1, this is
diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd
index 1659801db469d..94b02002d7a1e 100644
--- a/sklearn/neighbors/_partition_nodes.pxd
+++ b/sklearn/neighbors/_partition_nodes.pxd
@@ -1,4 +1,4 @@
-from sklearn.utils._typedefs cimport DTYPE_t, ITYPE_t
+from ..utils._typedefs cimport DTYPE_t, ITYPE_t
 
 cdef int partition_node_indices(
         DTYPE_t *data,
diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py
index fe536f06c20a5..77179f3bb317f 100644
--- a/sklearn/neighbors/_regression.py
+++ b/sklearn/neighbors/_regression.py
@@ -75,8 +75,8 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
@@ -301,8 +301,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py
index 06566b0807b7a..b11df8af8790f 100644
--- a/sklearn/neighbors/_unsupervised.py
+++ b/sklearn/neighbors/_unsupervised.py
@@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase):
     metric : str or callable, default='minkowski'
         the distance metric to use for the tree.  The default metric is
         minkowski, and with p=2 is equivalent to the standard Euclidean
-        metric. See the documentation of :class:`DistanceMetric` for a
-        list of available metrics.
+        metric. See the documentation of :class:`metrics.DistanceMetric`
+        for a list of available metrics.
         If metric is "precomputed", X is assumed to be a distance matrix and
         must be square during fit. X may be a :term:`sparse graph`,
         in which case only "nonzero" elements may be considered neighbors.
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py
index 34921de75041a..aa19ba501b18d 100644
--- a/sklearn/neighbors/setup.py
+++ b/sklearn/neighbors/setup.py
@@ -32,13 +32,6 @@ def configuration(parent_package="", top_path=None):
         libraries=libraries,
     )
 
-    config.add_extension(
-        "_dist_metrics",
-        sources=["_dist_metrics.pyx"],
-        include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")],
-        libraries=libraries,
-    )
-
     config.add_extension(
         "_quad_tree",
         sources=["_quad_tree.pyx"],
diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py
index c751539f2a1ae..a823a03251a1b 100644
--- a/sklearn/neighbors/tests/test_ball_tree.py
+++ b/sklearn/neighbors/tests/test_ball_tree.py
@@ -4,7 +4,6 @@
 import pytest
 from numpy.testing import assert_array_almost_equal
 from sklearn.neighbors._ball_tree import BallTree
-from sklearn.neighbors import DistanceMetric
 from sklearn.utils import check_random_state
 from sklearn.utils.validation import check_array
 from sklearn.utils._testing import _convert_container
@@ -40,6 +39,8 @@
 
 
 def brute_force_neighbors(X, Y, k, metric, **kwargs):
+    from sklearn.metrics import DistanceMetric
+
     X, Y = check_array(X), check_array(Y)
     D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
     ind = np.argsort(D, axis=1)[:, :k]
@@ -84,3 +85,13 @@ def test_array_object_type():
     X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object)
     with pytest.raises(ValueError, match="setting an array element with a sequence"):
         BallTree(X)
+
+
+def test_bad_pyfunc_metric():
+    def wrong_distance(x, y):
+        return "1"
+
+    X = np.ones((5, 2))
+    msg = "Custom distance function must accept two vectors"
+    with pytest.raises(TypeError, match=msg):
+        BallTree(X, metric=wrong_distance)
diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py
index de34b4d230171..e043ffb730708 100644
--- a/sklearn/neighbors/tests/test_neighbors_tree.py
+++ b/sklearn/neighbors/tests/test_neighbors_tree.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pytest
 
-from sklearn.neighbors import DistanceMetric
+from sklearn.metrics import DistanceMetric
 from sklearn.neighbors._ball_tree import (
     BallTree,
     kernel_norm,

From ac5ddc1dcfbfb4599371d7099fadfa8547bc8424 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 19:33:35 +0200
Subject: [PATCH 11/26] Rename private submodule to _parallel_reductions

---
 .../metrics/{_argkmin_fast.pyx => _parallel_reductions.pyx}   | 0
 sklearn/metrics/pairwise.py                                   | 2 +-
 sklearn/metrics/setup.py                                      | 4 +++-
 sklearn/neighbors/_base.py                                    | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)
 rename sklearn/metrics/{_argkmin_fast.pyx => _parallel_reductions.pyx} (100%)

diff --git a/sklearn/metrics/_argkmin_fast.pyx b/sklearn/metrics/_parallel_reductions.pyx
similarity index 100%
rename from sklearn/metrics/_argkmin_fast.pyx
rename to sklearn/metrics/_parallel_reductions.pyx
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c7fc93034f85c..4370b6d39d4b1 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -31,7 +31,7 @@
 from ..utils.fixes import delayed
 from ..utils.fixes import sp_version, parse_version
 
-from ._argkmin_fast import FastSquaredEuclideanArgKmin
+from ._parallel_reductions import FastSquaredEuclideanArgKmin
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
 
diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py
index 346898eb60a94..6fd445d2c1a00 100644
--- a/sklearn/metrics/setup.py
+++ b/sklearn/metrics/setup.py
@@ -20,7 +20,9 @@ def configuration(parent_package="", top_path=None):
     )
 
     config.add_extension(
-        "_argkmin_fast", sources=["_argkmin_fast.pyx"], libraries=libraries
+        "_parallel_reductions",
+        sources=["_parallel_reductions.pyx"],
+        libraries=libraries,
     )
 
     config.add_extension(
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index d3c886fc17e40..ff601ec3bb59a 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -23,7 +23,7 @@
 from ..base import is_classifier
 from ..metrics import pairwise_distances_chunked
 from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
-from ..metrics._argkmin_fast import FastSquaredEuclideanArgKmin
+from ..metrics._parallel_reductions import FastSquaredEuclideanArgKmin
 from ..utils import (
     check_array,
     gen_even_slices,

From 4e7b3cb7e67033b098d90110c8cb7e95d49e9936 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 19:57:33 +0200
Subject: [PATCH 12/26] Introduce DistanceMetric and ArgKmin factory method

---
 sklearn/metrics/_parallel_reductions.pyx | 86 +++++++++++++++++-------
 sklearn/metrics/pairwise.py              |  4 +-
 sklearn/neighbors/_base.py               |  6 +-
 3 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 62dce34bceb1c..0a1d870d748fa 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -17,7 +17,7 @@ from libc.stdlib cimport free, malloc
 
 from cython.parallel cimport parallel, prange
 
-# from ..neighbors._dist_metrics cimport DistanceMetric
+from ._dist_metrics cimport DistanceMetric
 
 DEF CHUNK_SIZE = 256  # number of vectors
 
@@ -93,9 +93,9 @@ cdef int _exact_euclidean_dist(
 
 cdef class ParallelReduction:
     """Abstract class to computes a reduction of a set of
-    vectors (rows) of X on another set of vectors (rows) of Y
+    vectors (rows) of X on another set of vectors (rows) of Y.
 
-    The implementation of the reduction is done parallelised
+    The implementation of the reduction is done parallelized
     on chunks whose size can be set using ``chunk_size``.
     Parameters
     ----------
@@ -103,8 +103,10 @@ cdef class ParallelReduction:
         Rows represent vectors
     Y: ndarray of shape (m, d)
         Rows represent vectors
+    distance_metric: DistanceMetric
+        The distance to use
     chunk_size: int
-        The number of vectors per chunk.
+        The number of vectors per chunk
     """
 
     cdef:
@@ -122,10 +124,7 @@ cdef class ParallelReduction:
         DTYPE_t[:, ::1] X
         DTYPE_t[:, ::1] Y
 
-        # TODO: needs to move DistanceMetric
-        # from neighbors to be able to use them
-        # some adaptation
-        # DistanceMetric distance_metric
+        DistanceMetric distance_metric
 
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
@@ -134,10 +133,10 @@ cdef class ParallelReduction:
         self.Y = np.empty((1, 1), dtype=DTYPE, order='c')
 
     def __init__(self,
-                  DTYPE_t[:, ::1] X,
-                  DTYPE_t[:, ::1] Y,
-                  ITYPE_t k,
-                  ITYPE_t chunk_size = CHUNK_SIZE,
+                 DTYPE_t[:, ::1] X,
+                 DTYPE_t[:, ::1] Y,
+                 DistanceMetric distance_metric,
+                 ITYPE_t chunk_size = CHUNK_SIZE,
     ):
         cdef:
             ITYPE_t X_n_full_chunks, Y_n_full_chunks
@@ -151,13 +150,14 @@ cdef class ParallelReduction:
             f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional."
         )
 
-        self.k = k
         self.d = X.shape[1]
         self.sf = sizeof(DTYPE_t)
         self.si = sizeof(ITYPE_t)
         self.chunk_size = chunk_size
         self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size)
 
+        self.distance_metric = distance_metric
+
         self.n_Y = Y.shape[0]
         self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk)
         Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk
@@ -194,20 +194,55 @@ cdef class ParallelReduction:
         return -1
 
 cdef class ArgKmin(ParallelReduction):
+    """Computes the argkmin of vectors (rows) of a set of
+    vectors (rows) of X on another set of vectors (rows) of Y.
+
+    The implementation is parallelized on chunks whose size can
+    be set using ``chunk_size``.
+    Parameters
+    ----------
+    X: ndarray of shape (n, d)
+        Rows represent vectors
+    Y: ndarray of shape (m, d)
+        Rows represent vectors
+    distance_metric: DistanceMetric
+        The distance to use
+    k: int
+        The k for the argkmin reduction
+    chunk_size: int
+        The number of vectors per chunk
+    """
 
     cdef:
         DTYPE_t ** dist_middle_terms_chunks
         DTYPE_t ** heaps_red_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
+    @classmethod
+    def get_for(cls,
+                DTYPE_t[:, ::1] X,
+                DTYPE_t[:, ::1] Y,
+                ITYPE_t k,
+                str metric="fast_sqeuclidean",
+                ITYPE_t chunk_size=CHUNK_SIZE,
+    ):
+        if metric == "fast_sqeuclidean":
+            return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size)
+        return ArgKmin(X=X, Y=Y,
+                       distance_metric=DistanceMetric.get_metric(metric),
+                       k=k,
+                       chunk_size=chunk_size)
+
     def __init__(self,
-                  DTYPE_t[:, ::1] X,
-                  DTYPE_t[:, ::1] Y,
-                  ITYPE_t k,
-                  ITYPE_t chunk_size = CHUNK_SIZE,
+                 DTYPE_t[:, ::1] X,
+                 DTYPE_t[:, ::1] Y,
+                 DistanceMetric distance_metric,
+                 ITYPE_t k,
+                 ITYPE_t chunk_size = CHUNK_SIZE,
     ):
-        ParallelReduction.__init__(self, X, Y, k)
+        ParallelReduction.__init__(self, X, Y, distance_metric, chunk_size)
 
+        self.k = k
         self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_red_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
@@ -254,13 +289,9 @@ cdef class ArgKmin(ParallelReduction):
                 _push(heaps_red_distances + i * self.k,
                       heaps_indices + i * self.k,
                       k,
-                      0,
-                      # TODO: needs to move DistanceMetric
-                      # from neighbors to be able to use them
-                      # some adaptation
-                      # self.distance_metric.rdist(&X_c[i, 0],
-                      #                           &Y_c[j, 0],
-                      #                           self.d),
+                      self.distance_metric.rdist(&X_c[i, 0],
+                                                 &Y_c[j, 0],
+                                                 self.d),
                       Y_start + j)
 
         return 0
@@ -509,7 +540,10 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
                   ITYPE_t k,
                   ITYPE_t chunk_size = CHUNK_SIZE,
     ):
-        ArgKmin.__init__(self, X, Y, k)
+        ArgKmin.__init__(self, X, Y,
+                         distance_metric=DistanceMetric.get_metric("euclidean"),
+                         k=k,
+                         chunk_size=chunk_size)
         self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
 
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 4370b6d39d4b1..75660cddb1ab8 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -31,7 +31,7 @@
 from ..utils.fixes import delayed
 from ..utils.fixes import sp_version, parse_version
 
-from ._parallel_reductions import FastSquaredEuclideanArgKmin
+from ._parallel_reductions import ArgKmin
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
 
@@ -648,7 +648,7 @@ def pairwise_distances_argmin_min(
 
     if metric == "fast_sqeuclidean":
         # TODO: generalise this simple plug here
-        values, indices = FastSquaredEuclideanArgKmin(X, Y, k=1).compute(
+        values, indices = ArgKmin.get_for(X=X, Y=Y, k=1, metric=metric).compute(
             strategy="auto", return_distance=True
         )
         values = np.ndarray.flatten(values)
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index ff601ec3bb59a..7e6578dbf22a5 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -23,7 +23,7 @@
 from ..base import is_classifier
 from ..metrics import pairwise_distances_chunked
 from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
-from ..metrics._parallel_reductions import FastSquaredEuclideanArgKmin
+from ..metrics._parallel_reductions import ArgKmin
 from ..utils import (
     check_array,
     gen_even_slices,
@@ -740,8 +740,8 @@ class from an array representing our data set and ask who's
             self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean"
         ):
             # TODO: generalise this simple plug here
-            results = FastSquaredEuclideanArgKmin(
-                X=X, Y=self._fit_X, k=n_neighbors
+            results = ArgKmin.get_for(
+                X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_
             ).compute(
                 strategy="auto",
                 return_distance=return_distance,

From d386fe11a58ae910e988a17c02276964b917bce1 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Wed, 30 Jun 2021 20:47:22 +0200
Subject: [PATCH 13/26] Support DistanceMetric's and branch on ArgKmin when
 possible

---
 sklearn/metrics/_parallel_reductions.pyx | 69 ++++++++++--------------
 sklearn/metrics/pairwise.py              | 21 ++++----
 sklearn/neighbors/_base.py               |  4 +-
 3 files changed, 41 insertions(+), 53 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 0a1d870d748fa..9009d0866c0ba 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -18,6 +18,7 @@ from libc.stdlib cimport free, malloc
 from cython.parallel cimport parallel, prange
 
 from ._dist_metrics cimport DistanceMetric
+from ._dist_metrics import METRIC_MAPPING
 
 DEF CHUNK_SIZE = 256  # number of vectors
 
@@ -64,32 +65,6 @@ cdef inline DTYPE_t _euclidean_dist(
 
     return sqrt(dist)
 
-cdef int _exact_euclidean_dist(
-    DTYPE_t[:, ::1] X,                  # IN
-    DTYPE_t[:, ::1] Y,                  # IN
-    ITYPE_t[:, ::1] Y_indices,          # IN
-    ITYPE_t n_threads,                  # IN
-    DTYPE_t[:, ::1] distances,          # OUT
-) nogil:
-    """
-    Compute exact pairwise euclidean distances in parallel.
-
-    The pairwise distances considered are X vectors
-    and a subset of Y given for each row if X given in
-    Y_indices.
-
-    Notes: the body of this function could have been inlined,
-    but we use a function to have a cdef nogil context.
-    """
-    cdef:
-        ITYPE_t i, k
-
-    for i in prange(X.shape[0], schedule='static',
-                    nogil=True, num_threads=n_threads):
-        for k in range(Y_indices.shape[1]):
-            distances[i, k] = _euclidean_dist(X, Y, i,
-                                              Y_indices[i, k])
-
 
 cdef class ParallelReduction:
     """Abstract class to computes a reduction of a set of
@@ -218,6 +193,10 @@ cdef class ArgKmin(ParallelReduction):
         DTYPE_t ** heaps_red_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
+    @classmethod
+    def valid_metrics(cls):
+        return {"fast_sqeuclidean", *METRIC_MAPPING.keys()}
+
     @classmethod
     def get_for(cls,
                 DTYPE_t[:, ::1] X,
@@ -225,11 +204,12 @@ cdef class ArgKmin(ParallelReduction):
                 ITYPE_t k,
                 str metric="fast_sqeuclidean",
                 ITYPE_t chunk_size=CHUNK_SIZE,
-    ):
+                dict metric_kwargs=dict(),
+        ):
         if metric == "fast_sqeuclidean":
             return FastSquaredEuclideanArgKmin(X=X, Y=Y, k=k, chunk_size=chunk_size)
         return ArgKmin(X=X, Y=Y,
-                       distance_metric=DistanceMetric.get_metric(metric),
+                       distance_metric=DistanceMetric.get_metric(metric, **metric_kwargs),
                        k=k,
                        chunk_size=chunk_size)
 
@@ -301,7 +281,7 @@ cdef class ArgKmin(ParallelReduction):
         DTYPE_t[:, ::1] argkmin_red_distances,
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
-        by parallelising computation on chunks of X.
+        by parallelizing computation on chunks of X.
         """
         cdef:
             ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
@@ -370,9 +350,9 @@ cdef class ArgKmin(ParallelReduction):
         DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
-        by parallelising computation on chunks of Y.
+        by parallelizing computation on chunks of Y.
 
-        This parallelisation strategy is more costly (as we need
+        This parallelization strategy is more costly (as we need
         extra heaps and synchronisation), yet it is useful in
         most contexts.
         """
@@ -457,6 +437,20 @@ cdef class ArgKmin(ParallelReduction):
         # end: for X_chunk_idx
         return self.Y_n_chunks
 
+    cdef void _exact_distances(self,
+        ITYPE_t[:, ::1] Y_indices,          # IN
+        DTYPE_t[:, ::1] distances,          # IN/OUT
+    ) nogil:
+        """Convert reduced distances to pairwise distances in parallel."""
+        cdef:
+            ITYPE_t i, j
+
+        for i in prange(self.n_X, schedule='static', nogil=True,
+                        num_threads=self.effective_omp_n_thread):
+            for j in range(self.k):
+                distances[i, j] = self.distance_metric.dist(&self.X[i, 0],
+                                                 &self.Y[Y_indices[i, j], 0],
+                                                 self.d)
 
     # Python interface
     def compute(self,
@@ -467,7 +461,7 @@ cdef class ArgKmin(ParallelReduction):
 
         strategy: str, {'auto', 'parallel_on_X', 'parallel_on_Y'}
             The chunking strategy defining which dataset
-            parallelisation are made on.
+            parallelization are made on.
 
              - 'parallel_on_X' is embarassingly parallel but
             is less used in practice.
@@ -518,14 +512,9 @@ cdef class ArgKmin(ParallelReduction):
 
         if return_distance:
             # We need to recompute distances because we relied on
-            # reduced distances using _gemm, which are missing a
-            # term for squared norms and which are not the most
-            # precise (catastrophic cancellation might have happened).
-            _exact_euclidean_dist(self.X, self.Y, argkmin_indices,
-                                  self.effective_omp_n_thread,
-                                  argkmin_distances)
-            return (np.asarray(argkmin_distances),
-                    np.asarray(argkmin_indices))
+            # reduced distances.
+            self._exact_distances(argkmin_indices, argkmin_distances)
+            return np.asarray(argkmin_distances), np.asarray(argkmin_indices)
 
         return np.asarray(argkmin_indices)
 
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index 75660cddb1ab8..c35b38ad1fde4 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -646,20 +646,19 @@ def pairwise_distances_argmin_min(
     """
     X, Y = check_pairwise_arrays(X, Y)
 
-    if metric == "fast_sqeuclidean":
-        # TODO: generalise this simple plug here
-        values, indices = ArgKmin.get_for(X=X, Y=Y, k=1, metric=metric).compute(
-            strategy="auto", return_distance=True
-        )
+    if axis == 0:
+        X, Y = Y, X
+
+    if metric_kwargs is None:
+        metric_kwargs = {}
+
+    if metric in ArgKmin.valid_metrics():
+        values, indices = ArgKmin.get_for(
+            X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs
+        ).compute(strategy="auto", return_distance=True)
         values = np.ndarray.flatten(values)
         indices = np.ndarray.flatten(indices)
     else:
-        if metric_kwargs is None:
-            metric_kwargs = {}
-
-        if axis == 0:
-            X, Y = Y, X
-
         indices, values = zip(
             *pairwise_distances_chunked(
                 X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 7e6578dbf22a5..e97894e8731c4 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -737,9 +737,9 @@ class from an array representing our data set and ask who's
             )
 
         elif (
-            self._fit_method == "brute" and self.effective_metric_ == "fast_sqeuclidean"
+            self._fit_method == "brute"
+            and self.effective_metric_ in ArgKmin.valid_metrics()
         ):
-            # TODO: generalise this simple plug here
             results = ArgKmin.get_for(
                 X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_
             ).compute(

From a61e81f42f2aeea65153f83b16d7d2650a3d3f41 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 09:09:06 +0200
Subject: [PATCH 14/26] Document GEMM call and change wording to "approximated
 distance"

---
 sklearn/metrics/_parallel_reductions.pyx | 97 ++++++++++++++----------
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 9009d0866c0ba..3eac36ae3980d 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -190,7 +190,7 @@ cdef class ArgKmin(ParallelReduction):
 
     cdef:
         DTYPE_t ** dist_middle_terms_chunks
-        DTYPE_t ** heaps_red_distances_chunks
+        DTYPE_t ** heaps_approx_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
     @classmethod
@@ -224,7 +224,7 @@ cdef class ArgKmin(ParallelReduction):
 
         self.k = k
         self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
-        self.heaps_red_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
+        self.heaps_approx_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
 
     def __dealloc__(self):
@@ -233,10 +233,10 @@ cdef class ArgKmin(ParallelReduction):
         else:
             raise RuntimeError("Trying to free heaps_indices_chunks which is NULL")
 
-        if self.heaps_red_distances_chunks is not NULL:
-            free(self.heaps_red_distances_chunks)
+        if self.heaps_approx_distances_chunks is not NULL:
+            free(self.heaps_approx_distances_chunks)
         else:
-            raise RuntimeError("Trying to free heaps_red_distances_chunks which is NULL")
+            raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL")
 
         if self.dist_middle_terms_chunks is not NULL:
             free(self.dist_middle_terms_chunks)
@@ -258,7 +258,7 @@ cdef class ArgKmin(ParallelReduction):
             DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
-            DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num]
+            DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
             ITYPE_t n_x = X_end - X_start
@@ -266,7 +266,7 @@ cdef class ArgKmin(ParallelReduction):
 
         for i in range(X_c.shape[0]):
             for j in range(Y_c.shape[0]):
-                _push(heaps_red_distances + i * self.k,
+                _push(heaps_approx_distances + i * self.k,
                       heaps_indices + i * self.k,
                       k,
                       self.distance_metric.rdist(&X_c[i, 0],
@@ -278,7 +278,7 @@ cdef class ArgKmin(ParallelReduction):
 
     cdef int _parallel_on_X(self,
         ITYPE_t[:, ::1] argkmin_indices,
-        DTYPE_t[:, ::1] argkmin_red_distances,
+        DTYPE_t[:, ::1] argkmin_approx_distances,
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
         by parallelizing computation on chunks of X.
@@ -297,12 +297,12 @@ cdef class ArgKmin(ParallelReduction):
             thread_num = openmp.omp_get_thread_num()
             # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
             self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
-            self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(heap_size)
+            self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t*> malloc(heap_size)
 
             for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
                 # We reset the heap between X chunks (memset can't be used here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF
+                    self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
 
                 X_start = X_chunk_idx * self.X_n_samples_chunk
                 if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
@@ -332,14 +332,14 @@ cdef class ArgKmin(ParallelReduction):
                 # Sorting indices so that the closests' come first.
                 for idx in range(X_end - X_start):
                     _simultaneous_sort(
-                        self.heaps_red_distances_chunks[thread_num] + idx * self.k,
+                        self.heaps_approx_distances_chunks[thread_num] + idx * self.k,
                         &argkmin_indices[X_start + idx, 0],
                         self.k
                     )
 
             # end: for X_chunk_idx
             free(self.dist_middle_terms_chunks[thread_num])
-            free(self.heaps_red_distances_chunks[thread_num])
+            free(self.heaps_approx_distances_chunks[thread_num])
 
         # end: with nogil, parallel
         return self.X_n_chunks
@@ -347,7 +347,7 @@ cdef class ArgKmin(ParallelReduction):
 
     cdef int _parallel_on_Y(self,
         ITYPE_t[:, ::1] argkmin_indices,          # OUT
-        DTYPE_t[:, ::1] argkmin_red_distances,   # OUT
+        DTYPE_t[:, ::1] argkmin_approx_distances,   # OUT
     ) nogil:
         """Computes the argkmin of each vector (row) of X on Y
         by parallelizing computation on chunks of Y.
@@ -379,7 +379,7 @@ cdef class ArgKmin(ParallelReduction):
 
                 # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
                 self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
-                self.heaps_red_distances_chunks[thread_num] = <DTYPE_t*> malloc(float_heap_size)
+                self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t*> malloc(float_heap_size)
 
                 # As chunks of X are shared across threads, so must their
                 # heaps. To solve this, each thread has its own locals
@@ -388,7 +388,7 @@ cdef class ArgKmin(ParallelReduction):
 
                 # Initialising heaps (memset can't be used here)
                 for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_red_distances_chunks[thread_num][idx] = FLOAT_INF
+                    self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
                     self.heaps_indices_chunks[thread_num][idx] = -1
 
                 for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
@@ -415,10 +415,10 @@ cdef class ArgKmin(ParallelReduction):
                     for idx in range(X_end - X_start):
                         for jdx in range(self.k):
                             _push(
-                                &argkmin_red_distances[X_start + idx, 0],
+                                &argkmin_approx_distances[X_start + idx, 0],
                                 &argkmin_indices[X_start + idx, 0],
                                 self.k,
-                                self.heaps_red_distances_chunks[thread_num][idx * self.k + jdx],
+                                self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
                                 self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                             )
 
@@ -428,7 +428,7 @@ cdef class ArgKmin(ParallelReduction):
             for idx in prange(self.n_X, schedule='static',
                               nogil=True, num_threads=num_threads):
                 _simultaneous_sort(
-                    &argkmin_red_distances[idx, 0],
+                    &argkmin_approx_distances[idx, 0],
                     &argkmin_indices[idx, 0],
                     self.k,
                 )
@@ -557,35 +557,52 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
             DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
-            DTYPE_t *heaps_red_distances = self.heaps_red_distances_chunks[thread_num]
+            DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
-        # Instead of computing the full pairwise squared distances matrix,
-        # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
-        # we only need to store the - 2 X_c.Y_c^T + ||Y_c||²
-        # term since the argmin for a given sample X_c^{i} does not depend on
-        # ||X_c^{i}||²
-
-        # Careful: LDA, LDB and LDC are given for F-ordered arrays.
-        # Here, we use their counterpart values as indicated in the documentation.
-        # See the documentation of parameters here:
-        # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
-        #
+            # Instead of computing the full pairwise squared distances matrix,
+            #
+            #      ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²,
+            #
+            # we only need to store the
+            #                                - 2 X_c.Y_c^T + ||Y_c||²
+            #
+            # term since the argkmin for a given sample X_c^{i} does not depend on
+            # ||X_c^{i}||²
+            #
+            # This term gets computed efficiently bellow using GEMM from BLAS Level 3.
+            #
+            # Careful: LDA, LDB and LDC are given for F-ordered arrays in BLAS documentations,
+            # for instance:
+            # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html
+            #
+            # Here, we use their counterpart values to work with C-ordered arrays.
+            BLAS_Order order = RowMajor
+            BLAS_Trans ta = NoTrans
+            BLAS_Trans tb = Trans
+            ITYPE_t m = X_c.shape[0]
+            ITYPE_t n = Y_c.shape[0]
+            ITYPE_t K = X_c.shape[1]
+            DTYPE_t alpha = - 2.
+            DTYPE_t * A = & X_c[0, 0]
+            ITYPE_t lda = X_c.shape[1]
+            DTYPE_t * B = & Y_c[0, 0]
+            ITYPE_t ldb = X_c.shape[1]
+            DTYPE_t beta = 0.
+            DTYPE_t * C = dist_middle_terms
+            ITYPE_t ldc = Y_c.shape[0]
+
         # dist_middle_terms = -2 * X_c.dot(Y_c.T)
-        _gemm(RowMajor, NoTrans, Trans,
-              X_c.shape[0], Y_c.shape[0], X_c.shape[1],
-              -2.0,
-              &X_c[0, 0], X_c.shape[1],
-              &Y_c[0, 0], X_c.shape[1], 0.0,
-              dist_middle_terms, Y_c.shape[0])
-
-        # Computing argmins here
+        _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc)
+
+        # Pushing the distance and their associated indices on heaps
+        # which keep tracks of the argkmin.
         for i in range(X_c.shape[0]):
             for j in range(Y_c.shape[0]):
-                _push(heaps_red_distances + i * k,
+                _push(heaps_approx_distances + i * k,
                       heaps_indices + i * k,
                       k,
-                      # reduced distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
+                      # approximated distance: - 2 X_c_i.Y_c_j^T + ||Y_c_j||²
                       dist_middle_terms[i * Y_c.shape[0] + j] + self.Y_sq_norms[j + Y_start],
                       j + Y_start)
         return 0

From e0d2881f7455b1c2ea925740017d9bd76c8053d9 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 10:01:38 +0200
Subject: [PATCH 15/26] Support memmapped and integral arrays

This also makes memoryviews const, clarifying their use.
---
 sklearn/metrics/_parallel_reductions.pyx | 89 +++++++++++++-----------
 1 file changed, 50 insertions(+), 39 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 3eac36ae3980d..01d1a10371615 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -19,6 +19,7 @@ from cython.parallel cimport parallel, prange
 
 from ._dist_metrics cimport DistanceMetric
 from ._dist_metrics import METRIC_MAPPING
+from ..utils import check_array
 
 DEF CHUNK_SIZE = 256  # number of vectors
 
@@ -85,21 +86,22 @@ cdef class ParallelReduction:
     """
 
     cdef:
-        ITYPE_t effective_omp_n_thread
+        const DTYPE_t[:, ::1] X  # shape: (n_X, d)
+        const DTYPE_t[:, ::1] Y  # shape: (n_Y, d)
+
+        DistanceMetric distance_metric
 
-        ITYPE_t k, d, sf, si
+        ITYPE_t effective_omp_n_thread
         ITYPE_t n_samples_chunk, chunk_size
 
-        ITYPE_t n_Y, Y_n_samples_chunk, Y_n_samples_rem
-        ITYPE_t n_X, X_n_samples_chunk, X_n_samples_rem
+        ITYPE_t d
 
-        # Counting remainder chunk in total number of chunks
-        ITYPE_t Y_n_chunks, X_n_chunks, num_threads
+        # dtypes sizes
+        ITYPE_t sf, si
 
-        DTYPE_t[:, ::1] X
-        DTYPE_t[:, ::1] Y
+        ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem
+        ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem
 
-        DistanceMetric distance_metric
 
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
@@ -108,22 +110,23 @@ cdef class ParallelReduction:
         self.Y = np.empty((1, 1), dtype=DTYPE, order='c')
 
     def __init__(self,
-                 DTYPE_t[:, ::1] X,
-                 DTYPE_t[:, ::1] Y,
+                 X,
+                 Y,
                  DistanceMetric distance_metric,
                  ITYPE_t chunk_size = CHUNK_SIZE,
     ):
         cdef:
             ITYPE_t X_n_full_chunks, Y_n_full_chunks
-        self.X = X
-        self.Y = Y
-
-        # TODO: use proper internals checks of scikit-learn
-        assert X.shape[1] == Y.shape[1], (
-            f"Vectors of X and Y must have the same "
-            f"number of dimensions but are respectively "
-            f"{X.shape[1]}-dimensional and {Y.shape[1]}-dimensional."
-        )
+
+        self.effective_omp_n_thread = _openmp_effective_n_threads()
+
+        self.X = check_array(X, dtype=DTYPE)
+        self.Y = check_array(Y, dtype=DTYPE)
+
+        assert X.shape[1] == Y.shape[1], "Vectors of X and Y must have the " \
+                                         "same dimension but currently are " \
+                                         f"respectively {X.shape[1]}-dimensional " \
+                                         f"and {Y.shape[1]}-dimensional."
 
         self.d = X.shape[1]
         self.sf = sizeof(DTYPE_t)
@@ -152,12 +155,9 @@ cdef class ParallelReduction:
             self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
         )
 
-        self.effective_omp_n_thread = _openmp_effective_n_threads()
-
-
     cdef int _reduce_on_chunks(self,
-        DTYPE_t[:, ::1] X,                  # IN
-        DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,                  # IN
+        const DTYPE_t[:, ::1] Y,                  # IN
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -189,18 +189,23 @@ cdef class ArgKmin(ParallelReduction):
     """
 
     cdef:
+        ITYPE_t k
+
         DTYPE_t ** dist_middle_terms_chunks
         DTYPE_t ** heaps_approx_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
+        ITYPE_t[:, ::1] argkmin_indices
+        DTYPE_t[:, ::1] argkmin_distances
+
     @classmethod
     def valid_metrics(cls):
         return {"fast_sqeuclidean", *METRIC_MAPPING.keys()}
 
     @classmethod
     def get_for(cls,
-                DTYPE_t[:, ::1] X,
-                DTYPE_t[:, ::1] Y,
+                X,
+                Y,
                 ITYPE_t k,
                 str metric="fast_sqeuclidean",
                 ITYPE_t chunk_size=CHUNK_SIZE,
@@ -214,8 +219,8 @@ cdef class ArgKmin(ParallelReduction):
                        chunk_size=chunk_size)
 
     def __init__(self,
-                 DTYPE_t[:, ::1] X,
-                 DTYPE_t[:, ::1] Y,
+                 X,
+                 Y,
                  DistanceMetric distance_metric,
                  ITYPE_t k,
                  ITYPE_t chunk_size = CHUNK_SIZE,
@@ -223,6 +228,12 @@ cdef class ArgKmin(ParallelReduction):
         ParallelReduction.__init__(self, X, Y, distance_metric, chunk_size)
 
         self.k = k
+
+        # Results returned by ArgKmin.compute
+        self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE)
+        self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE)
+
+        # Temporary datastructures used in threads
         self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_approx_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
@@ -244,8 +255,8 @@ cdef class ArgKmin(ParallelReduction):
             raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL")
 
     cdef int _reduce_on_chunks(self,
-        DTYPE_t[:, ::1] X,                  # IN
-        DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,                  # IN
+        const DTYPE_t[:, ::1] Y,                  # IN
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -254,8 +265,8 @@ cdef class ArgKmin(ParallelReduction):
     ) nogil except -1:
         cdef:
             ITYPE_t i, j
-            DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
-            DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
@@ -524,8 +535,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         DTYPE_t[::1] Y_sq_norms
 
     def __init__(self,
-                  DTYPE_t[:, ::1] X,
-                  DTYPE_t[:, ::1] Y,
+                  X,
+                  Y,
                   ITYPE_t k,
                   ITYPE_t chunk_size = CHUNK_SIZE,
     ):
@@ -537,8 +548,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 
 
     cdef int _reduce_on_chunks(self,
-        DTYPE_t[:, ::1] X,                  # IN
-        DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,                  # IN
+        const DTYPE_t[:, ::1] Y,                  # IN
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -553,8 +564,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
         """
         cdef:
             ITYPE_t i, j
-            DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
-            DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
+            const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
+            const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
             DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]

From b23f97294daf62f0009c637190516e2ce85219b5 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 10:24:21 +0200
Subject: [PATCH 16/26] Pull _parallel_on_{X,Y} up on ParallelReduction

Also add private template methods, allowing
each reduction to interact with its private
datastructures at various stages.

Reintroduce thread-local datastructures deallocation
(fix-up for 80aaf0bcb13cf31f5b2a251d615410c6b680db8f)
---
 sklearn/metrics/_parallel_reductions.pyx | 425 ++++++++++++++---------
 1 file changed, 261 insertions(+), 164 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 01d1a10371615..db484002f67a5 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -155,9 +155,164 @@ cdef class ParallelReduction:
             self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
         )
 
+    cdef void _on_X_parallel_init(self,
+            ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _on_X_parallel_finalize(self,
+            ITYPE_t thread_num
+    ) nogil:
+        return
+
+    cdef void _on_X_prange_iter_init(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _on_X_prange_iter_finalize(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_X(self) nogil:
+        """Computes the reduction of each vector (row) of X on Y
+        by parallelizing computation on chunks of X.
+
+        Private datastructures are modified internally by threads.
+
+        Private template methods can be implemented on subclasses to
+        interact with those datastructures at various stages.
+        """
+        cdef:
+            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
+            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t thread_num
+
+        with nogil, parallel(num_threads=num_threads):
+            thread_num = openmp.omp_get_thread_num()
+
+            # Allocating thread local datastructures
+            self._on_X_parallel_init(thread_num)
+
+            for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
+                X_start = X_chunk_idx * self.X_n_samples_chunk
+                if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
+                    X_end = X_start + self.X_n_samples_rem
+                else:
+                    X_end = X_start + self.X_n_samples_chunk
+
+                # Reinitializing thread local datastructures for the new X chunk
+                self._on_X_prange_iter_init(thread_num, X_chunk_idx, X_start, X_end)
+
+                for Y_chunk_idx in range(self.Y_n_chunks):
+                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
+                    if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0:
+                        Y_end = Y_start + self.Y_n_samples_rem
+                    else:
+                        Y_end = Y_start + self.Y_n_samples_chunk
+
+                    self._reduce_on_chunks(
+                        self.X,
+                        self.Y,
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+
+                # Adjusting thread local datastructures on the full pass on Y
+                self._on_X_prange_iter_finalize(thread_num, X_chunk_idx, X_start, X_end)
+
+            # end: for X_chunk_idx
+
+            # Deallocating thread local datastructures
+            self._on_X_parallel_finalize(thread_num)
+
+        # end: with nogil, parallel
+        return
+
+    cdef void _on_Y_parallel_init(self,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _on_Y_parallel_finalize(self,
+        ITYPE_t thread_num,
+        ITYPE_t X_chunk_idx,
+        ITYPE_t X_start,
+        ITYPE_t X_end,
+    ) nogil:
+        return
+
+    cdef void _on_Y_finalize(self,
+        ITYPE_t thread_num,
+    ) nogil:
+        return
+
+    cdef void _parallel_on_Y(self) nogil:
+        """Computes the argkmin of each vector (row) of X on Y
+        by parallelizing computation on chunks of Y.
+
+        Private datastructures are modified internally by threads.
+
+        Private template methods can be implemented on subclasses to
+        interact with those datastructures at various stages.
+        """
+        cdef:
+            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx
+            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t thread_num
+
+        for X_chunk_idx in range(self.X_n_chunks):
+            X_start = X_chunk_idx * self.X_n_samples_chunk
+            if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
+                X_end = X_start + self.X_n_samples_rem
+            else:
+                X_end = X_start + self.X_n_samples_chunk
+
+            with nogil, parallel(num_threads=num_threads):
+                # Thread local buffers
+                thread_num = openmp.omp_get_thread_num()
+
+                # Allocating thread local datastructures
+                self._on_Y_parallel_init(thread_num)
+
+                for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
+                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
+                    if Y_chunk_idx == self.Y_n_chunks - 1 \
+                            and self.Y_n_samples_rem > 0:
+                        Y_end = Y_start + self.Y_n_samples_rem
+                    else:
+                        Y_end = Y_start + self.Y_n_samples_chunk
+
+                    self._reduce_on_chunks(
+                        self.X,
+                        self.Y,
+                        X_start, X_end,
+                        Y_start, Y_end,
+                        thread_num,
+                    )
+                # end: prange
+
+                # Synchronizing thread local datastructures with the main ones
+                # This can potentially block
+                self._on_Y_parallel_finalize(thread_num, X_chunk_idx, X_start, X_end)
+            # end: with nogil, parallel
+
+        # end: for X_chunk_idx
+        # Adjusting main datastructures before returning
+        self._on_Y_finalize(num_threads)
+        return
+
     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,                  # IN
-        const DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -255,8 +410,8 @@ cdef class ArgKmin(ParallelReduction):
             raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL")
 
     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,                  # IN
-        const DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,
@@ -287,170 +442,124 @@ cdef class ArgKmin(ParallelReduction):
 
         return 0
 
-    cdef int _parallel_on_X(self,
-        ITYPE_t[:, ::1] argkmin_indices,
-        DTYPE_t[:, ::1] argkmin_approx_distances,
+    cdef void _on_X_parallel_init(self,
+            ITYPE_t thread_num,
     ) nogil:
-        """Computes the argkmin of each vector (row) of X on Y
-        by parallelizing computation on chunks of X.
-        """
         cdef:
-            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
-            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
-            ITYPE_t thread_num
-
             # in bytes
             ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
             ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf
 
-        with nogil, parallel(num_threads=num_threads):
-            # Thread local buffers
-            thread_num = openmp.omp_get_thread_num()
-            # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-            self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
-            self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t*> malloc(heap_size)
-
-            for X_chunk_idx in prange(self.X_n_chunks, schedule='static'):
-                # We reset the heap between X chunks (memset can't be used here)
-                for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
-
-                X_start = X_chunk_idx * self.X_n_samples_chunk
-                if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
-                    X_end = X_start + self.X_n_samples_rem
-                else:
-                    X_end = X_start + self.X_n_samples_chunk
-
-                # Referencing the thread-local heaps via the thread-scope pointer
-                # of pointers attached to the instance
-                self.heaps_indices_chunks[thread_num] = &argkmin_indices[X_start, 0]
-
-                for Y_chunk_idx in range(self.Y_n_chunks):
-                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
-                    if Y_chunk_idx == self.Y_n_chunks - 1 and self.Y_n_samples_rem > 0:
-                        Y_end = Y_start + self.Y_n_samples_rem
-                    else:
-                        Y_end = Y_start + self.Y_n_samples_chunk
-
-                    self._reduce_on_chunks(
-                        self.X,
-                        self.Y,
-                        X_start, X_end,
-                        Y_start, Y_end,
-                        thread_num,
-                    )
-
-                # Sorting indices so that the closests' come first.
-                for idx in range(X_end - X_start):
-                    _simultaneous_sort(
-                        self.heaps_approx_distances_chunks[thread_num] + idx * self.k,
-                        &argkmin_indices[X_start + idx, 0],
-                        self.k
-                    )
+        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(size_dist_middle_terms)
+        self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(heap_size)
 
-            # end: for X_chunk_idx
-            free(self.dist_middle_terms_chunks[thread_num])
-            free(self.heaps_approx_distances_chunks[thread_num])
+    cdef void _on_X_prange_iter_init(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
+    ) nogil:
 
-        # end: with nogil, parallel
-        return self.X_n_chunks
+        # We reset the heap between X chunks (memset can't be used here)
+        for idx in range(self.X_n_samples_chunk * self.k):
+            self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
 
+        # Referencing the thread-local heaps via the thread-scope pointer
+        # of pointers attached to the instance
+        self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0]
 
-    cdef int _parallel_on_Y(self,
-        ITYPE_t[:, ::1] argkmin_indices,          # OUT
-        DTYPE_t[:, ::1] argkmin_approx_distances,   # OUT
+    cdef void _on_X_prange_iter_finalize(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
     ) nogil:
-        """Computes the argkmin of each vector (row) of X on Y
-        by parallelizing computation on chunks of Y.
-
-        This parallelization strategy is more costly (as we need
-        extra heaps and synchronisation), yet it is useful in
-        most contexts.
-        """
         cdef:
-            ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx, idx, jdx
-            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t idx, jdx
+
+        # Sorting indices of the argkmin for each query vector of X
+        for idx in range(X_end - X_start):
+            _simultaneous_sort(
+                self.heaps_approx_distances_chunks[thread_num] + idx * self.k,
+                &self.argkmin_indices[X_start + idx, 0],
+                self.k
+            )
+
+    cdef void _on_X_parallel_finalize(self,
             ITYPE_t thread_num
+    ) nogil:
+        free(self.dist_middle_terms_chunks[thread_num])
+        free(self.heaps_approx_distances_chunks[thread_num])
 
+    cdef void _on_Y_parallel_init(self,
+            ITYPE_t thread_num,
+    ) nogil:
+        cdef:
             # in bytes
             ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
             ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si
             ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf
 
-        for X_chunk_idx in range(self.X_n_chunks):
-            X_start = X_chunk_idx * self.X_n_samples_chunk
-            if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_rem > 0:
-                X_end = X_start + self.X_n_samples_rem
-            else:
-                X_end = X_start + self.X_n_samples_chunk
-
-            with nogil, parallel(num_threads=num_threads):
-                # Thread local buffers
-                thread_num = openmp.omp_get_thread_num()
-
-                # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-                self.dist_middle_terms_chunks[thread_num] = <DTYPE_t*> malloc(size_dist_middle_terms)
-                self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t*> malloc(float_heap_size)
-
-                # As chunks of X are shared across threads, so must their
-                # heaps. To solve this, each thread has its own locals
-                # heaps which are then synchronised back in the main ones.
-                self.heaps_indices_chunks[thread_num] = <ITYPE_t*> malloc(int_heap_size)
-
-                # Initialising heaps (memset can't be used here)
-                for idx in range(self.X_n_samples_chunk * self.k):
-                    self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
-                    self.heaps_indices_chunks[thread_num][idx] = -1
-
-                for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'):
-                    Y_start = Y_chunk_idx * self.Y_n_samples_chunk
-                    if Y_chunk_idx == self.Y_n_chunks - 1 \
-                        and self.Y_n_samples_rem > 0:
-                        Y_end = Y_start + self.Y_n_samples_rem
-                    else:
-                        Y_end = Y_start + self.Y_n_samples_chunk
-
-
-                    self._reduce_on_chunks(
-                        self.X,
-                        self.Y,
-                        X_start, X_end,
-                        Y_start, Y_end,
-                        thread_num,
+        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(size_dist_middle_terms)
+        self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(float_heap_size)
+
+        # As chunks of X are shared across threads, so must their
+        # heaps. To solve this, each thread has its own locals
+        # heaps which are then synchronised back in the main ones.
+        self.heaps_indices_chunks[thread_num] = <ITYPE_t *> malloc(int_heap_size)
+
+        # Initialising heaps (memset can't be used here)
+        for idx in range(self.X_n_samples_chunk * self.k):
+            self.heaps_approx_distances_chunks[thread_num][idx] = FLOAT_INF
+            self.heaps_indices_chunks[thread_num][idx] = -1
+
+    cdef void _on_Y_parallel_finalize(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
+    ) nogil:
+        cdef:
+            ITYPE_t idx, jdx
+        with gil:
+            # Synchronising the thread local heaps
+            # with the main heaps
+            for idx in range(X_end - X_start):
+                for jdx in range(self.k):
+                    _push(
+                        &self.argkmin_distances[X_start + idx, 0],
+                        &self.argkmin_indices[X_start + idx, 0],
+                        self.k,
+                        self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
+                        self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                     )
 
-                # end: for Y_chunk_idx
-                with gil:
-                    # Synchronising the thread local heaps
-                    # with the main heaps
-                    for idx in range(X_end - X_start):
-                        for jdx in range(self.k):
-                            _push(
-                                &argkmin_approx_distances[X_start + idx, 0],
-                                &argkmin_indices[X_start + idx, 0],
-                                self.k,
-                                self.heaps_approx_distances_chunks[thread_num][idx * self.k + jdx],
-                                self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
-                            )
+        free(self.dist_middle_terms_chunks[thread_num])
+        free(self.heaps_approx_distances_chunks[thread_num])
+        free(self.heaps_indices_chunks[thread_num])
 
-            # end: with nogil, parallel
-
-            # Sorting indices of the argkmin for each query vector of X
-            for idx in prange(self.n_X, schedule='static',
-                              nogil=True, num_threads=num_threads):
-                _simultaneous_sort(
-                    &argkmin_approx_distances[idx, 0],
-                    &argkmin_indices[idx, 0],
-                    self.k,
-                )
-            # end: prange
-
-        # end: for X_chunk_idx
-        return self.Y_n_chunks
+    cdef void _on_Y_finalize(self,
+            ITYPE_t thread_num,
+    ) nogil:
+        cdef:
+            ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread)
+            ITYPE_t idx
+
+        # Sorting indices of the argkmin for each query vector of X
+        for idx in prange(self.n_X, schedule='static',
+                          nogil=True, num_threads=num_threads):
+            _simultaneous_sort(
+                &self.argkmin_distances[idx, 0],
+                &self.argkmin_indices[idx, 0],
+                self.k,
+            )
+        return
 
     cdef void _exact_distances(self,
-        ITYPE_t[:, ::1] Y_indices,          # IN
-        DTYPE_t[:, ::1] distances,          # IN/OUT
+        ITYPE_t[:, ::1] Y_indices,  # IN
+        DTYPE_t[:, ::1] distances,  # IN/OUT
     ) nogil:
         """Convert reduced distances to pairwise distances in parallel."""
         cdef:
@@ -494,40 +603,28 @@ cdef class ArgKmin(ParallelReduction):
         indices: ndarray of shape (n, k)
             Indices of each X vector argkmin in Y.
         """
-        cdef:
-            ITYPE_t n_X = self.X.shape[0]
-            ITYPE_t[:, ::1] argkmin_indices = np.full((n_X, self.k), 0,
-                                                   dtype=ITYPE)
-            DTYPE_t[:, ::1] argkmin_distances = np.full((n_X, self.k),
-                                                      FLOAT_INF,
-                                                      dtype=DTYPE)
-
         if strategy == 'auto':
             # This is a simple heuristic whose constant for the
             # comparison has been chosen based on experiments.
-            if 4 * self.chunk_size * self.effective_omp_n_thread < n_X:
+            if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X:
                 strategy = 'parallel_on_X'
             else:
                 strategy = 'parallel_on_Y'
 
         if strategy == 'parallel_on_Y':
-            self._parallel_on_Y(
-                argkmin_indices, argkmin_distances
-            )
+            self._parallel_on_Y()
         elif strategy == 'parallel_on_X':
-            self._parallel_on_X(
-                argkmin_indices, argkmin_distances
-            )
+            self._parallel_on_X()
         else:
             raise RuntimeError(f"strategy '{strategy}' not supported.")
 
         if return_distance:
             # We need to recompute distances because we relied on
             # reduced distances.
-            self._exact_distances(argkmin_indices, argkmin_distances)
-            return np.asarray(argkmin_distances), np.asarray(argkmin_indices)
+            self._exact_distances(self.argkmin_indices, self.argkmin_distances)
+            return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices)
 
-        return np.asarray(argkmin_indices)
+        return np.asarray(self.argkmin_indices)
 
 cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 
@@ -548,8 +645,8 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
 
 
     cdef int _reduce_on_chunks(self,
-        const DTYPE_t[:, ::1] X,                  # IN
-        const DTYPE_t[:, ::1] Y,                  # IN
+        const DTYPE_t[:, ::1] X,
+        const DTYPE_t[:, ::1] Y,
         ITYPE_t X_start,
         ITYPE_t X_end,
         ITYPE_t Y_start,

From 67e02d48b174a14182ef60f6fdef2d4997140156 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 10:24:21 +0200
Subject: [PATCH 17/26] Pull GEMM buffers down to FastSquaredEuclideanArgKmin

Also add some method call chain and documentation for
FastSquaredEuclideanArgKmin.
---
 sklearn/metrics/_parallel_reductions.pyx | 71 +++++++++++++++++++-----
 1 file changed, 56 insertions(+), 15 deletions(-)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index db484002f67a5..c3d01a7f9e016 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -329,6 +329,7 @@ cdef class ArgKmin(ParallelReduction):
 
     The implementation is parallelized on chunks whose size can
     be set using ``chunk_size``.
+
     Parameters
     ----------
     X: ndarray of shape (n, d)
@@ -346,7 +347,6 @@ cdef class ArgKmin(ParallelReduction):
     cdef:
         ITYPE_t k
 
-        DTYPE_t ** dist_middle_terms_chunks
         DTYPE_t ** heaps_approx_distances_chunks
         ITYPE_t ** heaps_indices_chunks
 
@@ -389,11 +389,11 @@ cdef class ArgKmin(ParallelReduction):
         self.argkmin_distances = np.full((self.n_X, self.k), FLOAT_INF, dtype=DTYPE)
 
         # Temporary datastructures used in threads
-        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_approx_distances_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
         self.heaps_indices_chunks = <ITYPE_t **> malloc(sizeof(ITYPE_t *) * self.effective_omp_n_thread)
 
     def __dealloc__(self):
+        ParallelReduction.__dealloc__(self)
         if self.heaps_indices_chunks is not NULL:
             free(self.heaps_indices_chunks)
         else:
@@ -404,11 +404,6 @@ cdef class ArgKmin(ParallelReduction):
         else:
             raise RuntimeError("Trying to free heaps_approx_distances_chunks which is NULL")
 
-        if self.dist_middle_terms_chunks is not NULL:
-            free(self.dist_middle_terms_chunks)
-        else:
-            raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL")
-
     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,
         const DTYPE_t[:, ::1] Y,
@@ -423,7 +418,6 @@ cdef class ArgKmin(ParallelReduction):
             const DTYPE_t[:, ::1] X_c = X[X_start:X_end, :]
             const DTYPE_t[:, ::1] Y_c = Y[Y_start:Y_end, :]
             ITYPE_t k = self.k
-            DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num]
             DTYPE_t *heaps_approx_distances = self.heaps_approx_distances_chunks[thread_num]
             ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num]
 
@@ -447,11 +441,9 @@ cdef class ArgKmin(ParallelReduction):
     ) nogil:
         cdef:
             # in bytes
-            ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
             ITYPE_t heap_size = self.X_n_samples_chunk * self.k * self.sf
 
         # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(size_dist_middle_terms)
         self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(heap_size)
 
     cdef void _on_X_prange_iter_init(self,
@@ -489,7 +481,6 @@ cdef class ArgKmin(ParallelReduction):
     cdef void _on_X_parallel_finalize(self,
             ITYPE_t thread_num
     ) nogil:
-        free(self.dist_middle_terms_chunks[thread_num])
         free(self.heaps_approx_distances_chunks[thread_num])
 
     cdef void _on_Y_parallel_init(self,
@@ -497,12 +488,9 @@ cdef class ArgKmin(ParallelReduction):
     ) nogil:
         cdef:
             # in bytes
-            ITYPE_t size_dist_middle_terms = self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf
             ITYPE_t int_heap_size = self.X_n_samples_chunk * self.k * self.si
             ITYPE_t float_heap_size = self.X_n_samples_chunk * self.k * self.sf
 
-        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
-        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(size_dist_middle_terms)
         self.heaps_approx_distances_chunks[thread_num] = <DTYPE_t *> malloc(float_heap_size)
 
         # As chunks of X are shared across threads, so must their
@@ -536,7 +524,6 @@ cdef class ArgKmin(ParallelReduction):
                         self.heaps_indices_chunks[thread_num][idx * self.k + jdx],
                     )
 
-        free(self.dist_middle_terms_chunks[thread_num])
         free(self.heaps_approx_distances_chunks[thread_num])
         free(self.heaps_indices_chunks[thread_num])
 
@@ -627,10 +614,25 @@ cdef class ArgKmin(ParallelReduction):
         return np.asarray(self.argkmin_indices)
 
 cdef class FastSquaredEuclideanArgKmin(ArgKmin):
+    """Fast specialized alternative for ArgKmin on
+    EuclideanDistance.
+
+    Computes the argkmin of vectors (rows) of a set of
+    vectors (rows) of X on another set of vectors (rows) of Y
+    using the GEMM-trick.
+
+    This implementation has an superior arithmetic intensity
+    and hence running time, but it can suffer from numerical
+    instability. We recommend using ArgKmin with
+    EuclideanDistance when exact precision is needed.
+    """
 
     cdef:
         DTYPE_t[::1] Y_sq_norms
 
+        # Buffers for GEMM
+        DTYPE_t ** dist_middle_terms_chunks
+
     def __init__(self,
                   X,
                   Y,
@@ -642,7 +644,46 @@ cdef class FastSquaredEuclideanArgKmin(ArgKmin):
                          k=k,
                          chunk_size=chunk_size)
         self.Y_sq_norms = np.einsum('ij,ij->i', self.Y, self.Y)
+        # Temporary datastructures used in threads
+        self.dist_middle_terms_chunks = <DTYPE_t **> malloc(sizeof(DTYPE_t *) * self.effective_omp_n_thread)
 
+    def __dealloc__(self):
+        ArgKmin.__dealloc__(self)
+        if self.dist_middle_terms_chunks is not NULL:
+            free(self.dist_middle_terms_chunks)
+        else:
+            raise RuntimeError("Trying to free dist_middle_terms_chunks which is NULL")
+
+    cdef void _on_X_parallel_init(self,
+            ITYPE_t thread_num,
+    ) nogil:
+        ArgKmin._on_X_parallel_init(self, thread_num)
+        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+            self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
+
+    cdef void _on_X_parallel_finalize(self,
+            ITYPE_t thread_num
+    ) nogil:
+        ArgKmin._on_X_parallel_finalize(self, thread_num)
+        free(self.dist_middle_terms_chunks[thread_num])
+
+    cdef void _on_Y_parallel_init(self,
+            ITYPE_t thread_num,
+    ) nogil:
+        ArgKmin._on_Y_parallel_init(self, thread_num)
+        # Temporary buffer for the -2 * X_c.dot(Y_c.T) term
+        self.dist_middle_terms_chunks[thread_num] = <DTYPE_t *> malloc(
+            self.Y_n_samples_chunk * self.X_n_samples_chunk * self.sf)
+
+    cdef void _on_Y_parallel_finalize(self,
+            ITYPE_t thread_num,
+            ITYPE_t X_chunk_idx,
+            ITYPE_t X_start,
+            ITYPE_t X_end,
+    ) nogil:
+        ArgKmin._on_Y_parallel_finalize(self, thread_num, X_chunk_idx, X_start, X_end)
+        free(self.dist_middle_terms_chunks[thread_num])
 
     cdef int _reduce_on_chunks(self,
         const DTYPE_t[:, ::1] X,

From a8331da48ac45137143cc9e3466c3440ba254cc5 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 13:41:29 +0200
Subject: [PATCH 18/26] Skip tests for translation invariance

---
 sklearn/neighbors/tests/test_neighbors.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index ca16f373212bc..b0d231721d71f 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1846,6 +1846,10 @@ def test_fast_sqeuclidean_correctness(
 @pytest.mark.parametrize("d", [5, 10, 100, 500])
 @pytest.mark.parametrize("n_neighbors", [1, 10, 100, 1000])
 @pytest.mark.parametrize("translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]])
+@pytest.mark.skip(
+    reason="Long test, translation invariance should "
+    "have its own study: skipping for now"
+)
 def test_fast_sqeuclidean_translation_invariance(
     n,
     d,

From 0617dbd2c8f9acd40d0cad8c4a3efac196f65025 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 14:39:07 +0200
Subject: [PATCH 19/26] Define methods on the base class

---
 sklearn/metrics/_parallel_reductions.pyx | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index c3d01a7f9e016..9b664f14dd4e8 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -102,6 +102,9 @@ cdef class ParallelReduction:
         ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_rem
         ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_rem
 
+    @classmethod
+    def valid_metrics(cls):
+        return {*METRIC_MAPPING.keys()}
 
     def __cinit__(self):
         # Initializing memory view to prevent memory errors and seg-faults
@@ -155,6 +158,9 @@ cdef class ParallelReduction:
             self.n_X != (X_n_full_chunks * self.X_n_samples_chunk)
         )
 
+    def __dealloc__(self):
+        pass
+
     cdef void _on_X_parallel_init(self,
             ITYPE_t thread_num,
     ) nogil:

From 38b97c331b39ee02105a483c6bc644eed844f165 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 14:38:19 +0200
Subject: [PATCH 20/26] Improve tests for NearestNeighbors the algorithm

Parametrise tests.
Add all common metrics between the methods.
Test for strict equality.
---
 sklearn/neighbors/tests/test_neighbors.py | 71 +++++++++++++++++------
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index b0d231721d71f..437facb18752a 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -57,6 +57,7 @@
 SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,)
 
 ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto")
+COMMON_VALID_METRICS = set.intersection(*map(set, neighbors.VALID_METRICS.values()))
 P = (1, 2, 3, 4, np.inf)
 JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys())
 
@@ -77,31 +78,65 @@ def _weight_func(dist):
     return retval ** 2
 
 
+@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]])
+@pytest.mark.parametrize("n_features", [5, 10, 100])
+@pytest.mark.parametrize("n_query_pts", [1, 10, 100])
+@pytest.mark.parametrize("n_neighbors", [1, 10, 100])
+@pytest.mark.parametrize("metric", COMMON_VALID_METRICS)
 def test_unsupervised_kneighbors(
-    n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5
+    n_samples,
+    n_features,
+    n_query_pts,
+    n_neighbors,
+    metric,
 ):
-    # Test unsupervised neighbors methods
-    X = rng.rand(n_samples, n_features)
+    # The different algorithms must return identical results
+    # on their common metrics, with and without returning
+    # distances
 
-    test = rng.rand(n_query_pts, n_features)
+    # Redefining the rng locally to use the same generated X
+    local_rng = np.random.RandomState(0)
+    X = local_rng.rand(n_samples, n_features)
 
-    for p in P:
-        results_nodist = []
-        results = []
+    test = local_rng.rand(n_query_pts, n_features)
 
-        for algorithm in ALGORITHMS:
-            neigh = neighbors.NearestNeighbors(
-                n_neighbors=n_neighbors, algorithm=algorithm, p=p
-            )
-            neigh.fit(X)
+    results_nodist = []
+    results = []
+
+    for algorithm in ALGORITHMS:
+        neigh = neighbors.NearestNeighbors(
+            n_neighbors=n_neighbors, algorithm=algorithm, metric=metric
+        )
+        neigh.fit(X)
 
-            results_nodist.append(neigh.kneighbors(test, return_distance=False))
-            results.append(neigh.kneighbors(test, return_distance=True))
+        results_nodist.append(neigh.kneighbors(test, return_distance=False))
+        results.append(neigh.kneighbors(test, return_distance=True))
 
-        for i in range(len(results) - 1):
-            assert_array_almost_equal(results_nodist[i], results[i][1])
-            assert_array_almost_equal(results[i][0], results[i + 1][0])
-            assert_array_almost_equal(results[i][1], results[i + 1][1])
+    for i in range(len(results) - 1):
+        algorithm = ALGORITHMS[i]
+        next_algorithm = ALGORITHMS[i + 1]
+
+        indices_no_dist = results_nodist[i]
+        distances, next_distances = results[i][0], results[i + 1][0]
+        indices, next_indices = results[i][1], results[i + 1][1]
+        assert_array_equal(
+            indices_no_dist,
+            indices,
+            err_msg=f"The '{algorithm}' algorithm returns different"
+            f"indices depending on 'return_distances'.",
+        )
+        assert_array_equal(
+            indices,
+            next_indices,
+            err_msg=f"The '{algorithm}' and '{next_algorithm}' "
+            f"algorithms return different indices.",
+        )
+        assert_array_equal(
+            distances,
+            next_distances,
+            err_msg=f"The '{algorithm}' and '{next_algorithm}' "
+            f"algorithms return different distances.",
+        )
 
 
 @pytest.mark.parametrize(

From ef9c7f6b842ee2040b095c3799827cbdda1cc8be Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 14:45:56 +0200
Subject: [PATCH 21/26] Propagate metric kwargs in KNeighborsMixin.kneighbors

---
 sklearn/neighbors/_base.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index e97894e8731c4..ac8dc0bea0e03 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -741,7 +741,11 @@ class from an array representing our data set and ask who's
             and self.effective_metric_ in ArgKmin.valid_metrics()
         ):
             results = ArgKmin.get_for(
-                X=X, Y=self._fit_X, k=n_neighbors, metric=self.effective_metric_
+                X=X,
+                Y=self._fit_X,
+                k=n_neighbors,
+                metric=self.effective_metric_,
+                metric_kwargs=self.effective_metric_params_,
             ).compute(
                 strategy="auto",
                 return_distance=return_distance,

From 882e6e767043863f4d7d9dd2e564ee1e727e982d Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 15:20:28 +0200
Subject: [PATCH 22/26] Add DistanceMetric data validation at initialisation

---
 sklearn/metrics/_parallel_reductions.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/metrics/_parallel_reductions.pyx b/sklearn/metrics/_parallel_reductions.pyx
index 9b664f14dd4e8..f57d7bcd9fd5e 100644
--- a/sklearn/metrics/_parallel_reductions.pyx
+++ b/sklearn/metrics/_parallel_reductions.pyx
@@ -130,6 +130,8 @@ cdef class ParallelReduction:
                                          "same dimension but currently are " \
                                          f"respectively {X.shape[1]}-dimensional " \
                                          f"and {Y.shape[1]}-dimensional."
+        distance_metric._validate_data(X)
+        distance_metric._validate_data(Y)
 
         self.d = X.shape[1]
         self.sf = sizeof(DTYPE_t)

From 15c110ae2432fdedcb39a0304bbc0e5b234a56bb Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 15:34:30 +0200
Subject: [PATCH 23/26] Remove warning checks for 'wminkowski' now that Scipy
 is not used

---
 sklearn/neighbors/tests/test_neighbors.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 437facb18752a..bfc8a0c665bbb 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1334,19 +1334,9 @@ def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbor
 
             neigh.fit(X[:, feature_sl])
 
-            # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0
-            ExceptionToAssert = None
-            if (
-                metric == "wminkowski"
-                and algorithm == "brute"
-                and sp_version >= parse_version("1.6.0")
-            ):
-                ExceptionToAssert = DeprecationWarning
-
-            with pytest.warns(ExceptionToAssert):
-                results[algorithm] = neigh.kneighbors(
-                    test[:, feature_sl], return_distance=True
-                )
+            results[algorithm] = neigh.kneighbors(
+                test[:, feature_sl], return_distance=True
+            )
 
         assert_array_almost_equal(results["brute"][0], results["ball_tree"][0])
         assert_array_almost_equal(results["brute"][1], results["ball_tree"][1])

From 7e3c4b7f66e62564f071b92d204f1950bb323938 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 15:38:18 +0200
Subject: [PATCH 24/26] Parametrise test_k_and_radius_neighbors_duplicates on
 algorithms

---
 sklearn/neighbors/tests/test_neighbors.py | 83 +++++++++++------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index bfc8a0c665bbb..959ed6bfd7210 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -1543,49 +1543,48 @@ def test_k_and_radius_neighbors_X_None():
         )
 
 
-def test_k_and_radius_neighbors_duplicates():
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_k_and_radius_neighbors_duplicates(algorithm):
     # Test behavior of kneighbors when duplicates are present in query
-
-    for algorithm in ALGORITHMS:
-        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
-        nn.fit([[0], [1]])
-
-        # Do not do anything special to duplicates.
-        kng = nn.kneighbors_graph([[0], [1]], mode="distance")
-        assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]]))
-        assert_array_equal(kng.data, [0.0, 0.0])
-        assert_array_equal(kng.indices, [0, 1])
-
-        dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
-        check_object_arrays(dist, [[0, 1], [1, 0]])
-        check_object_arrays(ind, [[0, 1], [0, 1]])
-
-        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5)
-        assert_array_equal(rng.A, np.ones((2, 2)))
-
-        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance")
-        rng.sort_indices()
-        assert_array_equal(rng.A, [[0, 1], [1, 0]])
-        assert_array_equal(rng.indices, [0, 1, 0, 1])
-        assert_array_equal(rng.data, [0, 1, 1, 0])
-
-        # Mask the first duplicates when n_duplicates > n_neighbors.
-        X = np.ones((3, 1))
-        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute")
-        nn.fit(X)
-        dist, ind = nn.kneighbors()
-        assert_array_equal(dist, np.zeros((3, 1)))
-        assert_array_equal(ind, [[1], [0], [1]])
-
-        # Test that zeros are explicitly marked in kneighbors_graph.
-        kng = nn.kneighbors_graph(mode="distance")
-        assert_array_equal(kng.A, np.zeros((3, 3)))
-        assert_array_equal(kng.data, np.zeros(3))
-        assert_array_equal(kng.indices, [1.0, 0.0, 1.0])
-        assert_array_equal(
-            nn.kneighbors_graph().A,
-            np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
-        )
+    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
+    nn.fit([[0], [1]])
+
+    # Do not do anything special to duplicates.
+    kng = nn.kneighbors_graph([[0], [1]], mode="distance")
+    assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]]))
+    assert_array_equal(kng.data, [0.0, 0.0])
+    assert_array_equal(kng.indices, [0, 1])
+
+    dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
+    check_object_arrays(dist, [[0, 1], [1, 0]])
+    check_object_arrays(ind, [[0, 1], [0, 1]])
+
+    rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5)
+    assert_array_equal(rng.A, np.ones((2, 2)))
+
+    rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance")
+    rng.sort_indices()
+    assert_array_equal(rng.A, [[0, 1], [1, 0]])
+    assert_array_equal(rng.indices, [0, 1, 0, 1])
+    assert_array_equal(rng.data, [0, 1, 1, 0])
+
+    # Mask the first duplicates when n_duplicates > n_neighbors.
+    X = np.ones((3, 1))
+    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute")
+    nn.fit(X)
+    dist, ind = nn.kneighbors()
+    assert_array_equal(dist, np.zeros((3, 1)))
+    assert_array_equal(ind, [[1], [0], [1]])
+
+    # Test that zeros are explicitly marked in kneighbors_graph.
+    kng = nn.kneighbors_graph(mode="distance")
+    assert_array_equal(kng.A, np.zeros((3, 3)))
+    assert_array_equal(kng.data, np.zeros(3))
+    assert_array_equal(kng.indices, [1.0, 0.0, 1.0])
+    assert_array_equal(
+        nn.kneighbors_graph().A,
+        np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
+    )
 
 
 def test_include_self_neighbors_graph():

From 2ec36c1c4c7ffad2734b0f3f97ec4cc89ea48368 Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 15:56:00 +0200
Subject: [PATCH 25/26] Remove uncalled snippet

This can be simplified as the condition won't be verified
anymore.
---
 sklearn/neighbors/_base.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index ac8dc0bea0e03..0d3f15974a7e6 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -758,12 +758,6 @@ class from an array representing our data set and ask who's
                 return_distance=return_distance,
             )
 
-            # for efficiency, use squared euclidean distances
-            if self.effective_metric_ == "euclidean":
-                kwds = {"squared": True}
-            else:
-                kwds = self.effective_metric_params_
-
             chunked_results = list(
                 pairwise_distances_chunked(
                     X,
@@ -771,7 +765,7 @@ class from an array representing our data set and ask who's
                     reduce_func=reduce_func,
                     metric=self.effective_metric_,
                     n_jobs=n_jobs,
-                    **kwds,
+                    **self.effective_metric_params_,
                 )
             )
 

From ad496f00dd7fefdc4c98f8befce0d6fedd30adff Mon Sep 17 00:00:00 2001
From: Julien Jerphanion <git@jjerphan.xyz>
Date: Thu, 1 Jul 2021 16:00:00 +0200
Subject: [PATCH 26/26] Do not branch on sparse arrays, yet

We need to implement a strategy for sparse arrays for
ParallelReduction.
---
 sklearn/metrics/pairwise.py | 7 ++++++-
 sklearn/neighbors/_base.py  | 5 ++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
index c35b38ad1fde4..bcf42ac2ca9fa 100644
--- a/sklearn/metrics/pairwise.py
+++ b/sklearn/metrics/pairwise.py
@@ -652,7 +652,12 @@ def pairwise_distances_argmin_min(
     if metric_kwargs is None:
         metric_kwargs = {}
 
-    if metric in ArgKmin.valid_metrics():
+    if (
+        # TODO: support sparse arrays
+        not issparse(X)
+        and not issparse(X)
+        and metric in ArgKmin.valid_metrics()
+    ):
         values, indices = ArgKmin.get_for(
             X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs
         ).compute(strategy="auto", return_distance=True)
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 0d3f15974a7e6..85c3cd743cb57 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -737,7 +737,10 @@ class from an array representing our data set and ask who's
             )
 
         elif (
-            self._fit_method == "brute"
+            # TODO: support sparse arrays
+            not issparse(X)
+            and not issparse(self._fit_X)
+            and self._fit_method == "brute"
             and self.effective_metric_ in ArgKmin.valid_metrics()
         ):
             results = ArgKmin.get_for(