FEA Fused sparse-dense support for PairwiseDistancesReduction (#23585)

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
Co-authored-by: Jérémie du Boisberranger <jeremiedbb@users.noreply.github.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Meekail Zain <Micky774@users.noreply.github.com>
scikit-learn · Sep 20, 2022 · 60cc5b5 · 60cc5b5
1 parent bfe68b4
commit 60cc5b5
Show file tree

Hide file tree

Showing 16 changed files with 525 additions and 89 deletions.
diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
@@ -51,6 +51,29 @@ Changes impacting all modules
   second-pass algorithm.
   :pr:`23197` by :user:`Meekail Zain <micky774>`
 
+- |Enhancement| Support for combinations of dense and sparse dataset pairs
+  for all distance metrics and for float32 and float64 datasets has been added,
+  or its performance improved, for the following functions and estimators:
+
+  - :func:`sklearn.metrics.pairwise_distances_argmin`
+  - :func:`sklearn.metrics.pairwise_distances_argmin_min`
+  - :class:`sklearn.cluster.AffinityPropagation`
+  - :class:`sklearn.cluster.Birch`
+  - :class:`sklearn.cluster.SpectralClustering`
+  - :class:`sklearn.neighbors.KNeighborsClassifier`
+  - :class:`sklearn.neighbors.KNeighborsRegressor`
+  - :class:`sklearn.neighbors.RadiusNeighborsClassifier`
+  - :class:`sklearn.neighbors.RadiusNeighborsRegressor`
+  - :class:`sklearn.neighbors.LocalOutlierFactor`
+  - :class:`sklearn.neighbors.NearestNeighbors`
+  - :class:`sklearn.manifold.Isomap`
+  - :class:`sklearn.manifold.TSNE`
+  - :func:`sklearn.manifold.trustworthiness`
+
+  :pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion <jjerphan>`,
+  :user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_.
+
+
 Changelog
 ---------
 

diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py
@@ -330,9 +330,9 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
+        X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
             Sample data, shape = (n_samples, n_features), in the form of a
-            numpy array, sparse graph, precomputed tree, or NearestNeighbors
+            numpy array, sparse matrix, precomputed tree, or NearestNeighbors
             object.
 
         y : Ignored
@@ -352,7 +352,7 @@ def fit_transform(self, X, y=None):
 
         Parameters
         ----------
-        X : {array-like, sparse graph, BallTree, KDTree}
+        X : {array-like, sparse matrix, BallTree, KDTree}
             Training vector, where `n_samples` is the number of samples
             and `n_features` is the number of features.
 
@@ -381,7 +381,7 @@ def transform(self, X):
 
         Parameters
         ----------
-        X : array-like, shape (n_queries, n_features)
+        X : {array-like, sparse matrix}, shape (n_queries, n_features)
             If neighbors_algorithm='precomputed', X is assumed to be a
             distance matrix or a sparse graph of shape
             (n_queries, n_samples_fit).

diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py
@@ -461,11 +461,12 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"):
 
     Parameters
     ----------
-    X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
+    X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
+        (n_samples, n_samples)
         If the metric is 'precomputed' X must be a square distance
         matrix. Otherwise it contains a sample per row.
 
-    X_embedded : ndarray of shape (n_samples, n_components)
+    X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components)
         Embedding of the training data in low-dimensional space.
 
     n_neighbors : int, default=5
@@ -1095,7 +1096,8 @@ def fit_transform(self, X, y=None):
 
         Parameters
         ----------
-        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
+            (n_samples, n_samples)
             If the metric is 'precomputed' X must be a square distance
             matrix. Otherwise it contains a sample per row. If the method
             is 'exact', X may be a sparse matrix of type 'csr', 'csc'
@@ -1121,7 +1123,8 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
+        X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
+            (n_samples, n_samples)
             If the metric is 'precomputed' X must be a square distance
             matrix. Otherwise it contains a sample per row. If the method
             is 'exact', X may be a sparse matrix of type 'csr', 'csc'

diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py
@@ -216,19 +216,21 @@ def test_isomap_clone_bug():
         assert model.nbrs_.n_neighbors == n_neighbors
 
 
-def test_sparse_input():
+@pytest.mark.parametrize("eigen_solver", eigen_solvers)
+@pytest.mark.parametrize("path_method", path_methods)
+def test_sparse_input(eigen_solver, path_method):
+    # TODO: compare results on dense and sparse data as proposed in:
+    # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
     X = sparse_rand(100, 3, density=0.1, format="csr")
 
-    # Should not error
-    for eigen_solver in eigen_solvers:
-        for path_method in path_methods:
-            clf = manifold.Isomap(
-                n_components=2,
-                eigen_solver=eigen_solver,
-                path_method=path_method,
-                n_neighbors=8,
-            )
-            clf.fit(X)
+    clf = manifold.Isomap(
+        n_components=2,
+        eigen_solver=eigen_solver,
+        path_method=path_method,
+        n_neighbors=8,
+    )
+    clf.fit(X)
+    clf.transform(X)
 
 
 def test_isomap_fit_precomputed_radius_graph():

diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
@@ -329,7 +329,9 @@ def test_optimization_minimizes_kl_divergence():
 
 
 @pytest.mark.parametrize("method", ["exact", "barnes_hut"])
-def test_fit_csr_matrix(method):
+def test_fit_transform_csr_matrix(method):
+    # TODO: compare results on dense and sparse data as proposed in:
+    # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
     # X can be a sparse matrix.
     rng = check_random_state(0)
     X = rng.randn(50, 2)

diff --git a/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp b/sklearn/metrics/_pairwise_distances_reduction/_datasets_pair.pxd.tp
@@ -5,8 +5,7 @@ implementation_specific_values = [
     #
     #       name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE
     #
-    # We also use the float64 dtype and C-type names as defined in
-    # `sklearn.utils._typedefs` to maintain consistency.
+    # We use DistanceMetric for float64 for backward naming compatibility.
     #
     ('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'),
     ('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32')
@@ -15,14 +14,16 @@ implementation_specific_values = [
 }}
 cimport numpy as cnp
 
-from ...utils._typedefs cimport DTYPE_t, ITYPE_t
+from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
 from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32
 
 {{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}
 
 
 cdef class DatasetsPair{{name_suffix}}:
-    cdef {{DistanceMetric}} distance_metric
+    cdef:
+        {{DistanceMetric}} distance_metric
+        ITYPE_t n_features
 
     cdef ITYPE_t n_samples_X(self) nogil
 
@@ -37,5 +38,35 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
     cdef:
         const {{INPUT_DTYPE_t}}[:, ::1] X
         const {{INPUT_DTYPE_t}}[:, ::1] Y
-        ITYPE_t d
+
+
+cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const {{INPUT_DTYPE_t}}[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        const SPARSE_INDEX_TYPE_t[:] Y_indptr
+
+
+cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    cdef:
+        const {{INPUT_DTYPE_t}}[:] X_data
+        const SPARSE_INDEX_TYPE_t[:] X_indices
+        const SPARSE_INDEX_TYPE_t[:] X_indptr
+
+        const {{INPUT_DTYPE_t}}[:] Y_data
+        const SPARSE_INDEX_TYPE_t[:] Y_indices
+        ITYPE_t n_Y
+
+
+cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
+    cdef:
+        # As distance metrics are symmetric (d(x, y) == d(y, x)), we can
+        # simply rely on the implementation of SparseDenseDatasetsPair
+        # and swap arguments.
+        DatasetsPair{{name_suffix}} datasets_pair
+
 {{endfor}}