Skip to content

Commit

Permalink
FEA Fused sparse-dense support for PairwiseDistancesReduction (#23585)
Browse files Browse the repository at this point in the history

Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
Co-authored-by: Jérémie du Boisberranger <jeremiedbb@users.noreply.github.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Meekail Zain <Micky774@users.noreply.github.com>
  • Loading branch information
6 people committed Sep 20, 2022
1 parent bfe68b4 commit 60cc5b5
Show file tree
Hide file tree
Showing 16 changed files with 525 additions and 89 deletions.
23 changes: 23 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,29 @@ Changes impacting all modules
second-pass algorithm.
:pr:`23197` by :user:`Meekail Zain <micky774>`

- |Enhancement| Support for combinations of dense and sparse dataset pairs
for all distance metrics and for float32 and float64 datasets has been added
or has seen its performance improved for the following functions and
estimators:

- :func:`sklearn.metrics.pairwise_distances_argmin`
- :func:`sklearn.metrics.pairwise_distances_argmin_min`
- :class:`sklearn.cluster.AffinityPropagation`
- :class:`sklearn.cluster.Birch`
- :class:`sklearn.cluster.SpectralClustering`
- :class:`sklearn.neighbors.KNeighborsClassifier`
- :class:`sklearn.neighbors.KNeighborsRegressor`
- :class:`sklearn.neighbors.RadiusNeighborsClassifier`
- :class:`sklearn.neighbors.RadiusNeighborsRegressor`
- :class:`sklearn.neighbors.LocalOutlierFactor`
- :class:`sklearn.neighbors.NearestNeighbors`
- :class:`sklearn.manifold.Isomap`
- :class:`sklearn.manifold.TSNE`
- :func:`sklearn.manifold.trustworthiness`

:pr:`23604` and :pr:`23585` by :user:`Julien Jerphanion <jjerphan>`,
:user:`Olivier Grisel <ogrisel>`, and `Thomas Fan`_.


Changelog
---------

Expand Down
8 changes: 4 additions & 4 deletions sklearn/manifold/_isomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,9 +330,9 @@ def fit(self, X, y=None):
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree, NearestNeighbors}
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, sparse graph, precomputed tree, or NearestNeighbors
numpy array, sparse matrix, precomputed tree, or NearestNeighbors
object.
y : Ignored
Expand All @@ -352,7 +352,7 @@ def fit_transform(self, X, y=None):
Parameters
----------
X : {array-like, sparse graph, BallTree, KDTree}
X : {array-like, sparse matrix, BallTree, KDTree}
Training vector, where `n_samples` is the number of samples
and `n_features` is the number of features.
Expand Down Expand Up @@ -381,7 +381,7 @@ def transform(self, X):
Parameters
----------
X : array-like, shape (n_queries, n_features)
X : {array-like, sparse matrix}, shape (n_queries, n_features)
If neighbors_algorithm='precomputed', X is assumed to be a
distance matrix or a sparse graph of shape
(n_queries, n_samples_fit).
Expand Down
11 changes: 7 additions & 4 deletions sklearn/manifold/_t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,11 +461,12 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric="euclidean"):
Parameters
----------
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row.
X_embedded : ndarray of shape (n_samples, n_components)
X_embedded : {array-like, sparse matrix} of shape (n_samples, n_components)
Embedding of the training data in low-dimensional space.
n_neighbors : int, default=5
Expand Down Expand Up @@ -1095,7 +1096,8 @@ def fit_transform(self, X, y=None):
Parameters
----------
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row. If the method
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
Expand All @@ -1121,7 +1123,8 @@ def fit(self, X, y=None):
Parameters
----------
X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
X : {array-like, sparse matrix} of shape (n_samples, n_features) or \
(n_samples, n_samples)
If the metric is 'precomputed' X must be a square distance
matrix. Otherwise it contains a sample per row. If the method
is 'exact', X may be a sparse matrix of type 'csr', 'csc'
Expand Down
24 changes: 13 additions & 11 deletions sklearn/manifold/tests/test_isomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,19 +216,21 @@ def test_isomap_clone_bug():
assert model.nbrs_.n_neighbors == n_neighbors


def test_sparse_input():
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_sparse_input(eigen_solver, path_method):
# TODO: compare results on dense and sparse data as proposed in:
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
X = sparse_rand(100, 3, density=0.1, format="csr")

# Should not error
for eigen_solver in eigen_solvers:
for path_method in path_methods:
clf = manifold.Isomap(
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
n_neighbors=8,
)
clf.fit(X)
clf = manifold.Isomap(
n_components=2,
eigen_solver=eigen_solver,
path_method=path_method,
n_neighbors=8,
)
clf.fit(X)
clf.transform(X)


def test_isomap_fit_precomputed_radius_graph():
Expand Down
4 changes: 3 additions & 1 deletion sklearn/manifold/tests/test_t_sne.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,9 @@ def test_optimization_minimizes_kl_divergence():


@pytest.mark.parametrize("method", ["exact", "barnes_hut"])
def test_fit_csr_matrix(method):
def test_fit_transform_csr_matrix(method):
# TODO: compare results on dense and sparse data as proposed in:
# https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
# X can be a sparse matrix.
rng = check_random_state(0)
X = rng.randn(50, 2)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ implementation_specific_values = [
#
# name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE
#
# We also use the float64 dtype and C-type names as defined in
# `sklearn.utils._typedefs` to maintain consistency.
# The float64 implementation keeps the unsuffixed name `DistanceMetric` for
# backward naming compatibility.
#
('64', 'DistanceMetric', 'DTYPE_t', 'DTYPE'),
('32', 'DistanceMetric32', 'cnp.float32_t', 'np.float32')
Expand All @@ -15,14 +14,16 @@ implementation_specific_values = [
}}
cimport numpy as cnp

from ...utils._typedefs cimport DTYPE_t, ITYPE_t
from ...utils._typedefs cimport DTYPE_t, ITYPE_t, SPARSE_INDEX_TYPE_t
from ...metrics._dist_metrics cimport DistanceMetric, DistanceMetric32

{{for name_suffix, DistanceMetric, INPUT_DTYPE_t, INPUT_DTYPE in implementation_specific_values}}


cdef class DatasetsPair{{name_suffix}}:
cdef {{DistanceMetric}} distance_metric
cdef:
{{DistanceMetric}} distance_metric
ITYPE_t n_features

cdef ITYPE_t n_samples_X(self) nogil

Expand All @@ -37,5 +38,35 @@ cdef class DenseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
const {{INPUT_DTYPE_t}}[:, ::1] X
const {{INPUT_DTYPE_t}}[:, ::1] Y
ITYPE_t d


# Declaration of the DatasetsPair specialisation whose name indicates that
# both X and Y are sparse. One class is generated per entry of
# `implementation_specific_values` (suffixes 64 and 32), with
# {{INPUT_DTYPE_t}} substituted for the element type of the value buffers.
# NOTE(review): the data / indices / indptr naming suggests CSR storage --
# confirm against the matching implementation file.
cdef class SparseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
# Read-only 1-D memoryviews describing X.
const {{INPUT_DTYPE_t}}[:] X_data
const SPARSE_INDEX_TYPE_t[:] X_indices
const SPARSE_INDEX_TYPE_t[:] X_indptr

# Read-only 1-D memoryviews describing Y (same layout as for X).
const {{INPUT_DTYPE_t}}[:] Y_data
const SPARSE_INDEX_TYPE_t[:] Y_indices
const SPARSE_INDEX_TYPE_t[:] Y_indptr


# Declaration of the DatasetsPair specialisation whose name indicates a
# sparse X paired with a dense Y. One class is generated per entry of
# `implementation_specific_values` (suffixes 64 and 32).
# NOTE(review): the data / indices / indptr naming suggests CSR storage for
# X -- confirm against the matching implementation file.
cdef class SparseDenseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
# Read-only 1-D memoryviews describing X.
const {{INPUT_DTYPE_t}}[:] X_data
const SPARSE_INDEX_TYPE_t[:] X_indices
const SPARSE_INDEX_TYPE_t[:] X_indptr

# Y is held as a 1-D value buffer plus a 1-D index buffer and a count,
# with no indptr array.
# NOTE(review): presumably Y_data is a flattened view of the dense Y,
# Y_indices a column-index array shared by all rows, and n_Y the number
# of rows of Y -- confirm in the implementation before relying on this.
const {{INPUT_DTYPE_t}}[:] Y_data
const SPARSE_INDEX_TYPE_t[:] Y_indices
ITYPE_t n_Y


# Declaration of the DatasetsPair specialisation whose name indicates a
# dense X paired with a sparse Y. It holds no buffers of its own and only
# stores another DatasetsPair to which the work is delegated.
cdef class DenseSparseDatasetsPair{{name_suffix}}(DatasetsPair{{name_suffix}}):
cdef:
# Distance metrics are symmetric (dist(x, y) == dist(y, x)), so this
# class can simply rely on the implementation of
# SparseDenseDatasetsPair with its arguments swapped.
# NOTE(review): presumably `datasets_pair` is a SparseDenseDatasetsPair
# built from (Y, X) -- confirm in the implementation file.
DatasetsPair{{name_suffix}} datasets_pair

{{endfor}}

0 comments on commit 60cc5b5

Please sign in to comment.