FIX PLSRegression.coef_ takes X and Y variances into account when sca…

…le=True (#28612) Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
scikit-learn · May 2, 2024 · e19d8c2 · e19d8c2
1 parent 2bafd7b
commit e19d8c2
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 6 deletions.
diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst
@@ -212,7 +212,12 @@ Changelog
   :class:`cross_decomposition.CCA`,
   and :class:`cross_decomposition.PLSSVD`.
   `Y` will be removed in version 1.7.
-  :pr:`28604` by :user:`David Leon <davidleon123>`
+  :pr:`28604` by :user:`David Leon <davidleon123>`.
+
+- |Fix| The `coef_` fitted attribute of :class:`cross_decomposition.PLSRegression`
+  now takes into account both the scale of `X` and `Y` when `scale=True`. Note that
+  the previous predicted values were not affected by this bug.
+  :pr:`28612` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 :mod:`sklearn.datasets`
 .......................

diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py
@@ -388,7 +388,7 @@ def fit(self, X, y=None, Y=None):
             pinv2(np.dot(self.y_loadings_.T, self.y_weights_), check_finite=False),
         )
         self.coef_ = np.dot(self.x_rotations_, self.y_loadings_.T)
-        self.coef_ = (self.coef_ * self._y_std).T
+        self.coef_ = (self.coef_ * self._y_std).T / self._x_std
         self.intercept_ = self._y_mean
         self._n_features_out = self.x_rotations_.shape[1]
         return self
@@ -517,9 +517,8 @@ def predict(self, X, copy=True):
         """
         check_is_fitted(self)
         X = self._validate_data(X, copy=copy, dtype=FLOAT_DTYPES, reset=False)
-        # Normalize
+        # Only center X but do not scale it since the coefficients are already scaled
         X -= self._x_mean
-        X /= self._x_std
         Ypred = X @ self.coef_.T + self.intercept_
         return Ypred.ravel() if self._predict_1d else Ypred
 

diff --git a/sklearn/cross_decomposition/tests/test_pls.py b/sklearn/cross_decomposition/tests/test_pls.py
@@ -589,8 +589,6 @@ def test_pls_prediction(PLSEstimator, scale):
 
     y_mean = Y.mean(axis=0)
     X_trans = X - X.mean(axis=0)
-    if scale:
-        X_trans /= X.std(axis=0, ddof=1)
 
     assert_allclose(pls.intercept_, y_mean)
     assert_allclose(Y_pred, X_trans @ pls.coef_.T + pls.intercept_)
@@ -646,6 +644,28 @@ def test_pls_regression_fit_1d_y():
     assert_allclose(y_pred, expected)
 
 
+def test_pls_regression_scaling_coef():
+    """Check that when using `scale=True`, the coefficients are using the std. dev. from
+    both `X` and `Y`.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/27964
+    """
+    # handcrafted data where we can predict Y from X with an additional scaling factor
+    rng = np.random.RandomState(0)
+    coef = rng.uniform(size=(3, 5))
+    X = rng.normal(scale=10, size=(30, 5))  # add a std of 10
+    Y = X @ coef.T
+
+    # we need to make sure that the dimension of the latent space is large enough to
+    # perfectly predict `Y` from `X` (no information loss)
+    pls = PLSRegression(n_components=5, scale=True).fit(X, Y)
+    assert_allclose(pls.coef_, coef)
+
+    # we therefore should be able to predict `Y` from `X`
+    assert_allclose(pls.predict(X), Y)
+
+
 # TODO(1.7): Remove
 @pytest.mark.parametrize("Klass", [PLSRegression, CCA, PLSSVD, PLSCanonical])
 def test_pls_fit_warning_on_deprecated_Y_argument(Klass):