Release 1.2.0 [cd build] (#25121)
jeremiedbb committed Dec 8, 2022
1 parent c7d5f58 commit dc580a8
Showing 29 changed files with 497 additions and 135 deletions.
4 changes: 1 addition & 3 deletions build_tools/azure/install_win.sh
@@ -7,9 +7,7 @@ set -x
source build_tools/shared.sh

if [[ "$DISTRIB" == "conda" ]]; then
conda update -n base conda -y
conda install pip -y
pip install "$(get_dep conda-lock min)"
conda install -c conda-forge "$(get_dep conda-lock min)" -y
conda-lock install --name $VIRTUALENV $LOCK_FILE
source activate $VIRTUALENV
else
7 changes: 7 additions & 0 deletions doc/computing/parallelism.rst
@@ -299,6 +299,13 @@ When this environment variable is set to a non-zero value, the `Cython`
directive `boundscheck` is set to `True`. This is useful for finding
segfaults.

`SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When this environment variable is set to a non-zero value, debug symbols
will be included in the compiled C extensions. Only debug symbols for POSIX
systems are configured.

`SKLEARN_PAIRWISE_DIST_CHUNK_SIZE`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

1 change: 1 addition & 0 deletions doc/developers/maintainer.rst
@@ -310,6 +310,7 @@ The following GitHub checklist might be helpful in a release PR::
* [ ] upload the wheels and source tarball to PyPI
* [ ] https://github.com/scikit-learn/scikit-learn/releases publish (except for RC)
* [ ] announce on the mailing list, on Twitter, and on LinkedIn
* [ ] update SECURITY.md in main branch (except for RC)

Merging Pull Requests
---------------------
24 changes: 23 additions & 1 deletion doc/whats_new/v1.2.rst
@@ -411,11 +411,15 @@ Changelog
:mod:`sklearn.inspection`
.........................

- |Enhancement| Extended :func:`inspection.partial_dependence` and
- |MajorFeature| Extended :func:`inspection.partial_dependence` and
:class:`inspection.PartialDependenceDisplay` to handle categorical features.
:pr:`18298` by :user:`Madhura Jayaratne <madhuracj>` and
:user:`Guillaume Lemaitre <glemaitre>`.

- |Fix| :class:`inspection.DecisionBoundaryDisplay` now raises an error if the
input data is not 2-dimensional.
:pr:`25077` by :user:`Arturo Amor <ArturoAmorQ>`.

:mod:`sklearn.kernel_approximation`
...................................

@@ -641,6 +645,16 @@ Changelog
dtype for `numpy.float32` inputs.
:pr:`22665` by :user:`Julien Jerphanion <jjerphan>`.

:mod:`sklearn.neural_network`
.............................

- |Fix| :class:`neural_network.MLPClassifier` and
:class:`neural_network.MLPRegressor` always expose the parameters `best_loss_`,
`validation_scores_`, and `best_validation_score_`. `best_loss_` is set to
`None` when `early_stopping=True`, while `validation_scores_` and
`best_validation_score_` are set to `None` when `early_stopping=False`.
:pr:`24683` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.pipeline`
.......................

@@ -696,6 +710,10 @@ Changelog
- |Enhancement| :func:`utils.validation.column_or_1d` now accepts a `dtype`
parameter to specify `y`'s dtype. :pr:`22629` by `Thomas Fan`_.

- |Enhancement| :func:`utils.extmath.cartesian` now accepts arrays with different
`dtype` and will cast the output to the most permissive `dtype`.
:pr:`25067` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Fix| :func:`utils.multiclass.type_of_target` now properly handles sparse matrices.
:pr:`14862` by :user:`Léonard Binet <leonardbinet>`.

@@ -705,6 +723,10 @@ Changelog
- |Fix| :func:`utils.estimator_checks.check_estimator` now takes into account
the `requires_positive_X` tag correctly. :pr:`24667` by `Thomas Fan`_.

- |Fix| :func:`utils.check_array` now supports Pandas Series with `pd.NA`
by raising a better error message or returning a compatible `ndarray`.
:pr:`25080` by `Thomas Fan`_.

- |API| The extra keyword parameters of :func:`utils.extmath.density` are deprecated
and will be removed in 1.4.
:pr:`24523` by :user:`Mia Bajic <clytaemnestra>`.
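A minimal sketch of the behavior documented in the `neural_network` entry above (dataset and sizes are arbitrary):

from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=200, random_state=0)

# With early stopping, progress is tracked on a held-out validation split:
clf = MLPClassifier(early_stopping=True, max_iter=200, random_state=0).fit(X, y)
print(clf.best_loss_)              # None
print(clf.best_validation_score_)  # a float: best score on the validation split

# Without early stopping, the training loss is tracked instead:
clf = MLPClassifier(early_stopping=False, max_iter=200, random_state=0).fit(X, y)
print(clf.best_loss_)          # a float: best training loss
print(clf.validation_scores_)  # None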
38 changes: 36 additions & 2 deletions examples/release_highlights/plot_release_highlights_1_2_0.py
@@ -93,15 +93,49 @@
hist_no_interact, X, y, cv=5, n_jobs=2, train_sizes=np.linspace(0.1, 1, 5)
)

# %%
# :class:`~inspection.PartialDependenceDisplay` exposes a new parameter
# `categorical_features` to display partial dependence for categorical features
# using bar plots and heatmaps.
from sklearn.datasets import fetch_openml

X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X = X.select_dtypes(["number", "category"]).drop(columns=["body"])

# %%
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline

categorical_features = ["pclass", "sex", "embarked"]
model = make_pipeline(
ColumnTransformer(
transformers=[("cat", OrdinalEncoder(), categorical_features)],
remainder="passthrough",
),
HistGradientBoostingRegressor(random_state=0),
).fit(X, y)

# %%
from sklearn.inspection import PartialDependenceDisplay

fig, ax = plt.subplots(figsize=(14, 4), constrained_layout=True)
_ = PartialDependenceDisplay.from_estimator(
model,
X,
features=["age", "sex", ("pclass", "sex")],
categorical_features=categorical_features,
ax=ax,
)

# %%
# Faster parser in :func:`~datasets.fetch_openml`
# -----------------------------------------------
# :func:`~datasets.fetch_openml` now supports a new `"pandas"` parser that is
# more memory- and CPU-efficient. In v1.4, the default will change to
# `parser="auto"` which will automatically use the `"pandas"` parser for dense
# data and `"liac-arff"` for sparse data.
from sklearn.datasets import fetch_openml

X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[options]
packages = find_namespace:
packages = find:

[options.packages.find]
include = sklearn*
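For context, `find:` corresponds to setuptools' `find_packages`, which only collects directories containing an `__init__.py`, whereas the previous `find_namespace:` (`find_namespace_packages`) would also register data-only folders as packages. A quick way to see the difference from a source checkout (a sketch, not part of the commit):

from setuptools import find_namespace_packages, find_packages

# Folders picked up only by the namespace finder, i.e. directories without
# an __init__.py; with `find:` such files are shipped via package_data instead.
plain = set(find_packages(include=["sklearn*"]))
namespaced = set(find_namespace_packages(include=["sklearn*"]))
print(sorted(namespaced - plain))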
23 changes: 19 additions & 4 deletions setup.py
@@ -497,8 +497,24 @@ def configure_extension_modules():

is_pypy = platform.python_implementation() == "PyPy"
np_include = numpy.get_include()
default_libraries = ["m"] if os.name == "posix" else []
default_extra_compile_args = ["-O3"]

optimization_level = "O2"
if os.name == "posix":
default_extra_compile_args = [f"-{optimization_level}"]
default_libraries = ["m"]
else:
default_extra_compile_args = [f"/{optimization_level}"]
default_libraries = []

build_with_debug_symbols = (
os.environ.get("SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS", "0") != "0"
)
if os.name == "posix":
if build_with_debug_symbols:
default_extra_compile_args.append("-g")
else:
# Setting -g0 will strip symbols, reducing the binary size of extensions
default_extra_compile_args.append("-g0")

cython_exts = []
for submodule, extensions in extension_config.items():
@@ -608,9 +624,8 @@ def setup_package():
cmdclass=cmdclass,
python_requires=python_requires,
install_requires=min_deps.tag_to_packages["install"],
package_data={"": ["*.pxd"]},
package_data={"": ["*.csv", "*.gz", "*.txt", "*.pxd", "*.rst", "*.jpg"]},
zip_safe=False, # the package can run out of an .egg file
include_package_data=True,
extras_require={
key: min_deps.tag_to_packages[key]
for key in ["examples", "docs", "tests", "benchmark"]
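A standalone sketch of the debug-symbol switch in the hunk above (POSIX branch only; the helper name is illustrative, not part of the commit):

def posix_extra_compile_args(environ):
    args = ["-O2"]
    if environ.get("SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS", "0") != "0":
        args.append("-g")   # keep debug symbols in the compiled extensions
    else:
        args.append("-g0")  # strip symbols, reducing the binary size of extensions
    return args

print(posix_extra_compile_args({}))                                           # ['-O2', '-g0']
print(posix_extra_compile_args({"SKLEARN_BUILD_ENABLE_DEBUG_SYMBOLS": "1"}))  # ['-O2', '-g']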
2 changes: 1 addition & 1 deletion sklearn/__init__.py
@@ -39,7 +39,7 @@
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
__version__ = "1.2.0rc1"
__version__ = "1.2.0"


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
23 changes: 3 additions & 20 deletions sklearn/decomposition/_nmf.py
@@ -890,25 +890,7 @@ def _fit_multiplicative_update(
"X": ["array-like", "sparse matrix"],
"W": ["array-like", None],
"H": ["array-like", None],
"n_components": [Interval(Integral, 1, None, closed="left"), None],
"init": [
StrOptions({"random", "nndsvd", "nndsvda", "nndsvdar", "custom"}),
None,
],
"update_H": ["boolean"],
"solver": [StrOptions({"mu", "cd"})],
"beta_loss": [
StrOptions({"frobenius", "kullback-leibler", "itakura-saito"}),
Real,
],
"tol": [Interval(Real, 0, None, closed="left")],
"max_iter": [Interval(Integral, 1, None, closed="left")],
"alpha_W": [Interval(Real, 0, None, closed="left")],
"alpha_H": [Interval(Real, 0, None, closed="left"), StrOptions({"same"})],
"l1_ratio": [Interval(Real, 0, 1, closed="both")],
"random_state": ["random_state"],
"verbose": ["verbose"],
"shuffle": ["boolean"],
}
)
def non_negative_factorization(
@@ -1107,8 +1089,6 @@ def non_negative_factorization(
>>> W, H, n_iter = non_negative_factorization(
... X, n_components=2, init='random', random_state=0)
"""
X = check_array(X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32])

est = NMF(
n_components=n_components,
init=init,
@@ -1123,6 +1103,9 @@ def non_negative_factorization(
verbose=verbose,
shuffle=shuffle,
)
est._validate_params()

X = check_array(X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32])

with config_context(assume_finite=True):
W, H, n_iter = est._fit_transform(X, W=W, H=H, update_H=update_H)
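A sketch of the effect of the reordering above: hyperparameters are now validated through the `NMF` estimator itself, raising `InvalidParameterError` (a `ValueError` subclass) before the input array is checked. The data below is arbitrary:

import numpy as np
from sklearn.decomposition import non_negative_factorization

X = np.abs(np.random.RandomState(0).standard_normal((6, 5)))
try:
    non_negative_factorization(X, n_components=0, init="random", random_state=0)
except ValueError as exc:
    print(exc)  # "The 'n_components' parameter of NMF must be ..."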
7 changes: 5 additions & 2 deletions sklearn/ensemble/_gb.py
@@ -488,8 +488,11 @@ def fit(self, X, y, sample_weight=None, monitor=None):
try:
self.init_.fit(X, y, sample_weight=sample_weight)
except TypeError as e:
# regular estimator without SW support
raise ValueError(msg) from e
if "unexpected keyword argument 'sample_weight'" in str(e):
# regular estimator without SW support
raise ValueError(msg) from e
else: # regular estimator whose input checking failed
raise
except ValueError as e:
if (
"pass parameters to specific steps of "
15 changes: 9 additions & 6 deletions sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -270,18 +270,21 @@ def _check_categories(self, X):
if missing.any():
categories = categories[~missing]

if hasattr(self, "feature_names_in_"):
feature_name = f"'{self.feature_names_in_[f_idx]}'"
else:
feature_name = f"at index {f_idx}"

if categories.size > self.max_bins:
raise ValueError(
f"Categorical feature at index {f_idx} is "
"expected to have a "
f"cardinality <= {self.max_bins}"
f"Categorical feature {feature_name} is expected to "
f"have a cardinality <= {self.max_bins}"
)

if (categories >= self.max_bins).any():
raise ValueError(
f"Categorical feature at index {f_idx} is "
"expected to be encoded with "
f"values < {self.max_bins}"
f"Categorical feature {feature_name} is expected to "
f"be encoded with values < {self.max_bins}"
)
else:
categories = None
sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
@@ -58,10 +58,6 @@ def _make_dumb_dataset(n_samples):
@pytest.mark.parametrize(
"params, err_msg",
[
(
{"interaction_cst": "string"},
"",
),
(
{"interaction_cst": [0, 1]},
"Interaction constraints must be a sequence of tuples or lists",
@@ -1141,20 +1137,32 @@ def test_categorical_spec_no_categories(Est, categorical_features, as_array):
@pytest.mark.parametrize(
"Est", (HistGradientBoostingClassifier, HistGradientBoostingRegressor)
)
def test_categorical_bad_encoding_errors(Est):
@pytest.mark.parametrize(
"use_pandas, feature_name", [(False, "at index 0"), (True, "'f0'")]
)
def test_categorical_bad_encoding_errors(Est, use_pandas, feature_name):
# Test errors when categories are encoded incorrectly

gb = Est(categorical_features=[True], max_bins=2)

X = np.array([[0, 1, 2]]).T
if use_pandas:
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"f0": [0, 1, 2]})
else:
X = np.array([[0, 1, 2]]).T
y = np.arange(3)
msg = "Categorical feature at index 0 is expected to have a cardinality <= 2"
msg = f"Categorical feature {feature_name} is expected to have a cardinality <= 2"
with pytest.raises(ValueError, match=msg):
gb.fit(X, y)

X = np.array([[0, 2]]).T
if use_pandas:
X = pd.DataFrame({"f0": [0, 2]})
else:
X = np.array([[0, 2]]).T
y = np.arange(2)
msg = "Categorical feature at index 0 is expected to be encoded with values < 2"
msg = (
f"Categorical feature {feature_name} is expected to be encoded with values < 2"
)
with pytest.raises(ValueError, match=msg):
gb.fit(X, y)

7 changes: 4 additions & 3 deletions sklearn/ensemble/tests/test_gradient_boosting.py
@@ -27,6 +27,7 @@
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import skip_if_32bit
from sklearn.utils._param_validation import InvalidParameterError
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import NotFittedError
from sklearn.dummy import DummyClassifier, DummyRegressor
@@ -1265,14 +1266,14 @@ def test_gradient_boosting_with_init_pipeline():

# Passing sample_weight to a pipeline raises a ValueError. This test makes
# sure we make the distinction between ValueError raised by a pipeline that
# was passed sample_weight, and a ValueError raised by a regular estimator
# whose input checking failed.
# was passed sample_weight, and an InvalidParameterError raised by a regular
# estimator whose input checking failed.
invalid_nu = 1.5
err_msg = (
"The 'nu' parameter of NuSVR must be a float in the"
f" range (0.0, 1.0]. Got {invalid_nu} instead."
)
with pytest.raises(ValueError, match=re.escape(err_msg)):
with pytest.raises(InvalidParameterError, match=re.escape(err_msg)):
# Note that NuSVR properly supports sample_weight
init = NuSVR(gamma="auto", nu=invalid_nu)
gb = GradientBoostingRegressor(init=init)
12 changes: 11 additions & 1 deletion sklearn/inspection/_plot/decision_boundary.py
@@ -6,7 +6,11 @@
from ...utils import check_matplotlib_support
from ...utils import _safe_indexing
from ...base import is_regressor
from ...utils.validation import check_is_fitted, _is_arraylike_not_scalar
from ...utils.validation import (
check_is_fitted,
_is_arraylike_not_scalar,
_num_features,
)


def _check_boundary_response_method(estimator, response_method):
@@ -316,6 +320,12 @@ def from_estimator(
f"Got {plot_method} instead."
)

num_features = _num_features(X)
if num_features != 2:
raise ValueError(
f"n_features must be equal to 2. Got {num_features} instead."
)

x0, x1 = _safe_indexing(X, 0, axis=1), _safe_indexing(X, 1, axis=1)

x0_min, x0_max = x0.min() - eps, x0.max() + eps
10 changes: 10 additions & 0 deletions sklearn/inspection/_plot/tests/test_boundary_decision_display.py
@@ -38,6 +38,16 @@ def fitted_clf():
return LogisticRegression().fit(X, y)


def test_input_data_dimension(pyplot):
"""Check that we raise an error when `X` does not have exactly 2 features."""
X, y = make_classification(n_samples=10, n_features=4, random_state=0)

clf = LogisticRegression().fit(X, y)
msg = "n_features must be equal to 2. Got 4 instead."
with pytest.raises(ValueError, match=msg):
DecisionBoundaryDisplay.from_estimator(estimator=clf, X=X)


def test_check_boundary_response_method_auto():
"""Check _check_boundary_response_method behavior with 'auto'."""

