From cadafb79bad52dc552ec4bdd76286b5a2ace42c4 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 11 Apr 2020 19:10:21 +0200 Subject: [PATCH 001/125] DOC Fix grammar and clarify VotingRegressor (#16896) --- sklearn/ensemble/_voting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index cab321702c85d..8d2bbbe8c2b8a 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -350,8 +350,8 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.21 - A voting regressor is an ensemble meta-estimator that fits base - regressors each on the whole dataset. It, then, averages the individual + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction. Read more in the :ref:`User Guide `. From 8122e77bee8414c787f4bcd730673d2c0e137d06 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 11 Apr 2020 19:15:23 +0200 Subject: [PATCH 002/125] DOC Fix typos, wording in plot_gradient_boosting_regression.py (#16894) --- .../plot_gradient_boosting_regression.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 3dbe7dbaac296..860bb14687534 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -11,7 +11,7 @@ and 500 regression trees of depth 4. Note: For larger datasets (n_samples >= 10000), please refer to -:class:`sklearn.ensemble.HistGradientBoostingRegressor` +:class:`sklearn.ensemble.HistGradientBoostingRegressor`. """ print(__doc__) @@ -32,8 +32,7 @@ # Load the data # ------------------------------------- # -# First we need to load the data. We set random state to be consistent with the -# result. +# First we need to load the data. diabetes = datasets.load_diabetes() X, y = diabetes.data, diabetes.target @@ -43,13 +42,11 @@ # ------------------------------------- # # Next, we will split our dataset to use 90% for training and leave the rest -# for testing. We will also prepare the parameters we want to use to fit our -# regression model. You can play with those parameters to see how the -# results change: +# for testing. We will also set the regression model parameters. You can play +# with these parameters to see how the results change. # -# n_estimators : the number of boosting stages which will be performed. -# Later, we will plot and see how the deviance changes with those boosting -# operations. +# n_estimators : the number of boosting stages that will be performed. +# Later, we will plot deviance against boosting iterations. # # max_depth : limits the number of nodes in the tree. # The best value depends on the interaction of the input variables. @@ -57,12 +54,11 @@ # min_samples_split : the minimum number of samples required to split an # internal node. # -# learning_rate : how much the contribution of each tree will shrink +# learning_rate : how much the contribution of each tree will shrink. # -# loss : here, we decided to use least squeares as a loss function. -# However there are many other options (check -# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are -# other possibilities) +# loss : loss function to optimize. 
The least squares function is used in this +# case however, there are many other options (see +# :class:`~sklearn.ensemble.GradientBoostingRegressor` ). X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=13) @@ -80,10 +76,10 @@ # Now we will initiate the gradient boosting regressors and fit it with our # training data. Let's also look and the mean squared error on the test data. -clf = ensemble.GradientBoostingRegressor(**params) -clf.fit(X_train, y_train) +reg = ensemble.GradientBoostingRegressor(**params) +reg.fit(X_train, y_train) -mse = mean_squared_error(y_test, clf.predict(X_test)) +mse = mean_squared_error(y_test, reg.predict(X_test)) print("The mean squared error (MSE) on test set: {:.4f}".format(mse)) ############################################################################## @@ -91,16 +87,16 @@ # ------------------------------------- # # Finally, we will visualize the results. To do that we will first compute the -# test set deviance and then plot it. +# test set deviance and then plot it against boosting iterations. test_score = np.zeros((params['n_estimators'],), dtype=np.float64) -for i, y_pred in enumerate(clf.staged_predict(X_test)): - test_score[i] = clf.loss_(y_test, y_pred) +for i, y_pred in enumerate(reg.staged_predict(X_test)): + test_score[i] = reg.loss_(y_test, y_pred) fig = plt.figure(figsize=(6, 6)) plt.subplot(1, 1, 1) plt.title('Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', +plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-', label='Training Set Deviance') plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance') @@ -116,16 +112,16 @@ # # Careful, impurity-based feature importances can be misleading for # high cardinality features (many unique values). As an alternative, -# the permutation importances of ``clf`` are computed on a +# the permutation importances of ``reg`` can be computed on a # held out test set. See :ref:`permutation_importance` for more details. # -# In this case, the two methods agree to identify the same top 2 features -# as strongly predictive features but not in the same order. The third most +# For this example, the impurity-based and permutation methods identify the +# same 2 strongly predictive features but not in the same order. The third most # predictive feature, "bp", is also the same for the 2 methods. The remaining # features are less predictive and the error bars of the permutation plot # show that they overlap with 0. 
-feature_importance = clf.feature_importances_ +feature_importance = reg.feature_importances_ sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 fig = plt.figure(figsize=(12, 6)) @@ -134,7 +130,7 @@ plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) plt.title('Feature Importance (MDI)') -result = permutation_importance(clf, X_test, y_test, n_repeats=10, +result = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() plt.subplot(1, 2, 2) From c2b31ac21b8780498e11a42744212231b3fefaa6 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 10:42:15 +0200 Subject: [PATCH 003/125] DOC replace Boston in _classes.py (#16892) * replace boston * fix score --- sklearn/tree/_classes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index fe77610a20601..f252ba0acbb1c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1152,16 +1152,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Examples -------- - >>> from sklearn.datasets import load_boston + >>> from sklearn.datasets import load_diabetes >>> from sklearn.model_selection import cross_val_score >>> from sklearn.tree import DecisionTreeRegressor - >>> X, y = load_boston(return_X_y=True) + >>> X, y = load_diabetes(return_X_y=True) >>> regressor = DecisionTreeRegressor(random_state=0) >>> cross_val_score(regressor, X, y, cv=10) ... # doctest: +SKIP ... - array([ 0.61..., 0.57..., -0.34..., 0.41..., 0.75..., - 0.07..., 0.29..., 0.33..., -1.42..., -1.77...]) + array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., + 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ def __init__(self, criterion="mse", @@ -1697,18 +1697,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Examples -------- - >>> from sklearn.datasets import load_boston + >>> from sklearn.datasets import load_diabetes >>> from sklearn.model_selection import train_test_split >>> from sklearn.ensemble import BaggingRegressor >>> from sklearn.tree import ExtraTreeRegressor - >>> X, y = load_boston(return_X_y=True) + >>> X, y = load_diabetes(return_X_y=True) >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, random_state=0) >>> extra_tree = ExtraTreeRegressor(random_state=0) >>> reg = BaggingRegressor(extra_tree, random_state=0).fit( ... X_train, y_train) >>> reg.score(X_test, y_test) - 0.7447... + 0.33... """ def __init__(self, criterion="mse", From a2d361bf40dbe7eee77a8c27aba779358ff96d0e Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 11:27:00 +0200 Subject: [PATCH 004/125] DOC Fix wording, typo in plot_voting_regressor.py (#16895) * wording, typo * add suggestions * add n est * remove n est --- examples/ensemble/plot_voting_regressor.py | 31 +++++++++++----------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py index 6fd629bb9c083..2587dee4352e9 100644 --- a/examples/ensemble/plot_voting_regressor.py +++ b/examples/ensemble/plot_voting_regressor.py @@ -5,28 +5,28 @@ .. currentmodule:: sklearn -A voting regressor is an ensemble meta-estimator that fits base regressors each -on the whole dataset. It, then, averages the individual predictions to form a -final prediction. +A voting regressor is an ensemble meta-estimator that fits several base +regressors, each on the whole dataset. 
Then it averages the individual +predictions to form a final prediction. We will use three different regressors to predict the data: :class:`~ensemble.GradientBoostingRegressor`, :class:`~ensemble.RandomForestRegressor`, and :class:`~linear_model.LinearRegression`). -Then, using them we will make voting regressor +Then the above 3 regressors will be used for the :class:`~ensemble.VotingRegressor`. -Finally, we will plot all of them for comparison. +Finally, we will plot the predictions made by all models for comparison. -We will work with the diabetes dataset which consists of the 10 features -collected from a cohort of diabetes patients. The target is the disease -progression after one year from the baseline. +We will work with the diabetes dataset which consists of 10 features +collected from a cohort of diabetes patients. The target is a quantitative +measure of disease progression one year after baseline. """ print(__doc__) import matplotlib.pyplot as plt -from sklearn import datasets +from sklearn.datasets import load_diabetes from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression @@ -36,11 +36,11 @@ # Training classifiers # -------------------------------- # -# First, we are going to load diabetes dataset and initiate gradient boosting -# regressor, random forest regressor and linear regression. Next, we are going -# to use each of them to build the voting regressor: +# First, we will load the diabetes dataset and initiate a gradient boosting +# regressor, a random forest regressor and a linear regression. Next, we will +# use the 3 regressors to build the voting regressor: -X, y = datasets.load_diabetes(return_X_y=True) +X, y = load_diabetes(return_X_y=True) # Train classifiers reg1 = GradientBoostingRegressor(random_state=1) @@ -58,8 +58,7 @@ # Making predictions # -------------------------------- # -# Now we will use each of the regressors to make 20 first predictions about the -# diabetes dataset. +# Now we will use each of the regressors to make the 20 first predictions. xt = X[:20] @@ -73,7 +72,7 @@ # -------------------------------- # # Finally, we will visualize the 20 predictions. The red stars show the average -# prediction +# prediction made by :class:`~ensemble.VotingRegressor`. plt.figure() plt.plot(pred1, 'gd', label='GradientBoostingRegressor') From 9cc55587067fa57688a28f9d819f0e0a1881882c Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 12:20:47 +0200 Subject: [PATCH 005/125] DOC remove boston from tutorial.rst (#16889) --- doc/tutorial/basic/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 082c5ffa3aa79..28e965bd925a5 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -77,8 +77,8 @@ Loading an example dataset `scikit-learn` comes with a few standard datasets, for instance the `iris `_ and `digits `_ -datasets for classification and the `boston house prices dataset -`_ for regression. +datasets for classification and the `diabetes dataset +`_ for regression. In the following, we start a Python interpreter from our shell and then load the ``iris`` and ``digits`` datasets. 
Our notational convention is that From 9901d8df131e06d8f6ba1677e10330cabfdeb245 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 14 Apr 2020 09:38:32 -0400 Subject: [PATCH 006/125] Fix _deprecate_positional_args for kwonly args w/o default (#16850) --- sklearn/utils/tests/test_validation.py | 9 +++++++++ sklearn/utils/validation.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5f6df9685a25c..b178ccc148d9d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1097,6 +1097,15 @@ def f2(a=1, *, b=1, c=1, d=1): match=r"Pass b=2 as keyword args"): f2(1, 2) + # The * is place before a keyword only argument without a default value + @_deprecate_positional_args + def f3(a, *, b, c=1, d=1): + pass + + with pytest.warns(FutureWarning, + match=r"Pass b=2 as keyword args"): + f3(1, 2) + def test_deprecate_positional_args_warns_for_class(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4bb50c3deb5e7..953584fff0f8a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1301,7 +1301,7 @@ def inner_f(*args, **kwargs): "passing these as positional arguments will " "result in an error".format(", ".join(args_msg)), FutureWarning) - kwargs.update({k: arg for k, arg in zip(all_args, args)}) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) return f(**kwargs) return inner_f From bd9fd0f1a9a222c58bbf8aba45025d42c598a31e Mon Sep 17 00:00:00 2001 From: Kevin Markham Date: Tue, 14 Apr 2020 09:40:15 -0400 Subject: [PATCH 007/125] DOC Minor updates to the Decision Tree User Guide (#16905) --- doc/modules/tree.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index ecd037d0631ac..af6fc4e1edfe9 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -56,9 +56,9 @@ The disadvantages of decision trees include: - Decision-tree learners can create over-complex trees that do not generalise the data well. This is called overfitting. Mechanisms - such as pruning (not currently supported), setting the minimum - number of samples required at a leaf node or setting the maximum - depth of the tree are necessary to avoid this problem. + such as pruning, setting the minimum number of samples required + at a leaf node or setting the maximum depth of the tree are + necessary to avoid this problem. - Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. @@ -124,10 +124,10 @@ Using the Iris dataset, we can construct a tree as follows:: >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) -Once trained, you can plot the tree with the plot_tree function:: +Once trained, you can plot the tree with the :func:`plot_tree` function:: - >>> tree.plot_tree(clf.fit(iris.data, iris.target)) # doctest: +SKIP + >>> tree.plot_tree(clf) # doctest: +SKIP .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png :target: ../auto_examples/tree/plot_iris_dtc.html @@ -137,10 +137,7 @@ Once trained, you can plot the tree with the plot_tree function:: We can also export the tree in `Graphviz `_ format using the :func:`export_graphviz` exporter. 
If you use the `conda `_ package manager, the graphviz binaries - -and the python package can be installed with - - conda install python-graphviz +and the python package can be installed with `conda install python-graphviz`. Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, and the Python wrapper installed from pypi with `pip install graphviz`. @@ -188,7 +185,7 @@ of external libraries and is more compact: >>> from sklearn.datasets import load_iris >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.tree.export import export_text + >>> from sklearn.tree import export_text >>> iris = load_iris() >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) >>> decision_tree = decision_tree.fit(iris.data, iris.target) From 5e2d74bc5f61b382758c0403c577253539e77156 Mon Sep 17 00:00:00 2001 From: Noa Tamir <6564007+noatamir@users.noreply.github.com> Date: Wed, 15 Apr 2020 10:36:51 +0200 Subject: [PATCH 008/125] DOC add versionadded versionchanged v0.19 (#16233) * added v0.19.1 and wip v0.19 * finished adding vchanged strings for v0.19 Towards #15426 @adrinjalali #wimlds #scikitlearnsprint * fixing linter issues * caught line issues with flake8 * caught the last line issue * added lines and cleaned gtiignore * Update sklearn/multiclass.py Co-Authored-By: Thomas J Fan * Update sklearn/multiclass.py Co-Authored-By: Thomas J Fan Co-authored-by: Thomas J Fan --- sklearn/decomposition/_lda.py | 3 +++ sklearn/feature_extraction/_hash.py | 4 ++++ sklearn/model_selection/_search.py | 10 ++++++++++ sklearn/model_selection/_validation.py | 5 +++++ sklearn/multiclass.py | 8 ++++++++ sklearn/multioutput.py | 5 +++++ sklearn/neighbors/_nearest_centroid.py | 3 +++ 7 files changed, 38 insertions(+) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 641e68cd7fc8b..a6e253aab1e6e 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -143,6 +143,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): n_components : int, optional (default=10) Number of topics. + .. versionchanged:: 0.19 + ``n_topics `` was renamed to ``n_components`` + doc_topic_prior : float, optional (default=None) Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index f52e6f296169b..d5cfa913991b6 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -69,6 +69,10 @@ class FeatureHasher(TransformerMixin, BaseEstimator): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. + .. versionchanged:: 0.19 + ``alternate_sign`` replaces the now deprecated ``non_negative`` + parameter. + Examples -------- >>> from sklearn.feature_extraction import FeatureHasher diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3e5b85ed73a02..d283dc2f0b483 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1002,6 +1002,11 @@ class GridSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. 
versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Examples -------- @@ -1338,6 +1343,11 @@ class RandomizedSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ae6151a88727b..180c48fc99762 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -134,6 +134,11 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + return_estimator : bool, default=False Whether to return the estimators fitted on each split. diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 9eeb4248f83fd..ae17d998882ea 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -390,6 +390,10 @@ def decision_function(self, X): Returns ------- T : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) if len(self.estimators_) == 1: @@ -643,6 +647,10 @@ def decision_function(self, X): Returns ------- Y : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 2f8976a86c8b8..8f94a0ae634da 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -362,6 +362,11 @@ def predict_proba(self): such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. + + .. versionchanged:: 0.19 + This function now returns a list of arrays where the length of + the list is ``n_outputs``, and each array is (``n_samples``, + ``n_classes``) for that particular output. """ check_is_fitted(self) if not all([hasattr(estimator, "predict_proba") diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 0fdcd597353f5..bf00d8b8f88d2 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -41,6 +41,9 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): If the "manhattan" metric is provided, this centroid is the median and for all other metrics, the centroid is now set to be the mean. + .. versionchanged:: 0.19 + ``metric='precomputed'`` was deprecated and now raises an error + shrink_threshold : float, default=None Threshold for shrinking centroids to remove features. 
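
For reference, the version directives added throughout the patch above follow
the Sphinx convention used in scikit-learn docstrings: ``.. versionadded::``
marks a newly introduced parameter or behaviour and ``.. versionchanged::``
records a change to an existing one, each indented under the entry it
documents. A minimal sketch (hypothetical estimator, for illustration only)::

    class ExampleEstimator:
        """Hypothetical estimator showing where the directives are placed.

        Parameters
        ----------
        alpha : float, default=1.0
            Regularization strength.

            .. versionadded:: 0.19

        solver : str, default='auto'
            Solver to use.

            .. versionchanged:: 0.19
               Default value was changed from ``'lbfgs'`` to ``'auto'``.
        """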
From 6cd77c2c50792127d71cccc8a1296cb8ee178960 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 15 Apr 2020 18:11:26 +0200 Subject: [PATCH 009/125] API make feature_extraction's constructors' params kwonly (#16866) --- doc/modules/feature_extraction.rst | 2 +- sklearn/feature_extraction/_dict_vectorizer.py | 5 +++-- sklearn/feature_extraction/_hash.py | 5 +++-- sklearn/feature_extraction/image.py | 6 ++++-- .../tests/test_feature_hasher.py | 2 +- sklearn/feature_extraction/text.py | 17 +++++++++-------- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 084e110f5c702..cedc43c23c16c 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -1019,7 +1019,7 @@ The :class:`PatchExtractor` class works in the same way as implemented as an estimator, so it can be used in pipelines. See:: >>> five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3) - >>> patches = image.PatchExtractor((2, 2)).transform(five_images) + >>> patches = image.PatchExtractor(patch_size=(2, 2)).transform(five_images) >>> patches.shape (45, 2, 2, 3) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b527b0d72e6be..303e34d6f0ab9 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence +from ..utils.validation import _deprecate_positional_args def _tosequence(X): @@ -89,8 +90,8 @@ class DictVectorizer(TransformerMixin, BaseEstimator): sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical features encoded as columns of arbitrary data types. """ - - def __init__(self, dtype=np.float64, separator="=", sparse=True, + @_deprecate_positional_args + def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): self.dtype = dtype self.separator = separator diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index d5cfa913991b6..b9c2abaa25a72 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,6 +7,7 @@ import scipy.sparse as sp from ..utils import IS_PYPY +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -88,8 +89,8 @@ class FeatureHasher(TransformerMixin, BaseEstimator): DictVectorizer : vectorizes string-valued features using a hash table. sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features. 
""" - - def __init__(self, n_features=(2 ** 20), input_type="dict", + @_deprecate_positional_args + def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 588abf3fcf896..737f555bbccda 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state, deprecated +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -519,8 +520,9 @@ class PatchExtractor(BaseEstimator): >>> print('Patches shape: {}'.format(pe_trans.shape)) Patches shape: (545706, 2, 2) """ - - def __init__(self, patch_size=None, max_patches=None, random_state=None): + @_deprecate_positional_args + def __init__(self, *, patch_size=None, max_patches=None, + random_state=None): self.patch_size = patch_size self.max_patches = max_patches self.random_state = random_state diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 9fa7a191ca279..c0cd50cef6e09 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -32,7 +32,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, input_type="string", + h = FeatureHasher(n_features=n_features, input_type="string", alternate_sign=False) X = h.transform(it) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ebc584b6271a9..27c5eb437805b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -33,6 +33,7 @@ from ..utils import _IS_32BIT, deprecated from ..utils.fixes import _astype_copy_false from ..exceptions import NotFittedError +from ..utils.validation import _deprecate_positional_args __all__ = ['HashingVectorizer', @@ -677,8 +678,8 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -999,8 +1000,8 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -1409,8 +1410,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Introduction to Information Retrieval. Cambridge University Press, pp. 118-120. 
""" - - def __init__(self, norm='l2', use_idf=True, smooth_idf=True, + @_deprecate_positional_args + def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False): self.norm = norm self.use_idf = use_idf @@ -1715,8 +1716,8 @@ class TfidfVectorizer(CountVectorizer): >>> print(X.shape) (4, 9) """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", From cb9ddbb91fad3c663ae0770d07d5e96601a66875 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Wed, 15 Apr 2020 19:30:28 +0200 Subject: [PATCH 010/125] TST Replace boston in ensemble test_bagging (#16921) --- sklearn/ensemble/tests/test_bagging.py | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 883f0067f5e78..3e8401332aeef 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -29,7 +29,7 @@ from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SelectKBest from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris, make_hastie_10_2 +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.utils import check_random_state from sklearn.preprocessing import FunctionTransformer @@ -44,12 +44,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @@ -140,8 +140,8 @@ def fit(self, X, y): def test_regression(): # Check regression for various parameter settings. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "max_features": [0.5, 1.0], @@ -162,8 +162,8 @@ def test_regression(): def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) class CustomSVR(SVR): @@ -229,8 +229,8 @@ def fit(self, X, y): def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) base_estimator = DecisionTreeRegressor().fit(X_train, y_train) @@ -268,8 +268,8 @@ def test_bootstrap_samples(): def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -278,7 +278,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] == np.unique(features).shape[0] + assert diabetes.data.shape[1] == np.unique(features).shape[0] ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=1.0, @@ -286,7 +286,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] > np.unique(features).shape[0] + assert diabetes.data.shape[1] > np.unique(features).shape[0] def test_probability(): @@ -355,8 +355,8 @@ def test_oob_score_regression(): # Check that oob prediction is a good estimation of the generalization # error. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -383,8 +383,8 @@ def test_oob_score_regression(): def test_single_estimator(): # Check singleton ensembles. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(), @@ -488,8 +488,8 @@ def test_parallel_regression(): # Check parallel regression. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(DecisionTreeRegressor(), @@ -553,8 +553,8 @@ def test_base_estimator(): assert isinstance(ensemble.base_estimator_, Perceptron) # Regression - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(None, From 9d366a4f745abc57f3d6c3ffe148cd88d5d415fc Mon Sep 17 00:00:00 2001 From: Geoffrey Bolmier Date: Wed, 15 Apr 2020 13:38:22 -0400 Subject: [PATCH 011/125] ENH Add custom loss support for HistGradientBoosting (#16908) --- .../gradient_boosting.py | 10 +++++++-- .../tests/test_gradient_boosting.py | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 796f4f060dda5..6087adb0b6575 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -23,6 +23,7 @@ from .binning import _BinMapper from .grower import TreeGrower from .loss import _LOSSES +from .loss import BaseLoss class BaseHistGradientBoosting(BaseEstimator, ABC): @@ -58,7 +59,8 @@ def _validate_parameters(self): The parameters that are directly passed to the grower are checked in TreeGrower.""" - if self.loss not in self._VALID_LOSSES: + if (self.loss not in self._VALID_LOSSES and + not isinstance(self.loss, BaseLoss)): raise ValueError( "Loss {} is not supported for {}. Accepted losses: " "{}.".format(self.loss, self.__class__.__name__, @@ -150,7 +152,11 @@ def fit(self, X, y, sample_weight=None): # data. 
self._in_fit = True - self.loss_ = self._get_loss(sample_weight=sample_weight) + if isinstance(self.loss, str): + self.loss_ = self._get_loss(sample_weight=sample_weight) + elif isinstance(self.loss, BaseLoss): + self.loss_ = self.loss + if self.early_stopping == 'auto': self.do_early_stopping_ = n_samples > 10000 else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 1b61e65793422..6fc412942d180 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -12,6 +12,8 @@ from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares +from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -681,3 +683,22 @@ def test_single_node_trees(Est): for predictor in est._predictors) # Still gives correct predictions thanks to the baseline prediction assert_allclose(est.predict(X), y) + + +@pytest.mark.parametrize('Est, loss, X, y', [ + ( + HistGradientBoostingClassifier, + BinaryCrossEntropy(sample_weight=None), + X_classification, + y_classification + ), + ( + HistGradientBoostingRegressor, + LeastSquares(sample_weight=None), + X_regression, + y_regression + ) +]) +def test_custom_loss(Est, loss, X, y): + est = Est(loss=loss, max_iter=20) + est.fit(X, y) From 9358a6ee8f93511fd615d3264fa7ee9de0f21b93 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Wed, 15 Apr 2020 20:51:04 +0200 Subject: [PATCH 012/125] DOC Update random_state description for Multiclass (#16839) --- sklearn/multiclass.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index ae17d998882ea..96ec40743fe2c 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -707,10 +707,9 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): than one-vs-the-rest. random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the codebook. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + The generator used to initialize the codebook. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. 
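
The reproducibility behaviour described by the reworded ``random_state``
entry above can be exercised directly; a minimal sketch (illustrative only,
not part of the patch)::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OutputCodeClassifier

    X, y = load_iris(return_X_y=True)
    # fixing random_state fixes the randomly generated codebook, so repeated
    # calls give the same fitted model
    clf = OutputCodeClassifier(LogisticRegression(max_iter=1000),
                               code_size=2, random_state=0)
    print(clf.fit(X, y).score(X, y))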
From abfb6fd11e97cefd1947078646399cecec5bbe9c Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 17 Apr 2020 11:14:21 -0400 Subject: [PATCH 013/125] [MRG] Add jitter to LassoLars (#15179) * Adding jitter to LassoLars fit * CircleCI fail * MR comments * Jitter becomes default, added test based on issue description * flake8 fixes * Removing unexpected cython files * Better coverage * PR comments * PR comments * PR comments * PR comments * PR comments * Linting * Apply suggestions from code review * addressed comments * added whatnew entry * test both estimators * update whatsnew * removed random_state for lassolarsIC Co-authored-by: Nicolas Hug --- doc/whats_new/v0.23.rst | 5 +++ sklearn/linear_model/_least_angle.py | 36 +++++++++++++++++-- .../linear_model/tests/test_least_angle.py | 24 +++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4c489c1887815..c4fa3818aa1bc 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,6 +298,11 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index a3781cf981710..bc71d7a1fccbd 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -21,6 +21,7 @@ from ..base import RegressorMixin, MultiOutputMixin # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore +from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning @@ -800,6 +801,16 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -846,7 +857,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True): + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + jitter=None, random_state=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -855,6 +867,8 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.eps = eps self.copy_X = copy_X self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): @@ -954,6 +968,12 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter + if self.jitter is not None: + rng = check_random_state(self.random_state) + + noise = rng.uniform(high=self.jitter, size=len(y)) + y = y + noise + self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1031,6 +1051,16 @@ class LassoLars(Lars): algorithm are typically in congruence with the solution of the coordinate descent Lasso estimator. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -1083,7 +1113,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False): + positive=False, jitter=None, random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1094,6 +1124,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.copy_X = copy_X self.eps = eps self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state ############################################################################### diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 6e7c1fb37096a..e198dfb15e323 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -6,6 +6,7 @@ import pytest from scipy import linalg +from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -17,6 +18,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path +from sklearn.linear_model import Lars, LassoLars # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -733,6 +735,28 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) +@pytest.mark.parametrize('est', (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): + # Test that a small amount of jitter helps stability, + # using example provided in 
issue #2746 + + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], + [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] + + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. + est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) + + est.fit(X, y) + est_jitter.fit(X, y) + + assert np.mean((est.coef_ - est_jitter.coef_)**2) > .1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) + + def test_X_none_gram_not_none(): with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): From 269afa3a77972e883aa1d64081b8f25d1819d5ac Mon Sep 17 00:00:00 2001 From: Mariana Meireles Date: Fri, 17 Apr 2020 17:32:15 +0200 Subject: [PATCH 014/125] DOC Fixed Plot Mnist Example (#16200) --- examples/neural_networks/plot_mnist_filters.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 57314a218f6ee..33f421a226c33 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -18,10 +18,16 @@ To make the example run faster, we use very few hidden units, and train only for a very short time. Training longer would result in weights with a much -smoother spatial appearance. +smoother spatial appearance. The example will throw a warning because it +doesn't converge, in this case this is what we want because of CI's time +constraints. """ + +import warnings + import matplotlib.pyplot as plt from sklearn.datasets import fetch_openml +from sklearn.exceptions import ConvergenceWarning from sklearn.neural_network import MLPClassifier print(__doc__) @@ -38,7 +44,13 @@ solver='sgd', verbose=10, random_state=1, learning_rate_init=.1) -mlp.fit(X_train, y_train) +# this example won't converge because of CI's time constraints, so we catch the +# warning and are ignore it here +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning, + module="sklearn") + mlp.fit(X_train, y_train) + print("Training set score: %f" % mlp.score(X_train, y_train)) print("Test set score: %f" % mlp.score(X_test, y_test)) From 7a77214c3cf2240b43873c62f11a36a85ebe36a6 Mon Sep 17 00:00:00 2001 From: Hao Chun Chang Date: Fri, 17 Apr 2020 23:33:24 +0800 Subject: [PATCH 015/125] DOC Improve neighbors documentation (#16923) --- sklearn/neighbors/_binary_tree.pxi | 48 ++++++++++++++++++++++++++--- sklearn/neighbors/_dist_metrics.pyx | 4 ++- sklearn/neighbors/_lof.py | 17 +++++----- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index ef6a2a2d5d330..599a4e9cc6426 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -239,9 +239,10 @@ cdef NodeData_t[::1] get_memview_NodeData_1D( # Define doc strings, substituting the appropriate class name using # the DOC_DICT variable defined in the pyx files. CLASS_DOC = \ -"""{BinaryTree} for fast generalized N-point problems +""" +{BinaryTree}(X, leaf_size=40, metric='minkowski', **kwargs) -{BinaryTree}(X, leaf_size=40, metric='minkowski', \\**kwargs) +{BinaryTree} for fast generalized N-point problems Parameters ---------- @@ -1159,15 +1160,50 @@ cdef class BinaryTree: self._update_memviews() def get_tree_stats(self): + """ + get_tree_stats(self) + + Get tree status. 
+ + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ return (self.n_trims, self.n_leaves, self.n_splits) def reset_n_calls(self): + """ + reset_n_calls(self) + + Reset number of calls to 0. + """ self.n_calls = 0 def get_n_calls(self): + """ + get_n_calls(self) + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ return self.n_calls def get_arrays(self): + """ + get_arrays(self) + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. + """ return (self.data_arr, self.idx_array_arr, self.node_data_arr, self.node_bounds_arr) @@ -1362,7 +1398,8 @@ cdef class BinaryTree: def query_radius(self, X, r, int return_distance=False, int count_only=False, int sort_results=False): """ - query_radius(self, X, r, count_only = False): + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) query the tree for neighbors within a radius r @@ -1694,7 +1731,10 @@ cdef class BinaryTree: return np.exp(log_density_arr) def two_point_correlation(self, X, r, dualtree=False): - """Compute the two-point correlation function + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function Parameters ---------- diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 94c67f8ee9fa3..0c24efdd214e6 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -110,8 +110,10 @@ cdef class DistanceMetric: This class provides a uniform interface to fast distance metric functions. The various metrics can be accessed via the :meth:`get_metric` class method and the metric string identifier (see below). - For example, to use the Euclidean distance: + Examples + -------- + >>> from sklearn.neighbors import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index fc27b7ed69420..e03c4d9cb1e0e 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -176,8 +176,9 @@ def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, @property def fit_predict(self): - """"Fits the model to the training set X and returns the labels. + """Fits the model to the training set X and returns the labels. + **Only available for novelty detection (when novelty is set to True).** Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -207,7 +208,7 @@ def fit_predict(self): return self._fit_predict def _fit_predict(self, X, y=None): - """"Fits the model to the training set X and returns the labels. + """Fits the model to the training set X and returns the labels. Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -286,9 +287,9 @@ def fit(self, X, y=None): def predict(self): """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + **Only available for novelty detection (when novelty is set to True).** This method allows to generalize prediction to *new observations* (not - in the training set). Only available for novelty detection (when - novelty is set to True). + in the training set). Parameters ---------- @@ -345,8 +346,8 @@ def decision_function(self): Bigger is better, i.e. large values correspond to inliers. 
+ **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -381,8 +382,8 @@ def _decision_function(self, X): Bigger is better, i.e. large values correspond to inliers. + **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -411,7 +412,7 @@ def score_samples(self): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -447,7 +448,7 @@ def _score_samples(self, X): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any From 2d03d781a9f6333f1e3e1be452e37c3340396881 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Apr 2020 13:44:09 -0400 Subject: [PATCH 016/125] MNT simplify xfail check marking logic (#16949) Co-Authored-By: Roman Yurchak --- doc/developers/develop.rst | 29 ++------------ sklearn/base.py | 2 +- sklearn/decomposition/_sparse_pca.py | 2 +- sklearn/dummy.py | 2 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/svm/_classes.py | 2 +- sklearn/utils/estimator_checks.py | 56 ++++++++++++++++++---------- 7 files changed, 46 insertions(+), 49 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 96aa942fb9238..d8ae6dd224840 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -246,7 +246,9 @@ whether it is just for you or for contributing it to scikit-learn, there are several internals of scikit-learn that you should be aware of in addition to the scikit-learn API outlined above. You can check whether your estimator adheres to the scikit-learn interface and standards by running -:func:`utils.estimator_checks.check_estimator` on the class:: +:func:`utils.estimator_checks.check_estimator` on the class or using +:func:`~sklearn.utils.parametrize_with_checks` pytest decorator (see its +docstring for details and possible interactions with `pytest`):: >>> from sklearn.utils.estimator_checks import check_estimator >>> from sklearn.svm import LinearSVC @@ -257,29 +259,6 @@ interface might be that you want to use it together with model evaluation and selection tools such as :class:`model_selection.GridSearchCV` and :class:`pipeline.Pipeline`. 
-Setting `generate_only=True` returns a generator that yields (estimator, check) -tuples where the check can be called independently from each other, i.e. -`check(estimator)`. This allows all checks to be run independently and report -the checks that are failing. scikit-learn provides a pytest specific decorator, -:func:`~sklearn.utils.parametrize_with_checks`, making it easier to test -multiple estimators:: - - from sklearn.utils.estimator_checks import parametrize_with_checks - from sklearn.linear_model import LogisticRegression - from sklearn.tree import DecisionTreeRegressor - - @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) - def test_sklearn_compatible_estimator(estimator, check): - check(estimator) - -This decorator sets the `id` keyword in `pytest.mark.parameterize` exposing -the name of the underlying estimator and check in the test name. This allows -`pytest -k` to be used to specify which tests to run. - -.. code-block: bash - - pytest test_check_estimators.py -k check_estimators_fit_returns_self - Before detailing the required interface below, we describe two ways to achieve the correct interface more easily. @@ -538,7 +517,7 @@ _skip_test (default=False) whether to skip common tests entirely. Don't use this unless you have a *very good* reason. -_xfail_test (default=False) +_xfail_checks (default=False) dictionary ``{check_name : reason}`` of common checks to mark as a known failure, with the associated reason. Don't use this unless you have a *very good* reason. diff --git a/sklearn/base.py b/sklearn/base.py index 70dec8c030418..8a6041cc17982 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -33,7 +33,7 @@ 'stateless': False, 'multilabel': False, '_skip_test': False, - '_xfail_test': False, + '_xfail_checks': False, 'multioutput_only': False, 'binary_only': False, 'requires_fit': True} diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 888d5d79e1e4b..cf1f5a2608e1c 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -234,7 +234,7 @@ def transform(self, X): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { "check_methods_subset_invariance": "fails for the transform method" } diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 634943231860f..37e9145f7536c 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -358,7 +358,7 @@ def predict_log_proba(self, X): def _more_tags(self): return { 'poor_score': True, 'no_validation': True, - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the predict method' } diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 06e7cc71bad3c..03b69c656b4a3 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -375,7 +375,7 @@ def fit(self, X, y=None): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method' } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 46086729af35c..10975a6f8e4a2 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -855,7 +855,7 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.' 
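
For context (not shown in this diff), the renamed tag is exposed through the
estimator tags machinery that the checks below consume. A minimal sketch,
assuming the estimator touched in the hunk above is
:class:`~sklearn.svm.NuSVC`::

    from sklearn.svm import NuSVC

    # known check failures and their reasons now live under the
    # '_xfail_checks' key of the (private) estimator tags
    tags = NuSVC()._get_tags()
    print(tags['_xfail_checks'])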
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 34a0e25c7fcaa..eef9109fb56f5 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -359,38 +359,37 @@ def _generate_class_checks(Estimator): def _mark_xfail_checks(estimator, check, pytest): - """Mark estimator check pairs with xfail""" + """Mark (estimator, check) pairs with xfail according to the + _xfail_checks_ tag""" if isinstance(estimator, type): - # try to construct estimator to get tags, if it is unable to then - # return the estimator class + # try to construct estimator instance, if it is unable to then + # return the estimator class, ignoring the tag try: - xfail_checks = _safe_tags(_construct_instance(estimator), - '_xfail_test') + estimator = _construct_instance(estimator), except Exception: return estimator, check - else: - xfail_checks = _safe_tags(estimator, '_xfail_test') - - if not xfail_checks: - return estimator, check + xfail_checks = _safe_tags(estimator, '_xfail_checks') or {} check_name = _set_check_estimator_ids(check) - msg = xfail_checks.get(check_name, None) - if msg is None: + if check_name not in xfail_checks: + # check isn't part of the xfail_checks tags, just return it return estimator, check - - return pytest.param( - estimator, check, marks=pytest.mark.xfail(reason=msg)) + else: + # check is in the tag, mark it as xfail for pytest + reason = xfail_checks[check_name] + return pytest.param(estimator, check, + marks=pytest.mark.xfail(reason=reason)) def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. - The `id` of each test is set to be a pprint version of the estimator + The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. + This allows to use `pytest -k` to specify which tests to run:: - Read more in the :ref:`User Guide`. + pytest test_check_estimators.py -k check_estimators_fit_returns_self Parameters ---------- @@ -400,6 +399,17 @@ def parametrize_with_checks(estimators): Returns ------- decorator : `pytest.mark.parametrize` + + Examples + -------- + >>> from sklearn.utils.estimator_checks import parametrize_with_checks + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.tree import DecisionTreeRegressor + + >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) + >>> def test_sklearn_compatible_estimator(estimator, check): + >>> check(estimator) + """ import pytest @@ -419,7 +429,8 @@ def check_estimator(Estimator, generate_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, - shapes, etc. + shapes, etc, making sure that the estimator complies with `scikit-leanrn` + conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin from sklearn.base. @@ -428,7 +439,14 @@ def check_estimator(Estimator, generate_only=False): Classes currently have some additional tests that related to construction, while passing instances allows the testing of multiple options. - Read more in :ref:`rolling_your_own_estimator`. + Setting `generate_only=True` returns a generator that yields (estimator, + check) tuples where the check can be called independently from each + other, i.e. `check(estimator)`. 
This allows all checks to be run + independently and report the checks that are failing. + + scikit-learn provides a pytest specific decorator, + :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test + multiple estimators. Parameters ---------- From 522ecac61330887838722db9007c4be30ecd8744 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 17 Apr 2020 16:19:02 -0400 Subject: [PATCH 017/125] DOC Fix docstring issue in parametrize_with_checks (#16953) --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index eef9109fb56f5..e2a51e94653b7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -407,8 +407,8 @@ def parametrize_with_checks(estimators): >>> from sklearn.tree import DecisionTreeRegressor >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) - >>> def test_sklearn_compatible_estimator(estimator, check): - >>> check(estimator) + ... def test_sklearn_compatible_estimator(estimator, check): + ... check(estimator) """ import pytest From 5abd22f58f152a0a899f33bb22609cc085fbfdec Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 17 Apr 2020 20:03:15 -0400 Subject: [PATCH 018/125] FIX Bug in mark_xfail_checks (#16954) --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e2a51e94653b7..1623902b202f3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -365,7 +365,7 @@ def _mark_xfail_checks(estimator, check, pytest): # try to construct estimator instance, if it is unable to then # return the estimator class, ignoring the tag try: - estimator = _construct_instance(estimator), + estimator = _construct_instance(estimator) except Exception: return estimator, check From cb49ad475155ed482829bb1a0278a5e19b9ca17c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 18 Apr 2020 20:14:22 -0400 Subject: [PATCH 019/125] MNT removed _safe_tags utility (#16950) --- sklearn/tests/test_docstring_parameters.py | 5 +- sklearn/utils/estimator_checks.py | 91 +++++++++----------- sklearn/utils/tests/test_estimator_checks.py | 3 +- 3 files changed, 43 insertions(+), 56 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index ca2549f2ea4c1..8ea0ec97f9fc2 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -17,7 +17,6 @@ from sklearn.utils._testing import _get_func_name from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import all_estimators -from sklearn.utils.estimator_checks import _safe_tags from sklearn.utils.estimator_checks import _enforce_estimator_tags_y from sklearn.utils.estimator_checks import _enforce_estimator_tags_x from sklearn.utils.deprecation import _is_deprecated @@ -206,9 +205,9 @@ def test_fit_docstring_attributes(name, Estimator): y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_x(est, X) - if '1dlabels' in _safe_tags(est, 'X_types'): + if '1dlabels' in est._get_tags()['X_types']: est.fit(y) - elif '2dlabels' in _safe_tags(est, 'X_types'): + elif '2dlabels' in est._get_tags()['X_types']: est.fit(np.c_[y, y]) else: est.fit(X, y) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1623902b202f3..351f24b66283e 100644 --- 
a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from ..linear_model import Ridge from ..base import (clone, ClusterMixin, is_classifier, is_regressor, - _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) + RegressorMixin, is_outlier_detector) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection @@ -58,22 +58,9 @@ BOSTON = None CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] -def _safe_tags(estimator, key=None): - # if estimator doesn't have _get_tags, use _DEFAULT_TAGS - # if estimator has tags but not key, use _DEFAULT_TAGS[key] - if hasattr(estimator, "_get_tags"): - if key is not None: - return estimator._get_tags().get(key, _DEFAULT_TAGS[key]) - tags = estimator._get_tags() - return {key: tags.get(key, _DEFAULT_TAGS[key]) - for key in _DEFAULT_TAGS.keys()} - if key is not None: - return _DEFAULT_TAGS[key] - return _DEFAULT_TAGS - def _yield_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() yield check_no_attributes_set_in_init yield check_estimators_dtypes yield check_fit_score_takes_y @@ -116,7 +103,7 @@ def _yield_checks(name, estimator): def _yield_classifier_checks(name, classifier): - tags = _safe_tags(classifier) + tags = classifier._get_tags() # test classifiers can handle non-array data and pandas objects yield check_classifier_data_not_an_array @@ -171,7 +158,7 @@ def check_supervised_y_no_nan(name, estimator_orig): def _yield_regressor_checks(name, regressor): - tags = _safe_tags(regressor) + tags = regressor._get_tags() # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -198,12 +185,12 @@ def _yield_regressor_checks(name, regressor): def _yield_transformer_checks(name, transformer): # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message - if not _safe_tags(transformer, "no_validation"): + if not transformer._get_tags()["no_validation"]: yield check_transformer_data_not_an_array # these don't actually fit the data, so don't raise errors yield check_transformer_general yield partial(check_transformer_general, readonly_memmap=True) - if not _safe_tags(transformer, "stateless"): + if not transformer._get_tags()["stateless"]: yield check_transformers_unfitted # Dependent on external solvers and hence accessing the iter # param is non-trivial. 
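As a quick illustration of the pattern used throughout this patch (not part of the diff): tags are now read straight from the estimator's own `_get_tags()`, which merges any `_more_tags()` overrides with the package defaults, so the `_safe_tags` fallback is no longer needed:

    from sklearn.linear_model import LogisticRegression

    tags = LogisticRegression()._get_tags()
    assert tags['requires_fit'] is True      # default tag, not overridden
    assert '2darray' in tags['X_types']      # default input type
    assert tags['_xfail_checks'] is False    # no known failing checks declared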
@@ -237,12 +224,12 @@ def _yield_outliers_checks(name, estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if _safe_tags(estimator, "requires_fit"): + if estimator._get_tags()["requires_fit"]: yield check_estimators_unfitted def _yield_all_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() if "2darray" not in tags["X_types"]: warnings.warn("Can't test estimator {} which requires input " " of type {}".format(name, tags["X_types"]), @@ -369,7 +356,7 @@ def _mark_xfail_checks(estimator, check, pytest): except Exception: return estimator, check - xfail_checks = _safe_tags(estimator, '_xfail_checks') or {} + xfail_checks = estimator._get_tags()['_xfail_checks'] or {} check_name = _set_check_estimator_ids(check) if check_name not in xfail_checks: @@ -701,7 +688,7 @@ def check_estimator_sparse_data(name, estimator_orig): X[X < .8] = 0 X = _pairwise_estimator_convert_X(X, estimator_orig) X_csr = sparse.csr_matrix(X) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (2 * rng.rand(40)).astype(np.int) else: @@ -767,7 +754,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = pd.DataFrame(y) try: estimator.fit(X, y, sample_weight=weights) @@ -792,7 +779,7 @@ def check_sample_weights_not_an_array(name, estimator_orig): X = _NotAnArray(pairwise_estimator_convert_X(X, estimator_orig)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -806,8 +793,8 @@ def check_sample_weights_list(name, estimator_orig): rnd = np.random.RandomState(0) n_samples = 30 X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), - estimator_orig) - if _safe_tags(estimator, 'binary_only'): + estimator_orig) + if estimator._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -886,7 +873,7 @@ def check_dtype_object(name, estimator_orig): rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (X[:, 0] * 2).astype(np.int) else: @@ -990,7 +977,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if _safe_tags(estimator, 'binary_only'): + if estimator._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator, y) @@ -1041,7 +1028,7 @@ def check_fit2d_predict1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) @@ -1092,7 +1079,7 @@ def check_methods_subset_invariance(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if 
_safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1193,7 +1180,7 @@ def check_fit1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20)) y = X.astype(np.int) estimator = clone(estimator_orig) - tags = _safe_tags(estimator) + tags = estimator._get_tags() if tags["no_validation"]: # FIXME this is a bit loose return @@ -1285,7 +1272,7 @@ def _check_transformer(name, transformer_orig, X, y): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if _safe_tags(transformer_orig, 'non_deterministic'): + if transformer_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1316,7 +1303,7 @@ def _check_transformer(name, transformer_orig, X, y): # raises error on malformed input for transform if hasattr(X, 'shape') and \ - not _safe_tags(transformer, "stateless") and \ + not transformer._get_tags()["stateless"] and \ X.ndim == 2 and X.shape[1] > 1: # If it's not an array, it does not have a 'T' property @@ -1330,7 +1317,7 @@ def _check_transformer(name, transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if _safe_tags(estimator_orig, 'non_deterministic'): + if estimator_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) @@ -1365,7 +1352,7 @@ def check_fit_score_takes_y(name, estimator_orig): n_samples = 30 X = rnd.uniform(size=(n_samples, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -1398,7 +1385,7 @@ def check_estimators_dtypes(name, estimator_orig): X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) y = X_train_int_64[:, 0] - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator_orig, y) @@ -1534,7 +1521,7 @@ def check_estimators_pickle(name, estimator_orig): X -= X.min() X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() # include NaN values when the estimator should deal with them if tags['allow_nan']: # set randomly 10 elements to np.nan @@ -1599,7 +1586,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator): n_samples, n_labels, n_classes = 42, 5, 3 - tags = _safe_tags(estimator) + tags = estimator._get_tags() estimator = clone(estimator) X, y = make_multilabel_classification(random_state=42, n_samples=n_samples, @@ -1706,7 +1693,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if _safe_tags(clusterer, 'non_deterministic'): + if clusterer._get_tags()['non_deterministic']: return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -1805,7 +1792,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] - tags = _safe_tags(classifier_orig) + tags = classifier_orig._get_tags() if not 
tags['binary_only']: problems.append((X_m, y_m)) @@ -2044,7 +2031,7 @@ def check_classifiers_multilabel_representation_invariance(name, def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False): """Check if self is returned when calling fit""" - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2081,7 +2068,7 @@ def check_estimators_unfitted(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig): - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['multioutput_only']: # These only work on 2d, so this test makes no sense return @@ -2197,7 +2184,7 @@ def check_classifiers_classes(name, classifier_orig): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if not _safe_tags(classifier_orig, 'binary_only'): + if not classifier_orig._get_tags()['binary_only']: problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -2282,7 +2269,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not _safe_tags(regressor, "poor_score"): + if not regressor._get_tags()["poor_score"]: assert regressor.score(X, y_) > 0.5 @@ -2315,7 +2302,7 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if _safe_tags(classifier_orig, 'binary_only'): + if classifier_orig._get_tags()['binary_only']: problems = [2] else: problems = [2, 3] @@ -2418,7 +2405,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier): @ignore_warnings(category=FutureWarning) def check_estimators_overwrite_params(name, estimator_orig): - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2654,13 +2641,13 @@ def enforce_estimator_tags_y(estimator, y): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - if _safe_tags(estimator, "requires_positive_y"): + if estimator._get_tags()["requires_positive_y"]: # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. 
- if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: return np.reshape(y, (-1, 1)) return y @@ -2672,11 +2659,11 @@ def _enforce_estimator_tags_x(estimator, X): X = X.dot(X.T) # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if '1darray' in _safe_tags(estimator, 'X_types'): + if '1darray' in estimator._get_tags()['X_types']: X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if _safe_tags(estimator, 'requires_positive_X'): + if estimator._get_tags()['requires_positive_X']: X -= X.min() return X @@ -2814,7 +2801,7 @@ def check_classifiers_regression_target(name, estimator_orig): X, y = load_boston(return_X_y=True) e = clone(estimator_orig) msg = 'Unknown label type: ' - if not _safe_tags(e, "no_validation"): + if not e._get_tags()["no_validation"]: assert_raises_regex(ValueError, msg, e.fit, X, y) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a7f4911791467..a755daa842ef5 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -363,7 +363,8 @@ def test_check_estimator(): # check that we have a set_params and can clone msg = "it does not implement a 'get_params' method" assert_raises_regex(TypeError, msg, check_estimator, object) - assert_raises_regex(TypeError, msg, check_estimator, object()) + msg = "object has no attribute '_get_tags'" + assert_raises_regex(AttributeError, msg, check_estimator, object()) # check that values returned by get_params match set_params msg = "get_params result does not match what was passed to set_params" assert_raises_regex(AssertionError, msg, check_estimator, From 5dfca463ac7aef27aec9d3588e0a903e33693119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Gelder?= Date: Sun, 19 Apr 2020 16:21:19 +0200 Subject: [PATCH 020/125] =?UTF-8?q?DOC=20DataConversionWarning:=20Add=20ex?= =?UTF-8?q?ample=20to=20doc=20of=20DataCon=E2=80=A6=20(#16704)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sklearn/exceptions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index ea34365afa703..0083632418c8b 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -85,6 +85,24 @@ class DataConversionWarning(UserWarning): implementation's data-type expectations; - passes an input whose shape can be interpreted ambiguously. + Examples + -------- + >>> from sklearn.utils import validation + >>> Y = [[1],[2],[3]] + >>> import warnings + >>> from sklearn.exceptions import DataConversionWarning + >>> warnings.simplefilter('always', DataConversionWarning) + >>> with warnings.catch_warnings(record=True) as w: + ... try: + ... # will trigger warning as Y is a column-vector + ... Y = validation.column_or_1d(Y,warn=True) + ... except ValueError: + ... pass + ... print(repr(w[-1].message)) + DataConversionWarning('A column-vector y was passed when a + 1d array was expected. Please change the shape of y to + (n_samples, ), for example using ravel().') + .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
""" From a0e6b95540a8ddc7778f90fd60721f4d9fda85cb Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 20 Apr 2020 00:53:29 +1000 Subject: [PATCH 021/125] MNT add pip-wheel-metadata to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9b158da07a2ec..b8ee8d20322c3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ doc/samples *.prof .tox/ .coverage +pip-wheel-metadata lfw_preprocessed/ nips2010_pdf/ From 670b85c9e9cec05210e8596bc1fb9ca66787162f Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Sun, 19 Apr 2020 16:24:20 +0100 Subject: [PATCH 022/125] ENH ColumnTransformer.get_feature_names() handles passthrough (#14048) --- doc/whats_new/v0.23.rst | 5 ++ sklearn/compose/_column_transformer.py | 29 ++++--- .../compose/tests/test_column_transformer.py | 85 ++++++++++++++++--- 3 files changed, 97 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index c4fa3818aa1bc..9343f1ee46da9 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -105,6 +105,11 @@ Changelog a column name that is not unique in the dataframe. :pr:`16431` by `Thomas Fan`_. +- |Enhancement| :class:`compose.ColumnTransformer` method ``get_feature_names`` + now supports `'passthrough'` columns, with the feature name being either + the column name for a dataframe, or `'xi'` for column index `i`. + :pr:`14048` by :user:`Lewis Ball `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 903c63a00fd22..2ef8876b0c4e7 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -315,19 +315,18 @@ def _validate_remainder(self, X): self.remainder) # Make it possible to check for reordered named columns on transform - if (hasattr(X, 'columns') and - any(_determine_key_type(cols) == 'str' - for cols in self._columns)): + self._has_str_cols = any(_determine_key_type(cols) == 'str' + for cols in self._columns) + if hasattr(X, 'columns'): self._df_columns = X.columns self._n_features = X.shape[1] cols = [] for columns in self._columns: cols.extend(_get_column_indices(X, columns)) - remaining_idx = list(set(range(self._n_features)) - set(cols)) - remaining_idx = sorted(remaining_idx) or None - self._remainder = ('remainder', self.remainder, remaining_idx) + remaining_idx = sorted(set(range(self._n_features)) - set(cols)) + self._remainder = ('remainder', self.remainder, remaining_idx or None) @property def named_transformers_(self): @@ -356,11 +355,18 @@ def get_feature_names(self): if trans == 'drop' or ( hasattr(column, '__len__') and not len(column)): continue - elif trans == 'passthrough': - raise NotImplementedError( - "get_feature_names is not yet supported when using " - "a 'passthrough' transformer.") - elif not hasattr(trans, 'get_feature_names'): + if trans == 'passthrough': + if hasattr(self, '_df_columns'): + if ((not isinstance(column, slice)) + and all(isinstance(col, str) for col in column)): + feature_names.extend(column) + else: + feature_names.extend(self._df_columns[column]) + else: + indices = np.arange(self._n_features) + feature_names.extend(['x%d' % i for i in indices[column]]) + continue + if not hasattr(trans, 'get_feature_names'): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." % (str(name), type(trans).__name__)) @@ -582,6 +588,7 @@ def transform(self, X): # name order and count. 
See #14237 for details. if (self._remainder[2] is not None and hasattr(self, '_df_columns') and + self._has_str_cols and hasattr(X, 'columns')): n_cols_fit = len(self._df_columns) n_cols_transform = len(X.columns) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ca1c185c91e06..a9f1764eb97e4 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -668,25 +668,88 @@ def test_column_transformer_get_feature_names(): ct.fit(X) assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] - # passthrough transformers not supported + # drop transformer + ct = ColumnTransformer( + [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct.fit(X) + assert ct.get_feature_names() == ['col0__a', 'col0__b'] + + # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['x0', 'x1'] ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] - # drop transformer - ct = ColumnTransformer( - [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', lambda x: [1])], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + +def test_column_transformer_get_feature_names_dataframe(): + # passthough transformer with a dataframe + pd = pytest.importorskip('pandas') + X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], + [{'c': 5}, {'c': 6}]], dtype=object).T + X_df = pd.DataFrame(X, columns=['col0', 'col1']) + + ct = ColumnTransformer([('trans', 'passthrough', ['col0', 'col1'])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('col0', DictVectorizer(), 0)], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', ['col1'])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', + lambda x: x[['col1']].columns)], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 
'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] def test_column_transformer_special_strings(): From 4d9478f433b8760018fcab16498f1bf4b83bf187 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 19 Apr 2020 16:55:50 -0400 Subject: [PATCH 023/125] DOC User Guide and docs for LDA and QDA (#16243) --- doc/modules/lda_qda.rst | 179 ++++++++++++++++++++----------- sklearn/discriminant_analysis.py | 141 +++++++++++++++--------- 2 files changed, 207 insertions(+), 113 deletions(-) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index e1dfb0c03ea4b..c3ac94dedefa9 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -7,9 +7,9 @@ Linear and Quadratic Discriminant Analysis .. currentmodule:: sklearn Linear Discriminant Analysis -(:class:`discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic +(:class:`~discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic Discriminant Analysis -(:class:`discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic +(:class:`~discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic classifiers, with, as their names suggest, a linear and a quadratic decision surface, respectively. @@ -37,68 +37,59 @@ flexible. Dimensionality reduction using Linear Discriminant Analysis =========================================================== -:class:`discriminant_analysis.LinearDiscriminantAnalysis` can be used to +:class:`~discriminant_analysis.LinearDiscriminantAnalysis` can be used to perform supervised dimensionality reduction, by projecting the input data to a linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of -classes, so this is, in general, a rather strong dimensionality reduction, and +classes, so this is in general a rather strong dimensionality reduction, and only makes sense in a multiclass setting. -This is implemented in -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired -dimensionality can be set using the ``n_components`` constructor parameter. -This parameter has no influence on -:func:`discriminant_analysis.LinearDiscriminantAnalysis.fit` or -:func:`discriminant_analysis.LinearDiscriminantAnalysis.predict`. +This is implemented in the `transform` method. The desired dimensionality can +be set using the ``n_components`` parameter. This parameter has no influence +on the `fit` and `predict` methods. .. topic:: Examples: :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and PCA for dimensionality reduction of the Iris dataset +.. _lda_qda_math: + Mathematical formulation of the LDA and QDA classifiers ======================================================= Both LDA and QDA can be derived from simple probabilistic models which model the class conditional distribution of the data :math:`P(X|y=k)` for each class -:math:`k`. Predictions can then be obtained by using Bayes' rule: +:math:`k`. Predictions can then be obtained by using Bayes' rule, for each +training sample :math:`x \in \mathcal{R}^d`: .. 
math:: - P(y=k | X) = \frac{P(X | y=k) P(y=k)}{P(X)} = \frac{P(X | y=k) P(y = k)}{ \sum_{l} P(X | y=l) \cdot P(y=l)} + P(y=k | x) = \frac{P(x | y=k) P(y=k)}{P(x)} = \frac{P(x | y=k) P(y = k)}{ \sum_{l} P(x | y=l) \cdot P(y=l)} -and we select the class :math:`k` which maximizes this conditional probability. +and we select the class :math:`k` which maximizes this posterior probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with +:math:`P(x|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(x | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k)\right) where :math:`d` is the number of features. -To use this model as a classifier, we just need to estimate from the training -data the class priors :math:`P(y=k)` (by the proportion of instances of class -:math:`k`), the class means :math:`\mu_k` (by the empirical sample class means) -and the covariance matrices (either by the empirical sample class covariance -matrices, or by a regularized estimator: see the section on shrinkage below). +QDA +--- -In the case of LDA, the Gaussians for each class are assumed to share the same -covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to -linear decision surfaces, which can be seen by comparing the -log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: +According to the model above, the log of the posterior is: .. math:: - \log\left(\frac{P(y=k|X)}{P(y=l|X)}\right)= - \log\left(\frac{P(X|y=k)P(y=k)}{P(X|y=l)P(y=l)}\right)=0 \Leftrightarrow - (\mu_k-\mu_l)^t\Sigma^{-1} X = - \frac{1}{2} (\mu_k^t \Sigma^{-1} \mu_k - \mu_l^t \Sigma^{-1} \mu_l) - - \log\frac{P(y=k)}{P(y=l)} + \log P(y=k | x) &= \log P(x | y=k) + \log P(y = k) + Cst \\ + &= -\frac{1}{2} \log |\Sigma_k| -\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k) + \log P(y = k) + Cst, -In the case of QDA, there are no assumptions on the covariance matrices -:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. See -[#1]_ for more details. +where the constant term :math:`Cst` corresponds to the denominator +:math:`P(x)`, in addition to other constant terms from the Gaussian. The +predicted class is the one that maximises this log-posterior. .. note:: **Relation with Gaussian Naive Bayes** @@ -107,22 +98,60 @@ In the case of QDA, there are no assumptions on the covariance matrices and the resulting classifier is equivalent to the Gaussian Naive Bayes classifier :class:`naive_bayes.GaussianNB`. -Mathematical formulation of LDA dimensionality reduction -======================================================== +LDA +--- + +LDA is a special case of QDA, where the Gaussians for each class are assumed +to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all +:math:`k`. This reduces the log posterior to: -To understand the use of LDA in dimensionality reduction, it is useful to start -with a geometric reformulation of the LDA classification rule explained above. -We write :math:`K` for the total number of target classes. Since in LDA we -assume that all classes have the same estimated covariance :math:`\Sigma`, we -can rescale the data so that this covariance is the identity: +.. math:: \log P(y=k | x) = -\frac{1}{2} (x-\mu_k)^t \Sigma^{-1} (x-\mu_k) + \log P(y = k) + Cst. 
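A small numeric sketch (not part of the patch, using the iris data purely as an example) of the rule just stated: for both estimators the decision values act as the per-class log-posteriors up to a constant, so the predicted class is simply the one with the largest decision value:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                                QuadraticDiscriminantAnalysis)

    X, y = load_iris(return_X_y=True)
    for est in (LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()):
        est.fit(X, y)
        scores = est.decision_function(X)  # shape (n_samples, n_classes)
        # predict() returns the class maximising the (log-)posterior
        assert np.array_equal(est.classes_[scores.argmax(axis=1)],
                              est.predict(X))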
+ +The term :math:`(x-\mu_k)^t \Sigma^{-1} (x-\mu_k)` corresponds to the +`Mahalanobis Distance `_ +between the sample :math:`x` and the mean :math:`\mu_k`. The Mahalanobis +distance tells how close :math:`x` is from :math:`\mu_k`, while also +accounting for the variance of each feature. We can thus interpret LDA as +assigning :math:`x` to the class whose mean is the closest in terms of +Mahalanobis distance, while also accounting for the class prior +probabilities. + +The log-posterior of LDA can also be written [3]_ as: + +.. math:: -.. math:: X^* = D^{-1/2}U^t X\text{ with }\Sigma = UDU^t + \log P(y=k | x) = \omega_k^t x + \omega_{k0} + Cst. -Then one can show that to classify a data point after scaling is equivalent to -finding the estimated class mean :math:`\mu^*_k` which is closest to the data -point in the Euclidean distance. But this can be done just as well after -projecting on the :math:`K-1` affine subspace :math:`H_K` generated by all the -:math:`\mu^*_k` for all classes. This shows that, implicit in the LDA +where :math:`\omega_k = \Sigma^{-1} \mu_k` and :math:`\omega_{k0} = +-\frac{1}{2} \mu_k^t\Sigma^{-1}\mu_k + \log P (y = k)`. These quantities +correspond to the `coef_` and `intercept_` attributes, respectively. + +From the above formula, it is clear that LDA has a linear decision surface. +In the case of QDA, there are no assumptions on the covariance matrices +:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. +See [1]_ for more details. + +Mathematical formulation of LDA dimensionality reduction +======================================================== + +First note that the K means :math:`\mu_k` are vectors in +:math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of +dimension at least :math:`K - 1` (2 points lie on a line, 3 points lie on a +plane, etc). + +As mentioned above, we can interpret LDA as assigning :math:`x` to the class +whose mean :math:`\mu_k` is the closest in terms of Mahalanobis distance, +while also accounting for the class prior probabilities. Alternatively, LDA +is equivalent to first *sphering* the data so that the covariance matrix is +the identity, and then assigning :math:`x` to the closest mean in terms of +Euclidean distance (still accounting for the class priors). + +Computing Euclidean distances in this d-dimensional space is equivalent to +first projecting the data points into :math:`H`, and computing the distances +there (since the other dimensions will contribute equally to each class in +terms of distance). In other words, if :math:`x` is closest to :math:`\mu_k` +in the original space, it will also be the case in :math:`H`. +This shows that, implicit in the LDA classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. @@ -131,19 +160,22 @@ onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). This :math:`L` corresponds to the ``n_components`` parameter used in the -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See -[#1]_ for more details. +:func:`~discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See +[1]_ for more details. Shrinkage ========= -Shrinkage is a tool to improve estimation of covariance matrices in situations -where the number of training samples is small compared to the number of -features. 
In this scenario, the empirical sample covariance is a poor -estimator. Shrinkage LDA can be used by setting the ``shrinkage`` parameter of -the :class:`discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. +Shrinkage is a form of regularization used to improve the estimation of +covariance matrices in situations where the number of training samples is +small compared to the number of features. +In this scenario, the empirical sample covariance is a poor +estimator, and shrinkage helps improving the generalization performance of +the classifier. +Shrinkage LDA can be used by setting the ``shrinkage`` parameter of +the :class:`~discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. This automatically determines the optimal shrinkage parameter in an analytic -way following the lemma introduced by Ledoit and Wolf [#2]_. Note that +way following the lemma introduced by Ledoit and Wolf [2]_. Note that currently shrinkage only works when setting the ``solver`` parameter to 'lsqr' or 'eigen'. @@ -165,13 +197,33 @@ matrix. Estimation algorithms ===================== -The default solver is 'svd'. It can perform both classification and transform, -and it does not rely on the calculation of the covariance matrix. This can be -an advantage in situations where the number of features is large. However, the -'svd' solver cannot be used with shrinkage. - -The 'lsqr' solver is an efficient algorithm that only works for classification. -It supports shrinkage. +Using LDA and QDA requires computing the log-posterior which depends on the +class priors :math:`P(y=k)`, the class means :math:`\mu_k`, and the +covariance matrices. + +The 'svd' solver is the default solver used for +:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`, and it is +the only available solver for +:class:`~sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis`. +It can perform both classification and transform (for LDA). +As it does not rely on the calculation of the covariance matrix, the 'svd' +solver may be preferable in situations where the number of features is large. +The 'svd' solver cannot be used with shrinkage. +For QDA, the use of the SVD solver relies on the fact that the covariance +matrix :math:`\Sigma_k` is, by definition, equal to :math:`\frac{1}{n - 1} +X_k^tX_k = V S^2 V^t` where :math:`V` comes from the SVD of the (centered) +matrix: :math:`X_k = U S V^t`. It turns out that we can compute the +log-posterior above without having to explictly compute :math:`\Sigma`: +computing :math:`S` and :math:`V` via the SVD of :math:`X` is enough. For +LDA, two SVDs are computed: the SVD of the centered input matrix :math:`X` +and the SVD of the class-wise mean vectors. + +The 'lsqr' solver is an efficient algorithm that only works for +classification. It needs to explicitly compute the covariance matrix +:math:`\Sigma`, and supports shrinkage. This solver computes the coefficients +:math:`\omega_k = \Sigma^{-1}\mu_k` by solving for :math:`\Sigma \omega = +\mu_k`, thus avoiding the explicit computation of the inverse +:math:`\Sigma^{-1}`. The 'eigen' solver is based on the optimization of the between class scatter to within class scatter ratio. It can be used for both classification and @@ -186,8 +238,11 @@ a high number of features. .. topic:: References: - .. [#1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., + .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., Section 4.3, p.106-119, 2008. - .. [#2] Ledoit O, Wolf M. 
Honey, I Shrunk the Sample Covariance Matrix. + .. [2] Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. The Journal of Portfolio Management 30(4), 110-119, 2004. + + .. [3] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification + (Second Edition), section 2.6.2. diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 45e623904b9ea..b07570a3f0a75 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -94,7 +94,9 @@ def _class_means(X, y): def _class_cov(X, y, priors, shrinkage=None): - """Compute class covariance matrix. + """Compute weighted within-class covariance matrix. + + The per-class covariance are weighted by the class priors. Parameters ---------- @@ -116,7 +118,7 @@ def _class_cov(X, y, priors, shrinkage=None): Returns ------- cov : array-like of shape (n_features, n_features) - Class covariance matrix. + Weighted within-class covariance matrix """ classes = np.unique(y) cov = np.zeros(shape=(X.shape[1], X.shape[1])) @@ -137,7 +139,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, share the same covariance matrix. The fitted model can also be used to reduce the dimensionality of the input - by projecting it to the most discriminative directions. + by projecting it to the most discriminative directions, using the + `transform` method. .. versionadded:: 0.17 *LinearDiscriminantAnalysis*. @@ -163,21 +166,27 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Note that shrinkage works only with 'lsqr' and 'eigen' solvers. priors : array-like of shape (n_classes,), default=None - Class priors. + The class prior probabilities. By default, the class proportions are + inferred from the training data. n_components : int, default=None Number of components (<= min(n_classes - 1, n_features)) for dimensionality reduction. If None, will be set to - min(n_classes - 1, n_features). + min(n_classes - 1, n_features). This parameter only affects the + `transform` method. store_covariance : bool, default=False - Additionally compute class covariance matrix (default False), used - only in 'svd' solver. + If True, explicitely compute the weighted within-class covariance + matrix when solver is 'svd'. The matrix is always computed + and stored for the other solvers. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation in SVD solver. + Absolute threshold for a singular value of X to be considered + significant, used to estimate the rank of X. Dimensions whose + singular values are non-significant are discarded. Only used if + solver is 'svd'. .. versionadded:: 0.17 @@ -190,8 +199,11 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Intercept term. covariance_ : array-like of shape (n_features, n_features) - Covariance matrix (shared by all classes). Only available - `store_covariance` is True. + Weighted within-class covariance matrix. It corresponds to + `sum_k prior_k * C_k` where `C_k` is the covariance matrix of the + samples in class `k`. The `C_k` are estimated using the (potentially + shrunk) biased estimator of covariance. If solver is 'svd', only + exists when `store_covariance` is True. explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. @@ -200,16 +212,17 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, or svd solver is used. means_ : array-like of shape (n_classes, n_features) - Class means. 
+ Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). scalings_ : array-like of shape (rank, n_classes - 1) Scaling of the features in the space spanned by the class centroids. + Only available for 'svd' and 'eigen' solvers. xbar_ : array-like of shape (n_features,) - Overall mean. + Overall mean. Only present if solver is 'svd'. classes_ : array-like of shape (n_classes,) Unique class labels. @@ -219,22 +232,6 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis: Quadratic Discriminant Analysis - Notes - ----- - The default solver is 'svd'. It can perform both classification and - transform, and it does not rely on the calculation of the covariance - matrix. This can be an advantage in situations where the number of features - is large. However, the 'svd' solver cannot be used with shrinkage. - - The 'lsqr' solver is an efficient algorithm that only works for - classification. It supports shrinkage. - - The 'eigen' solver is based on the optimization of the between class - scatter to within class scatter ratio. It can be used for both - classification and transform, and it supports shrinkage. However, the - 'eigen' solver needs to compute the covariance matrix, so it might not be - suitable for situations with a high number of features. - Examples -------- >>> import numpy as np @@ -542,6 +539,29 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) + def decision_function(self, X): + """Apply decision function to an array of samples. + + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Array of samples (test vectors). + + Returns + ------- + C : ndarray of shape (n_samples,) or (n_samples, n_classes) + Decision function values related to each class, per sample. + In the two-class case, the shape is (n_samples,), giving the + log likelihood ratio of the positive class. + """ + # Only override for the doc + return super().decision_function(X) + class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis @@ -560,47 +580,60 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Parameters ---------- priors : ndarray of shape (n_classes,), default=None - Priors on classes + Class priors. By default, the class proportions are inferred from the + training data. reg_param : float, default=0.0 - Regularizes the covariance estimate as - ``(1-reg_param)*Sigma + reg_param*np.eye(n_features)`` + Regularizes the per-class covariance estimates by transforming S2 as + ``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``, + where S2 corresponds to the `scaling_` attribute of a given class. store_covariance : bool, default=False - If True the covariance matrices are computed and stored in the - `self.covariance_` attribute. + If True, the class covariance matrices are explicitely computed and + stored in the `self.covariance_` attribute. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation. + Absolute threshold for a singular value to be considered significant, + used to estimate the rank of `Xk` where `Xk` is the centered matrix + of samples in class k. 
This parameter does not affect the + predictions. It only controls a warning that is raised when features + are considered to be colinear. .. versionadded:: 0.17 Attributes ---------- - covariance_ : list of array-like of shape (n_features, n_features) - Covariance matrices of each class. Only available + covariance_ : list of len n_classes of ndarray \ + of shape (n_features, n_features) + For each class, gives the covariance matrix estimated using the + samples of that class. The estimations are unbiased. Only present if `store_covariance` is True. means_ : array-like of shape (n_classes, n_features) - Class means. + Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). - rotations_ : list of ndarrays - For each class k an array of shape (n_features, n_k), with + rotations_ : list of len n_classes of ndarray of shape (n_features, n_k) + For each class k an array of shape (n_features, n_k), where ``n_k = min(n_features, number of elements in class k)`` It is the rotation of the Gaussian distribution, i.e. its - principal axis. - - scalings_ : list of ndarrays - For each class k an array of shape (n_k,). It contains the scaling - of the Gaussian distributions along its principal axes, i.e. the - variance in the rotated coordinate system. - - classes_ : array-like of shape (n_classes,) + principal axis. It corresponds to `V`, the matrix of eigenvectors + coming from the SVD of `Xk = U S Vt` where `Xk` is the centered + matrix of samples from class k. + + scalings_ : list of len n_classes of ndarray of shape (n_k,) + For each class, contains the scaling of + the Gaussian distributions along its principal axes, i.e. the + variance in the rotated coordinate system. It corresponds to `S^2 / + (n_samples - 1)`, where `S` is the diagonal matrix of singular values + from the SVD of `Xk`, where `Xk` is the centered matrix of samples + from class k. + + classes_ : ndarray of shape (n_classes,) Unique class labels. Examples @@ -676,7 +709,7 @@ def fit(self, X, y): 'is ill defined.' % str(self.classes_[ind])) Xgc = Xg - meang # Xgc = U * S * V.T - U, S, Vt = np.linalg.svd(Xgc, full_matrices=False) + _, S, Vt = np.linalg.svd(Xgc, full_matrices=False) rank = np.sum(S > self.tol) if rank < n_features: warnings.warn("Variables are collinear") @@ -695,6 +728,7 @@ def fit(self, X, y): return self def _decision_function(self, X): + # return log posterior, see eq (4.12) p. 110 of the ESL. check_is_fitted(self) X = check_array(X) @@ -704,7 +738,7 @@ def _decision_function(self, X): S = self.scalings_[i] Xm = X - self.means_[i] X2 = np.dot(Xm, R * (S ** (-0.5))) - norm2.append(np.sum(X2 ** 2, 1)) + norm2.append(np.sum(X2 ** 2, axis=1)) norm2 = np.array(norm2).T # shape = [len(X), n_classes] u = np.asarray([np.sum(np.log(s)) for s in self.scalings_]) return (-0.5 * (norm2 + u) + np.log(self.priors_)) @@ -712,6 +746,11 @@ def _decision_function(self, X): def decision_function(self, X): """Apply decision function to an array of samples. + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -721,7 +760,7 @@ def decision_function(self, X): ------- C : ndarray of shape (n_samples,) or (n_samples, n_classes) Decision function values related to each class, per sample. 
- In the two-class case, the shape is [n_samples,], giving the + In the two-class case, the shape is (n_samples,), giving the log likelihood ratio of the positive class. """ dec_func = self._decision_function(X) @@ -768,7 +807,7 @@ def predict_proba(self, X): return likelihood / likelihood.sum(axis=1)[:, np.newaxis] def predict_log_proba(self, X): - """Return posterior probabilities of classification. + """Return log of posterior probabilities of classification. Parameters ---------- From dc0cc6e9bec5a9fac6358384ba7a827adc62d87c Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Sun, 19 Apr 2020 22:58:47 +0200 Subject: [PATCH 024/125] [MRG] API kwonly args in impute, inspection, kernel_ridge (#16926) , and linear_model --- sklearn/impute/_base.py | 10 ++++--- sklearn/impute/_iterative.py | 3 +-- sklearn/impute/_knn.py | 5 ++-- sklearn/impute/tests/test_impute.py | 4 +-- sklearn/inspection/_partial_dependence.py | 4 ++- sklearn/inspection/_permutation_importance.py | 4 ++- .../inspection/_plot/partial_dependence.py | 15 ++++++++--- sklearn/kernel_ridge.py | 6 +++-- sklearn/linear_model/_base.py | 5 ++-- sklearn/linear_model/_bayes.py | 9 ++++--- sklearn/linear_model/_coordinate_descent.py | 27 +++++++++++++------ sklearn/linear_model/_huber.py | 5 ++-- sklearn/linear_model/_least_angle.py | 22 ++++++++++----- sklearn/linear_model/_logistic.py | 8 +++--- sklearn/linear_model/_omp.py | 18 ++++++++----- sklearn/linear_model/_passive_aggressive.py | 7 +++-- sklearn/linear_model/_perceptron.py | 5 ++-- sklearn/linear_model/_ransac.py | 5 ++-- sklearn/linear_model/_ridge.py | 26 ++++++++++-------- sklearn/linear_model/_sag.py | 2 ++ sklearn/linear_model/_stochastic_gradient.py | 19 ++++++++----- sklearn/linear_model/_theil_sen.py | 5 ++-- sklearn/linear_model/tests/test_omp.py | 17 +++++++----- sklearn/linear_model/tests/test_ransac.py | 4 ++- 24 files changed, 151 insertions(+), 84 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 608f8f54ee162..5f1069708a20e 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -14,6 +14,7 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -67,7 +68,7 @@ class _BaseImputer(TransformerMixin, BaseEstimator): It adds automatically support for `add_indicator`. """ - def __init__(self, missing_values=np.nan, add_indicator=False): + def __init__(self, *, missing_values=np.nan, add_indicator=False): self.missing_values = missing_values self.add_indicator = add_indicator @@ -205,7 +206,8 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". 
""" - def __init__(self, missing_values=np.nan, strategy="mean", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( missing_values=missing_values, @@ -525,8 +527,8 @@ class MissingIndicator(TransformerMixin, BaseEstimator): [False, False]]) """ - - def __init__(self, missing_values=np.nan, features="missing-only", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 88eff8503d510..17a3d05507205 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -206,9 +206,8 @@ class IterativeImputer(_BaseImputer): Journal of the Royal Statistical Society 22(2): 302-306. `_ """ - def __init__(self, - estimator=None, + estimator=None, *, missing_values=np.nan, sample_posterior=False, max_iter=10, diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index f782a46a6b40d..44fccf024247e 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -14,6 +14,7 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args class KNNImputer(_BaseImputer): @@ -94,8 +95,8 @@ class KNNImputer(_BaseImputer): [5.5, 6. , 5. ], [8. , 8. , 7. ]]) """ - - def __init__(self, missing_values=np.nan, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="nan_euclidean", copy=True, add_indicator=False): super().__init__( diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 50f60ff6e96ad..58c71660b401d 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -48,14 +48,14 @@ def _check_statistics(X, X_true, assert_ae = assert_array_almost_equal # Normal matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index f0fbc23333266..f3bb10a1a3275 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -20,6 +20,7 @@ from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -181,7 +182,8 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions -def partial_dependence(estimator, X, features, response_method='auto', +@_deprecate_positional_args +def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), 
grid_resolution=100, method='auto'): """Partial dependence of ``features``. diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 8efafd8a7eef4..e8d77360a7ca0 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -7,6 +7,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, @@ -37,7 +38,8 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, return scores -def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, +@_deprecate_positional_args +def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None): """Permutation importance for feature evaluation [BRE]_. diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index f39c604cac77b..4a83ac057d91e 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -13,9 +13,11 @@ from ...utils import check_array from ...utils import check_matplotlib_support # noqa from ...utils import _safe_indexing +from ...utils.validation import _deprecate_positional_args -def plot_partial_dependence(estimator, X, features, feature_names=None, +@_deprecate_positional_args +def plot_partial_dependence(estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, fig=None, @@ -322,8 +324,12 @@ def convert_feature(fx): fig.clear() ax = fig.gca() - display = PartialDependenceDisplay(pd_results, features, feature_names, - target_idx, pdp_lim, deciles) + display = PartialDependenceDisplay(pd_results=pd_results, + features=features, + feature_names=feature_names, + target_idx=target_idx, + pdp_lim=pdp_lim, + deciles=deciles) return display.plot(ax=ax, n_cols=n_cols, line_kw=line_kw, contour_kw=contour_kw) @@ -406,7 +412,8 @@ class PartialDependenceDisplay: Figure containing partial dependence plots. 
""" - def __init__(self, pd_results, features, feature_names, target_idx, + @_deprecate_positional_args + def __init__(self, pd_results, *, features, feature_names, target_idx, pdp_lim, deciles): self.pd_results = pd_results self.features = features diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d08c706caefc4..f11501960b29c 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -10,6 +10,7 @@ from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel from .utils.validation import check_is_fitted, _check_sample_weight +from .utils.validation import _deprecate_positional_args class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): @@ -113,8 +114,9 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> clf.fit(X, y) KernelRidge(alpha=1.0) """ - def __init__(self, alpha=1, kernel="linear", gamma=None, degree=3, coef0=1, - kernel_params=None): + @_deprecate_positional_args + def __init__(self, alpha=1, *, kernel="linear", gamma=None, degree=3, + coef0=1, kernel_params=None): self.alpha = alpha self.kernel = kernel self.gamma = gamma diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 8e91767b9ff53..56e5e24761128 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ MultiOutputMixin) from ..utils import check_array from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale @@ -466,8 +467,8 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): >>> reg.predict(np.array([[3, 5]])) array([16.]) """ - - def __init__(self, fit_intercept=True, normalize=False, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None): self.fit_intercept = fit_intercept self.normalize = normalize diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 397461e73d8be..eb12a271695cd 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -14,6 +14,7 @@ from ..utils.extmath import fast_logdet from ..utils.fixes import pinvh from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -145,8 +146,8 @@ class BayesianRidge(RegressorMixin, LinearModel): M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, Journal of Machine Learning Research, Vol. 1, 2001. """ - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False): @@ -489,8 +490,8 @@ class ARDRegression(RegressorMixin, LinearModel): which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are discarded. 
""" - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, threshold_lambda=1.e+4, fit_intercept=True, normalize=False, copy_X=True, verbose=False): diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 46e924abbc1d0..5d932033dbd0d 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -24,6 +24,7 @@ from ..utils.fixes import _astype_copy_false, _joblib_parallel_args from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from . import _cd_fast as cd_fast # type: ignore @@ -690,7 +691,8 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1003,7 +1005,8 @@ class Lasso(ElasticNet): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1466,7 +1469,9 @@ class LassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'): @@ -1662,7 +1667,8 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, @@ -1801,7 +1807,8 @@ class MultiTaskElasticNet(Lasso): To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. """ - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.l1_ratio = l1_ratio @@ -1983,7 +1990,8 @@ class MultiTaskLasso(MultiTaskElasticNet): To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. 
""" - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.alpha = alpha @@ -2162,7 +2170,8 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, @@ -2333,7 +2342,9 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection='cyclic'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index d9046d3a1ee9b..77e6ff944b78d 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -9,6 +9,7 @@ from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result @@ -222,8 +223,8 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression. 
https://statweb.stanford.edu/~owen/reports/hhu.pdf """ - - def __init__(self, epsilon=1.35, max_iter=100, alpha=0.0001, + @_deprecate_positional_args + def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05): self.epsilon = epsilon self.max_iter = max_iter diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index bc71d7a1fccbd..255baacea9a59 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,11 +24,13 @@ from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning +from ..utils.validation import _deprecate_positional_args SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path(X, y, Xy=None, *, Gram=None, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -157,7 +159,8 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, return_n_iter=return_n_iter, positive=positive) -def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path_gram(Xy, Gram, *, n_samples, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -855,7 +858,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): method = 'lar' positive = False - def __init__(self, fit_intercept=True, verbose=False, normalize=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, jitter=None, random_state=None): @@ -1110,7 +1114,8 @@ class LassoLars(Lars): """ method = 'lasso' - def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, positive=False, jitter=None, random_state=None): @@ -1367,7 +1372,8 @@ class LarsCV(Lars): method = 'lar' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True): @@ -1608,7 +1614,8 @@ class LassoLarsCV(LarsCV): method = 'lasso' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True, positive=False): @@ -1741,7 +1748,8 @@ class LassoLarsIC(LassoLars): -------- lars_path, LassoLars, LassoLarsCV """ - def __init__(self, criterion='aic', fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, positive=False): self.criterion = criterion diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 
874dc743f4cc2..9ef3a21e4a76d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,6 +29,7 @@ from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..model_selection import check_cv @@ -1246,8 +1247,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - - def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, @@ -1737,7 +1738,8 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, LogisticRegression """ - def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, + @_deprecate_positional_args + def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1., multi_class='auto', diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 0d572dd17c6d7..44371e9fa76e7 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -16,6 +16,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array +from ..utils.validation import _deprecate_positional_args from ..model_selection import check_cv premature = """ Orthogonal matching pursuit ended prematurely due to linear @@ -262,7 +263,8 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, return gamma, indices[:n_active], n_active -def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, +@_deprecate_positional_args +def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False): r"""Orthogonal Matching Pursuit (OMP) @@ -371,7 +373,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, norms_squared = np.sum((y ** 2), axis=0) else: norms_squared = None - return orthogonal_mp_gram(G, Xy, n_nonzero_coefs, tol, norms_squared, + return orthogonal_mp_gram(G, Xy, n_nonzero_coefs=n_nonzero_coefs, + tol=tol, norms_squared=norms_squared, copy_Gram=copy_X, copy_Xy=False, return_path=return_path) @@ -404,7 +407,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, return np.squeeze(coef) -def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, +@_deprecate_positional_args +def orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, return_n_iter=False): @@ -616,7 +620,8 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): decomposition.sparse_encode OrthogonalMatchingPursuitCV """ - def __init__(self, n_nonzero_coefs=None, tol=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto'): self.n_nonzero_coefs = n_nonzero_coefs self.tol = tol @@ -660,7 +665,7 @@ def 
fit(self, X, y): if Gram is False: coef_, self.n_iter_ = orthogonal_mp( - X, y, self.n_nonzero_coefs_, self.tol, + X, y, n_nonzero_coefs=self.n_nonzero_coefs_, tol=self.tol, precompute=False, copy_X=True, return_n_iter=True) else: @@ -853,7 +858,8 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): decomposition.sparse_encode """ - def __init__(self, copy=True, fit_intercept=True, normalize=True, + @_deprecate_positional_args + def __init__(self, *, copy=True, fit_intercept=True, normalize=True, max_iter=None, cv=None, n_jobs=None, verbose=False): self.copy = copy self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 3b8354f5a7352..22c47fb1fcf07 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -1,6 +1,7 @@ # Authors: Rob Zinkov, Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON @@ -163,7 +164,8 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge", n_jobs=None, random_state=None, warm_start=False, @@ -390,7 +392,8 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="epsilon_insensitive", epsilon=DEFAULT_EPSILON, diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index ff50f6ebbc06e..54d7888109702 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -1,6 +1,7 @@ # Author: Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier @@ -143,8 +144,8 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. 
""" - - def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, penalty=None, alpha=0.0001, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index cd5e3db49842d..86ceb0d5e311f 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -12,6 +12,7 @@ from ..utils import check_random_state, check_array, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ._base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -201,8 +202,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf """ - - def __init__(self, base_estimator=None, min_samples=None, + @_deprecate_positional_args + def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9c3f703ac478e..309137bed2b5d 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -27,6 +27,7 @@ from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring @@ -234,7 +235,8 @@ def _get_valid_accept_sparse(is_X_sparse, solver): return ['csr', 'csc', 'coo'] -def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', +@_deprecate_positional_args +def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, check_input=True): @@ -518,7 +520,8 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): self.alpha = alpha @@ -727,8 +730,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf.fit(X, y) Ridge() """ - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): super().__init__( @@ -885,8 +888,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): >>> clf.score(X, y) 0.9595... 
""" - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", random_state=None): super().__init__( @@ -1112,8 +1115,8 @@ class _RidgeGCV(LinearModel): http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, @@ -1546,7 +1549,8 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False): @@ -1854,8 +1858,8 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, class_weight=None, store_cv_values=False): super().__init__( diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index 9fe6f076f5145..caa9b2d133003 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -13,6 +13,7 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms @@ -84,6 +85,7 @@ def get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, return step +@_deprecate_positional_args def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index bf1e77e3e355b..3bedd8a26674b 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -19,6 +19,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -68,8 +69,8 @@ def __call__(self, coef, intercept): class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): """Base class for SGD classification and regression.""" - - def __init__(self, loss, penalty='l2', alpha=0.0001, C=1.0, + @_deprecate_positional_args + def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate="optimal", eta0=0.0, power_t=0.5, @@ -461,7 +462,8 @@ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): } @abstractmethod - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, + @_deprecate_positional_args + def 
__init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -950,8 +952,9 @@ class SGDClassifier(BaseSGDClassifier): >>> print(clf.predict([[-0.8, -1]])) [1] """ - - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, l1_ratio=0.15, + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -1097,7 +1100,8 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): } @abstractmethod - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, @@ -1543,7 +1547,8 @@ class SGDRegressor(BaseSGDRegressor): Ridge, ElasticNet, Lasso, sklearn.svm.SVR """ - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 16f0adae12c9c..28d2dba3f8719 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,6 +20,7 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning _EPSILON = np.finfo(np.double).eps @@ -290,8 +291,8 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang http://home.olemiss.edu/~xdang/papers/MTSE.pdf """ - - def __init__(self, fit_intercept=True, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=1e4, n_subsamples=None, max_iter=300, tol=1.e-3, random_state=None, n_jobs=None, verbose=False): self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index e742689bcde3d..791983ba62cc2 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -94,8 +94,8 @@ def test_bad_input(): def test_perfect_signal_recovery(): idx, = gamma[:, 0].nonzero() - gamma_rec = orthogonal_mp(X, y[:, 0], 5) - gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], 5) + gamma_rec = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5) + gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5) assert_array_equal(idx, np.flatnonzero(gamma_rec)) assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_rec, decimal=2) @@ -110,7 +110,8 @@ def test_orthogonal_mp_gram_readonly(): G_readonly.setflags(write=False) Xy_readonly = Xy.copy() Xy_readonly.setflags(write=False) - gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], 5, + gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], + n_nonzero_coefs=5, copy_Gram=False, 
copy_Xy=False) assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2) @@ -163,8 +164,8 @@ def test_swapped_regressors(): gamma[0] = 0.5 new_y = np.dot(X, gamma) new_Xy = np.dot(X.T, new_y) - gamma_hat = orthogonal_mp(X, new_y, 2) - gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, 2) + gamma_hat = orthogonal_mp(X, new_y, n_nonzero_coefs=2) + gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, n_nonzero_coefs=2) assert_array_equal(np.flatnonzero(gamma_hat), [0, 21]) assert_array_equal(np.flatnonzero(gamma_hat_gram), [0, 21]) @@ -172,8 +173,10 @@ def test_swapped_regressors(): def test_no_atoms(): y_empty = np.zeros_like(y) Xy_empty = np.dot(X.T, y_empty) - gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, 1) - gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, 1) + gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, + n_nonzero_coefs=1) + gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, + n_nonzero_coefs=1) assert np.all(gamma_empty == 0) assert np.all(gamma_empty_gram == 0) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 1f7d3c2569bab..3710f3857a2a7 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -288,7 +288,9 @@ def test_ransac_none_estimator(): ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0) - ransac_none_estimator = RANSACRegressor(None, 2, 5, random_state=0) + ransac_none_estimator = RANSACRegressor(None, min_samples=2, + residual_threshold=5, + random_state=0) ransac_estimator.fit(X, y) ransac_none_estimator.fit(X, y) From b4757f7fe5b8d238ebb4cb150aeba52306c12071 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 19 Apr 2020 17:28:07 -0400 Subject: [PATCH 025/125] ENH add vlines_ attribute to PDP Display to hide deciles (#15785) --- doc/whats_new/v0.23.rst | 7 +++ .../inspection/_plot/partial_dependence.py | 43 +++++++++++++------ .../tests/test_plot_partial_dependence.py | 19 ++++++++ 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 9343f1ee46da9..b672c1c156c97 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -373,6 +373,13 @@ Changelog :class:`neural_network.MLPClassifier` by clipping the probabilities. :pr:`16117` by `Thomas Fan`_. +:mod:`sklearn.inspection` +......................... + +- |Enhancement| :class:`inspection.PartialDependenceDisplay` now exposes the + deciles lines as attributes so they can be hidden or customized. :pr:`15785` + by `Nicolas Hug`_ + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 4a83ac057d91e..812005f5ab2ae 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -391,21 +391,36 @@ class PartialDependenceDisplay: axes_ : ndarray of matplotlib Axes If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item - in `ax`. Elements that are None corresponds to a nonexisting axes in + in `ax`. Elements that are None correspond to a nonexisting axes in that position. 
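With the new display attributes described below, the decile markers can be switched off or
restyled after plotting. A minimal sketch (the estimator, dataset and styling choice are
placeholders, not part of the patch)::

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.inspection import plot_partial_dependence

    X, y = load_diabetes(return_X_y=True)
    est = GradientBoostingRegressor().fit(X, y)
    disp = plot_partial_dependence(est, X, [0, 2, (0, 2)])

    # hide the x-axis decile ticks on every panel
    for vlines in disp.deciles_vlines_.ravel():
        if vlines is not None:
            vlines.set_visible(False)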
lines_ : ndarray of matplotlib Artists - If `ax` is an axes or None, `line_[i, j]` is the partial dependence + If `ax` is an axes or None, `lines_[i, j]` is the partial dependence curve on the i-th row and j-th column. If `ax` is a list of axes, `lines_[i]` is the partial dependence curve corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a line plot. + deciles_vlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the x axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a PDP plot. + .. versionadded:: 0.23 + deciles_hlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the y axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a 2-way plot. + .. versionadded:: 0.23 + contours_ : ndarray of matplotlib Artists If `ax` is an axes or None, `contours_[i, j]` is the partial dependence plot on the i-th row and j-th column. If `ax` is a list of axes, `contours_[i]` is the partial dependence plot corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a contour plot. figure_ : matplotlib Figure @@ -490,8 +505,6 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): n_rows = int(np.ceil(n_features / float(n_cols))) self.axes_ = np.empty((n_rows, n_cols), dtype=np.object) - self.lines_ = np.empty((n_rows, n_cols), dtype=np.object) - self.contours_ = np.empty((n_rows, n_cols), dtype=np.object) axes_ravel = self.axes_.ravel() @@ -514,14 +527,20 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): self.bounding_ax_ = None self.figure_ = ax.ravel()[0].figure self.axes_ = ax - self.lines_ = np.empty_like(ax, dtype=np.object) - self.contours_ = np.empty_like(ax, dtype=np.object) # create contour levels for two-way plots if 2 in self.pdp_lim: Z_level = np.linspace(*self.pdp_lim[2], num=8) + + self.lines_ = np.empty_like(self.axes_, dtype=np.object) + self.contours_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_vlines_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_hlines_ = np.empty_like(self.axes_, dtype=np.object) + # Create 1d views of these 2d arrays for easy indexing lines_ravel = self.lines_.ravel(order='C') contours_ravel = self.contours_.ravel(order='C') + vlines_ravel = self.deciles_vlines_.ravel(order='C') + hlines_ravel = self.deciles_hlines_.ravel(order='C') for i, axi, fx, (avg_preds, values) in zip(count(), self.axes_.ravel(), @@ -547,8 +566,8 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transData, axi.transAxes) ylim = axi.get_ylim() - axi.vlines(self.deciles[fx[0]], 0, 0.05, transform=trans, - color='k') + vlines_ravel[i] = axi.vlines(self.deciles[fx[0]], 0, 0.05, + transform=trans, color='k') axi.set_ylim(ylim) # Set xlabel if it is not already set @@ -566,8 +585,8 @@ def plot(self, 
ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transAxes, axi.transData) xlim = axi.get_xlim() - axi.hlines(self.deciles[fx[1]], 0, 0.05, transform=trans, - color='k') + hlines_ravel[i] = axi.hlines(self.deciles[fx[1]], 0, 0.05, + transform=trans, color='k') # hline erases xlim axi.set_ylabel(self.feature_names[fx[1]]) axi.set_xlim(xlim) diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index abae91d4d2642..41da3f08c9094 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -51,11 +51,20 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_boston, boston): assert disp.axes_.shape == (1, 3) assert disp.lines_.shape == (1, 3) assert disp.contours_.shape == (1, 3) + assert disp.deciles_vlines_.shape == (1, 3) + assert disp.deciles_hlines_.shape == (1, 3) assert disp.lines_[0, 2] is None assert disp.contours_[0, 0] is None assert disp.contours_[0, 1] is None + # deciles lines: always show on xaxis, only show on yaxis if 2-way PDP + for i in range(3): + assert disp.deciles_vlines_[0, i] is not None + assert disp.deciles_hlines_[0, 0] is None + assert disp.deciles_hlines_[0, 1] is None + assert disp.deciles_hlines_[0, 2] is not None + assert disp.features == [(0, ), (1, ), (0, 1)] assert np.all(disp.feature_names == feature_names) assert len(disp.deciles) == 2 @@ -132,9 +141,15 @@ def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston, assert disp.axes_.shape == (2, 1) assert disp.lines_.shape == (2, 1) assert disp.contours_.shape == (2, 1) + assert disp.deciles_vlines_.shape == (2, 1) + assert disp.deciles_hlines_.shape == (2, 1) assert disp.lines_[0, 0] is None + assert disp.deciles_vlines_[0, 0] is not None + assert disp.deciles_hlines_[0, 0] is not None assert disp.contours_[1, 0] is None + assert disp.deciles_hlines_[1, 0] is None + assert disp.deciles_vlines_[1, 0] is not None # line ax = disp.axes_[1, 0] @@ -309,6 +324,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_target_0.axes_.shape == (1, 2) assert disp_target_0.lines_.shape == (1, 2) assert disp_target_0.contours_.shape == (1, 2) + assert disp_target_0.deciles_vlines_.shape == (1, 2) + assert disp_target_0.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_target_0.contours_.flat) assert disp_target_0.target_idx == 0 @@ -323,6 +340,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_symbol.axes_.shape == (1, 2) assert disp_symbol.lines_.shape == (1, 2) assert disp_symbol.contours_.shape == (1, 2) + assert disp_symbol.deciles_vlines_.shape == (1, 2) + assert disp_symbol.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_symbol.contours_.flat) assert disp_symbol.target_idx == 0 From 6973096a0f0a2d1eb6c5d3cadd55a89276368311 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Apr 2020 11:31:18 -0400 Subject: [PATCH 026/125] DOC details on the use of xfail_checks (#16968) --- doc/developers/develop.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index d8ae6dd224840..cc4e8f6678c01 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -518,9 +518,16 @@ _skip_test (default=False) *very good* reason. 
_xfail_checks (default=False) - dictionary ``{check_name : reason}`` of common checks to mark as a - known failure, with the associated reason. Don't use this unless you have a - *very good* reason. + dictionary ``{check_name: reason}`` of common checks that will be marked + as `XFAIL` for pytest, when using + :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. This tag + currently has no effect on + :func:`~sklearn.utils.estimator_checks.check_estimator`. + Don't use this unless there is a *very good* reason for your estimator + not to pass the check. + Also note that the usage of this tag is highly subject to change because + we are trying to make it more flexible: be prepared for breaking changes + in the future. stateless (default=False) whether the estimator needs access to data for fitting. Even though an From 7e15285d2e0587af976848fbc44c95c706640839 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 20 Apr 2020 19:48:36 +0200 Subject: [PATCH 027/125] API make gaussian_process __init__ params kwarg (#16870) --- sklearn/gaussian_process/_gpc.py | 17 ++++++++++++----- sklearn/gaussian_process/_gpr.py | 4 +++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index e70838c6d251a..2c9c0ef483d4f 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -19,6 +19,7 @@ from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder from ..multiclass import OneVsRestClassifier, OneVsOneClassifier +from ..utils.validation import _deprecate_positional_args # Values required for approximating the logistic sigmoid by @@ -144,7 +145,8 @@ def optimizer(obj_func, initial_theta, bounds): The log-marginal-likelihood of ``self.kernel_.theta`` """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None): self.kernel = kernel @@ -586,7 +588,8 @@ def optimizer(obj_func, initial_theta, bounds): .. 
versionadded:: 0.18 """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, multi_class="one_vs_rest", n_jobs=None): @@ -623,9 +626,13 @@ def fit(self, X, y): ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( - self.kernel, self.optimizer, self.n_restarts_optimizer, - self.max_iter_predict, self.warm_start, self.copy_X_train, - self.random_state) + kernel=self.kernel, + optimizer=self.optimizer, + n_restarts_optimizer=self.n_restarts_optimizer, + max_iter_predict=self.max_iter_predict, + warm_start=self.warm_start, + copy_X_train=self.copy_X_train, + random_state=self.random_state) self.classes_ = np.unique(y) self.n_classes_ = self.classes_.size diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index caf94ce41c1b4..0ba594a7ffaac 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -17,6 +17,7 @@ from ..utils import check_random_state from ..utils.validation import check_array from ..utils.optimize import _check_optimize_result +from ..utils.validation import _deprecate_positional_args class GaussianProcessRegressor(MultiOutputMixin, @@ -149,7 +150,8 @@ def optimizer(obj_func, initial_theta, bounds): (array([653.0..., 592.1...]), array([316.6..., 316.6...])) """ - def __init__(self, kernel=None, alpha=1e-10, + @_deprecate_positional_args + def __init__(self, kernel=None, *, alpha=1e-10, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None): self.kernel = kernel From c8e055883b5fcc3f3ac8850eefd7b3c5e7f7ff2d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Mon, 20 Apr 2020 21:44:38 +0200 Subject: [PATCH 028/125] DOC Add link, fix wording of KNeighborsRegressor (#16969) --- sklearn/neighbors/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 00d8f10c8880d..a00d83c98102b 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -77,10 +77,10 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
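The "precomputed" option mentioned above expects a square distance matrix at fit time and
test-to-train distances at predict time. A small sketch with synthetic data (the shapes
are the point here; none of it comes from the patch)::

    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.neighbors import KNeighborsRegressor

    rng = np.random.RandomState(0)
    X_train, y_train = rng.rand(20, 3), rng.rand(20)
    X_test = rng.rand(5, 3)

    D_train = pairwise_distances(X_train)          # (20, 20), square, used in fit
    D_test = pairwise_distances(X_test, X_train)   # (5, 20), queries vs. training set

    reg = KNeighborsRegressor(n_neighbors=3, metric="precomputed")
    reg.fit(D_train, y_train)
    pred = reg.predict(D_test)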
metric_params : dict, default=None From 7736062aaeb301b367500b6c387b1806dd7ff1eb Mon Sep 17 00:00:00 2001 From: smarie Date: Tue, 21 Apr 2020 14:06:07 +0200 Subject: [PATCH 029/125] DOC Libsvm liblinear rand fix - minor doc and header fixes (#16979) --- doc/whats_new/v0.23.rst | 6 +++--- sklearn/svm/src/liblinear/linear.cpp | 2 +- sklearn/svm/src/libsvm/svm.cpp | 2 +- sklearn/svm/src/newrand/newrand.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b672c1c156c97..b5cb4b7012182 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -407,14 +407,14 @@ Changelog generators used to randomly select coordinates in the coordinate descent algorithms. Platform-dependent C ``rand()`` was used, which is only able to generate numbers up to ``32767`` on windows platform (see this `blog - post `) and also has poor + post `_) and also has poor randomization power as suggested by `this presentation - `. + `_. It was replaced with C++11 ``mt19937``, a Mersenne Twister that correctly generates 31bits/63bits random numbers on all platforms. In addition, the crude "modulo" postprocessor used to get a random number in a bounded interval was replaced by the tweaked Lemire method as suggested by `this blog - post `. + post `_. Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index cc603b435f655..29a5581c280dc 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -26,7 +26,7 @@ Modified 2020: - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie + Sylvain Marie, Schneider Electric See */ diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index c9a5df10c4924..a5f735d8111cf 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -52,7 +52,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie, + Sylvain Marie, Schneider Electric see */ diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index b46861b71e765..7cd7b4c9fbf2b 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -3,7 +3,7 @@ - New random number generator using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets for libsvm and liblinear. - Sylvain Marie + Sylvain Marie, Schneider Electric See */ From c531bd0d865702af7b033b872b6214aded1a710b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 21 Apr 2020 10:21:48 -0400 Subject: [PATCH 030/125] API Adds missing keyword only argument to PCA (#16975) --- sklearn/decomposition/_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 87092d7ccd17e..14a993c56dce8 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -320,7 +320,7 @@ class PCA(_BasePCA): [6.30061...] 
""" @_deprecate_positional_args - def __init__(self, n_components=None, copy=True, whiten=False, + def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): self.n_components = n_components From f82a2cb33871a67b36150647ece1c7e56d3132bb Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 21 Apr 2020 19:48:00 +0200 Subject: [PATCH 031/125] replace boston (#16922) --- sklearn/ensemble/tests/test_voting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4eb47bea0a514..f81b9e59a5f1b 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -33,7 +33,7 @@ iris = datasets.load_iris() X, y = iris.data[:, 1:3], iris.target -X_r, y_r = datasets.load_boston(return_X_y=True) +X_r, y_r = datasets.load_diabetes(return_X_y=True) @pytest.mark.parametrize( @@ -120,7 +120,7 @@ def test_weights_iris(): def test_weights_regressor(): - """Check weighted average regression prediction on boston dataset.""" + """Check weighted average regression prediction on diabetes dataset.""" reg1 = DummyRegressor(strategy='mean') reg2 = DummyRegressor(strategy='median') reg3 = DummyRegressor(strategy='quantile', quantile=.2) From 089c8a17a166b039691bfee45d14577e14292a41 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 Apr 2020 08:41:45 -0400 Subject: [PATCH 032/125] [MRG] MNT requires_y tag with y=None validation (#16622) --- doc/developers/develop.rst | 5 ++ doc/whats_new/v0.23.rst | 8 +- sklearn/base.py | 36 +++++++- sklearn/covariance/_empirical_covariance.py | 1 + sklearn/cross_decomposition/_pls.py | 3 +- sklearn/ensemble/_gb.py | 6 +- sklearn/feature_selection/_rfe.py | 4 +- .../_univariate_selection.py | 3 + sklearn/linear_model/_base.py | 3 + sklearn/linear_model/_coordinate_descent.py | 89 ++++++++++++------- sklearn/linear_model/_ransac.py | 10 ++- sklearn/neighbors/_base.py | 14 ++- sklearn/neighbors/_graph.py | 8 +- sklearn/neighbors/_lof.py | 4 +- sklearn/neighbors/_nca.py | 3 + sklearn/neighbors/_unsupervised.py | 4 +- sklearn/tree/_classes.py | 10 ++- sklearn/utils/estimator_checks.py | 34 +++++++ 18 files changed, 185 insertions(+), 60 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index cc4e8f6678c01..f17c58cee0d7f 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -510,6 +510,11 @@ requires_fit (default=True) requires_positive_X (default=False) whether the estimator requires positive X. +requires_y (default=False) + whether the estimator requires y to be passed to `fit`, `fit_predict` or + `fit_transform` methods. The tag is True for estimators inheriting from + `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`. + requires_positive_y (default=False) whether the estimator requires a positive y (only applicable for regression). diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b5cb4b7012182..3b8af7ccf416d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -476,4 +476,10 @@ Miscellaneous attribute is equal to the number of features passed to the `fit` method. See `SLEP010 `_ - for details. :pr:`16112` by `Nicolas Hug`_. + for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_. + +- |API| Estimators now have a `requires_y` tags which is False by default + except for estimators that inherit from `~sklearn.base.RegressorMixin` or + `~sklearn.base.ClassifierMixin`. 
This tag is used to ensure that a proper + error message is raised when y was expected but None was passed. + :pr:`16622` by `Nicolas Hug`_. diff --git a/sklearn/base.py b/sklearn/base.py index 8a6041cc17982..c0328e00d84d0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -36,7 +36,9 @@ '_xfail_checks': False, 'multioutput_only': False, 'binary_only': False, - 'requires_fit': True} + 'requires_fit': True, + 'requires_y': False, + } def clone(estimator, safe=True): @@ -374,7 +376,8 @@ def _check_n_features(self, X, reset): self.n_features_in_) ) - def _validate_data(self, X, y=None, reset=True, **check_params): + def _validate_data(self, X, y=None, reset=True, + validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. Parameters @@ -389,9 +392,14 @@ def _validate_data(self, X, y=None, reset=True, **check_params): Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call validate_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. **check_params : kwargs Parameters passed to :func:`sklearn.utils.check_array` or - :func:`sklearn.utils.check_X_y`. + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. Returns ------- @@ -400,10 +408,24 @@ def _validate_data(self, X, y=None, reset=True, **check_params): """ if y is None: + if self._get_tags()['requires_y']: + raise ValueError( + f"This {self.__class__.__name__} estimator " + f"requires y to be passed, but the target y is None." + ) X = check_array(X, **check_params) out = X else: - X, y = check_X_y(X, y, **check_params) + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) out = X, y if check_params.get('ensure_2d', True): @@ -444,6 +466,9 @@ def score(self, X, y, sample_weight=None): from .metrics import accuracy_score return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class RegressorMixin: """Mixin class for all regression estimators in scikit-learn.""" @@ -494,6 +519,9 @@ def score(self, X, y, sample_weight=None): y_pred = self.predict(X) return r2_score(y, y_pred, sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class ClusterMixin: """Mixin class for all cluster estimators in scikit-learn.""" diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index c83dbc89697e1..684a57fdeb296 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -79,6 +79,7 @@ def empirical_covariance(X, assume_centered=False): [0.25, 0.25, 0.25]]) """ X = np.asarray(X) + if X.ndim == 1: X = np.reshape(X, (1, -1)) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 508448c3ede39..b6912f81105a8 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -519,7 +519,8 @@ def fit_transform(self, X, y=None): return 
self.fit(X, y).transform(X, y) def _more_tags(self): - return {'poor_score': True} + return {'poor_score': True, + 'requires_y': False} class PLSRegression(_PLS): diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 32e534fdc8517..439500c1917d8 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -405,15 +405,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=DTYPE) + + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE, multi_output=True) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None sample_weight = _check_sample_weight(sample_weight, X) - y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) y = column_or_1d(y, warn=True) y = self._validate_y(y, sample_weight) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7e7aada0d70b3..6d9bb8c463df6 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -340,7 +340,9 @@ def predict_log_proba(self, X): def _more_tags(self): estimator_tags = self.estimator._get_tags() return {'poor_score': True, - 'allow_nan': estimator_tags.get('allow_nan', True)} + 'allow_nan': estimator_tags.get('allow_nan', True), + 'requires_y': True, + } class RFECV(RFE): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7ca0ce4a36715..21a2ddc10a1eb 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -363,6 +363,9 @@ def fit(self, X, y): def _check_params(self, X, y): pass + def _more_tags(self): + return {'requires_y': True} + ###################################################################### # Specific filters diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 56e5e24761128..c1f6b8233bdac 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -246,6 +246,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale): else: self.intercept_ = 0. + def _more_tags(self): + return {'requires_y': True} + # XXX Should this derive from LinearModel? It should be a mixin, not an ABC. # Maybe the n_features checking can be moved to LinearModel. diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 5d932033dbd0d..f4430c5bcac55 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1157,8 +1157,52 @@ def fit(self, X, y): y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values """ - y = check_array(y, copy=False, dtype=[np.float64, np.float32], - ensure_2d=False) + # This makes sure that there is no duplication in memory. 
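Taken together, the new ``requires_y`` tag and the check in ``_validate_data``
give estimators an explicit error message when the target is missing. A
minimal sketch (the ``MeanRegressor`` estimator is made up for illustration
and relies on the private ``_validate_data`` / ``_more_tags`` API shown
above)::

    import numpy as np
    from sklearn.base import BaseEstimator, RegressorMixin

    class MeanRegressor(RegressorMixin, BaseEstimator):
        # RegressorMixin._more_tags() now sets 'requires_y': True, so
        # _validate_data raises a clear error if fit is called without y.
        def fit(self, X, y=None):
            X, y = self._validate_data(X, y)
            self.mean_ = np.mean(y)
            return self

        def predict(self, X):
            return np.full(shape=(len(X),), fill_value=self.mean_)

    X = np.arange(6, dtype=float).reshape(3, 2)
    MeanRegressor().fit(X, [1.0, 2.0, 3.0])   # works
    # MeanRegressor().fit(X)  # ValueError: requires y to be passed, but y is None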
+ # Dealing right with copy_X is important in the following: + # Multiple functions touch X and subsamples of X and can induce a + # lot of duplication of memory + copy_X = self.copy_X and self.fit_intercept + + check_y_params = dict(copy=False, dtype=[np.float64, np.float32], + ensure_2d=False) + if isinstance(X, np.ndarray) or sparse.isspmatrix(X): + # Keep a reference to X + reference_to_old_X = X + # Let us not impose fortran ordering so far: it is + # not useful for the cross-validation loop and will be done + # by the model fitting itself + + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) + if sparse.isspmatrix(X): + if (hasattr(reference_to_old_X, "data") and + not np.may_share_memory(reference_to_old_X.data, X.data)): + # X is a sparse matrix and has been copied + copy_X = False + elif not np.may_share_memory(reference_to_old_X, X): + # X has been copied + copy_X = False + del reference_to_old_X + else: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) + copy_X = False + if y.shape[0] == 0: raise ValueError("y has 0 samples: %r" % y) @@ -1191,35 +1235,6 @@ def fit(self, X, y): if self.selection not in ["random", "cyclic"]: raise ValueError("selection should be either random or cyclic.") - # This makes sure that there is no duplication in memory. - # Dealing right with copy_X is important in the following: - # Multiple functions touch X and subsamples of X and can induce a - # lot of duplication of memory - copy_X = self.copy_X and self.fit_intercept - - if isinstance(X, np.ndarray) or sparse.isspmatrix(X): - # Keep a reference to X - reference_to_old_X = X - # Let us not impose fortran ordering so far: it is - # not useful for the cross-validation loop and will be done - # by the model fitting itself - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], copy=False) - if sparse.isspmatrix(X): - if (hasattr(reference_to_old_X, "data") and - not np.may_share_memory(reference_to_old_X.data, X.data)): - # X is a sparse matrix and has been copied - copy_X = False - elif not np.may_share_memory(reference_to_old_X, X): - # X has been copied - copy_X = False - del reference_to_old_X - else: - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], order='F', - copy=copy_X) - copy_X = False - if X.shape[0] != y.shape[0]: raise ValueError("X and y have inconsistent dimensions (%d != %d)" % (X.shape[0], y.shape[0])) @@ -1842,9 +1857,15 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) - y = check_array(y, dtype=X.dtype.type, ensure_2d=False) + + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. 
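The ``copy_X`` bookkeeping above relies on detecting whether validation has
already copied ``X``. A small standalone illustration of that detection with
``numpy.may_share_memory`` (the arrays are illustrative only)::

    import numpy as np
    from sklearn.utils import check_array

    X = np.arange(6, dtype=np.float64).reshape(3, 2)
    X_same = check_array(X, dtype=np.float64, copy=False)  # no conversion needed
    X_cast = check_array(X, dtype=np.float32, copy=False)  # dtype cast copies

    np.may_share_memory(X, X_same)  # True  -> a copy may still be needed later
    np.may_share_memory(X, X_cast)  # False -> already copied, copy_X can be False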
+ check_X_params = dict(dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) + check_y_params = dict(ensure_2d=False) + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) + y = y.astype(X.dtype) if hasattr(self, 'l1_ratio'): model_str = 'ElasticNet' diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 86ceb0d5e311f..5eac651c76383 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,7 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin -from ..utils import check_random_state, check_array, check_consistent_length +from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import _deprecate_positional_args @@ -247,8 +247,12 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = self._validate_data(X, accept_sparse='csr') - y = check_array(y, ensure_2d=False) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. + check_X_params = dict(accept_sparse='csr') + check_y_params = dict(ensure_2d=False) + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) check_consistent_length(X, y) if self.base_estimator is not None: diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 945959ef10d9c..a1eebdcf78648 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import BaseEstimator, MultiOutputMixin from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import check_array, gen_even_slices from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted @@ -1104,10 +1104,14 @@ def fit(self, X, y): or [n_samples, n_outputs] """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) self._y = y return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class SupervisedIntegerMixin: def fit(self, X, y): @@ -1124,7 +1128,8 @@ def fit(self, X, y): """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: if y.ndim != 1: @@ -1151,6 +1156,9 @@ def fit(self, X, y): return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class UnsupervisedMixin: def fit(self, X, y=None): diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 9fc4a6e830cde..d217999196950 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -192,8 +192,8 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', return X.radius_neighbors_graph(query, radius, mode) -class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of k nearest 
neighbors The transformed data is a sparse graph as returned by kneighbors_graph. @@ -335,8 +335,8 @@ def fit_transform(self, X, y=None): return self.fit(X).transform(X) -class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of neighbors nearer than a radius The transformed data is a sparse graph as returned by diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index e03c4d9cb1e0e..f3b141bf499e5 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -16,8 +16,8 @@ __all__ = ["LocalOutlierFactor"] -class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, - OutlierMixin): +class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, + OutlierMixin, NeighborsBase): """Unsupervised Outlier Detection using Local Outlier Factor (LOF) The anomaly score of each sample is called Local Outlier Factor. diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index cd87d594281da..1017f5cf12606 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -520,3 +520,6 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): sys.stdout.flush() return sign * loss, sign * gradient.ravel() + + def _more_tags(self): + return {'requires_y': True} diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 20be4f636c2a4..6faafeee9ffcd 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -5,8 +5,8 @@ from ._base import UnsupervisedMixin -class NearestNeighbors(NeighborsBase, KNeighborsMixin, - RadiusNeighborsMixin, UnsupervisedMixin): +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, + UnsupervisedMixin, NeighborsBase): """Unsupervised learner for implementing neighbor searches. Read more in the :ref:`User Guide `. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f252ba0acbb1c..81cd7bd8e8989 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -146,8 +146,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, raise ValueError("ccp_alpha must be greater than or equal to 0") if check_input: - X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc") - y = check_array(y, ensure_2d=False, dtype=None) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) if issparse(X): X.sort_indices() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 351f24b66283e..efac2aca2a2df 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -270,6 +270,8 @@ def _yield_all_checks(name, estimator): yield check_fit_idempotent if not tags["no_validation"]: yield check_n_features_in + if tags["requires_y"]: + yield check_requires_y_none if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2976,3 +2978,35 @@ def check_n_features_in(name, estimator_orig): "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa FutureWarning ) + + +def check_requires_y_none(name, estimator_orig): + # Make sure that an estimator with requires_y=True fails gracefully when + # given y=None + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = _pairwise_estimator_convert_X(X, estimator) + + warning_msg = ("As of scikit-learn 0.23, estimators should have a " + "'requires_y' tag set to the appropriate value. " + "The default value of the tag is False. " + "An error will be raised from version 0.25 when calling " + "check_estimator() if the tag isn't properly set.") + + expected_err_msgs = ( + "requires y to be passed, but the target y is None", + "Expected array-like (array or non-string sequence), got None", + "y should be a 1d array" + ) + + try: + estimator.fit(X, None) + except ValueError as ve: + if not any(msg in str(ve) for msg in expected_err_msgs): + warnings.warn(warning_msg, FutureWarning) From 5f6dfcb4d18efb100e5f540d04e0b2287383f845 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 08:42:04 -0400 Subject: [PATCH 033/125] API Deprecate positional arguments in pipeline (#16997) --- sklearn/pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c1bbdbd629ff8..477354107e133 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -21,6 +21,7 @@ from .utils.metaestimators import if_delegate_has_method from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory +from .utils.validation import _deprecate_positional_args from .utils.metaestimators import _BaseComposition @@ -104,7 +105,8 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ['steps'] - def __init__(self, steps, memory=None, verbose=False): + @_deprecate_positional_args + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory self.verbose = verbose @@ -797,7 +799,8 @@ class FeatureUnion(TransformerMixin, _BaseComposition): """ _required_parameters = ["transformer_list"] - def __init__(self, transformer_list, n_jobs=None, + @_deprecate_positional_args + def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False): self.transformer_list = transformer_list self.n_jobs = n_jobs From 1523f395e952dd79d2427d8230056ed0cf47f7a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 22 Apr 2020 14:50:42 +0200 Subject: [PATCH 034/125] MNT fix memory leak in elkan KMeans 
(#17000) --- sklearn/cluster/_k_means_elkan.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 65c8871fbb456..70c4abb0c4ac7 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -322,6 +322,9 @@ def _elkan_iter_chunked_dense( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_dense(X, sample_weight, centers_old, centers_new, weight_in_clusters, labels) @@ -553,6 +556,9 @@ def _elkan_iter_chunked_sparse( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_sparse( X_data, X_indices, X_indptr, sample_weight, From 388999b59d0070fafdcd76cf599af6758b25d987 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 09:20:57 -0400 Subject: [PATCH 035/125] API Deprecate positional arguments in tree module (#16966) --- sklearn/tree/_classes.py | 16 +++++++++++----- sklearn/tree/_export.py | 10 +++++++--- sklearn/tree/tests/test_export.py | 3 +-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 81cd7bd8e8989..3994613d92b6b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -36,6 +36,7 @@ from ..utils import compute_sample_weight from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._criterion import Criterion from ._splitter import Splitter @@ -82,7 +83,8 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion, splitter, max_depth, @@ -815,7 +817,8 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="gini", splitter="best", max_depth=None, @@ -1169,7 +1172,8 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="mse", splitter="best", max_depth=None, @@ -1499,7 +1503,8 @@ class ExtraTreeClassifier(DecisionTreeClassifier): >>> cls.score(X_test, y_test) 0.8947... """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="gini", splitter="random", max_depth=None, @@ -1716,7 +1721,8 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg.score(X_test, y_test) 0.33... """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="mse", splitter="random", max_depth=None, diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 3197995818f81..d0f67326012e9 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,6 +17,7 @@ import numpy as np from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..base import is_classifier from . 
import _criterion @@ -77,7 +78,8 @@ def __repr__(self): SENTINEL = Sentinel() -def plot_tree(decision_tree, max_depth=None, feature_names=None, +@_deprecate_positional_args +def plot_tree(decision_tree, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rotate='deprecated', rounded=False, @@ -656,7 +658,8 @@ def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): ax.annotate("\n (...) \n", xy_parent, xy, **kwargs) -def export_graphviz(decision_tree, out_file=None, max_depth=None, +@_deprecate_positional_args +def export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, @@ -807,7 +810,8 @@ def compute_depth_(current_node, current_depth, return max(depths) -def export_text(decision_tree, feature_names=None, max_depth=10, +@_deprecate_positional_args +def export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False): """Build a text report showing the rules of a decision tree. diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index ad49f81fcf9ac..f12f1daeb57c1 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -465,6 +465,5 @@ def test_not_fitted_tree(pyplot): # Testing if not fitted tree throws the correct error clf = DecisionTreeRegressor() - out = StringIO() with pytest.raises(NotFittedError): - plot_tree(clf, out) + plot_tree(clf) From 79df4068585ec06e591bffe3330bcb783c0497dd Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 17:11:32 -0400 Subject: [PATCH 036/125] API Deprecate positional arguments in random_projection (#16995) * API Deprecate positional arguments in random_projection * CLN Address comments --- sklearn/random_projection.py | 9 ++++++--- sklearn/tests/test_random_projection.py | 14 +++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index d18f3bf846901..61eeeea5ef45e 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -39,6 +39,7 @@ from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_array, check_is_fitted +from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning from .utils import deprecated @@ -310,7 +311,7 @@ class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, n_components='auto', eps=0.1, dense_output=False, + def __init__(self, n_components='auto', *, eps=0.1, dense_output=False, random_state=None): self.n_components = n_components self.eps = eps @@ -489,7 +490,8 @@ class GaussianRandomProjection(BaseRandomProjection): SparseRandomProjection """ - def __init__(self, n_components='auto', eps=0.1, random_state=None): + @_deprecate_positional_args + def __init__(self, n_components='auto', *, eps=0.1, random_state=None): super().__init__( n_components=n_components, eps=eps, @@ -626,7 +628,8 @@ class SparseRandomProjection(BaseRandomProjection): https://users.soe.ucsc.edu/~optas/papers/jl.pdf """ - def __init__(self, n_components='auto', density='auto', eps=0.1, + @_deprecate_positional_args + def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, 
random_state=None): super().__init__( n_components=n_components, diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 033bb84279d54..b8d69632105b0 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -62,21 +62,21 @@ def densify(matrix): # test on JL lemma ############################################################################### def test_invalid_jl_domain(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, 1.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, 0.0) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, -0.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, 0.5) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=1.1) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=0.0) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=-0.1) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, eps=0.5) def test_input_size_jl_min_dim(): assert_raises(ValueError, johnson_lindenstrauss_min_dim, - 3 * [100], 2 * [0.9]) + 3 * [100], eps=2 * [0.9]) assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100], - 2 * [0.9]) + eps=2 * [0.9]) johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)), - np.full((10, 10), 0.5)) + eps=np.full((10, 10), 0.5)) ############################################################################### From 6717c6afa0d1f7b1c7c9ed7b2abee903d7072c7e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 17:14:06 -0400 Subject: [PATCH 037/125] API Deprecate positional arguments in semi_supervised module (#16974) * API Keyword only for semi_supervised * CLN Address comments --- sklearn/semi_supervised/_label_propagation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index d46dacbe754e4..ccc6b889f41f6 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -68,6 +68,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -105,7 +106,8 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): for more details. 
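The runtime effect of ``_deprecate_positional_args`` combined with the bare
``*`` in these signatures is sketched below (warning text paraphrased)::

    from sklearn.semi_supervised import LabelPropagation

    # Preferred: everything after `kernel` is passed by keyword.
    LabelPropagation(kernel='knn', n_neighbors=10)

    # Still accepted during the deprecation period, but now emits a
    # FutureWarning asking for keyword arguments; it is planned to become
    # an error in a later release.
    LabelPropagation('knn', 20, 10)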
""" - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3, n_jobs=None): self.max_iter = max_iter @@ -378,7 +380,8 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3, n_jobs=None): super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter, @@ -491,7 +494,8 @@ class LabelSpreading(BaseLabelPropagation): _variant = 'spreading' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): # this one has different base parameters From 8010cadf6ca9d319321ac72ff5e604b941f0d40c Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 04:03:51 -0400 Subject: [PATCH 038/125] API Deprecate positional arguments in svm module (#16973) * API Keyword only for svm * BUG Fix * CLN Address comments --- sklearn/svm/_bounds.py | 4 +++- sklearn/svm/_classes.py | 25 ++++++++++++++++--------- sklearn/svm/tests/test_bounds.py | 7 ++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index 1e1ed8939ce5f..b35728041f6cf 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -6,10 +6,12 @@ from ..preprocessing import LabelBinarizer from ..utils.validation import check_consistent_length, check_array +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot -def l1_min_c(X, y, loss='squared_hinge', fit_intercept=True, +@_deprecate_positional_args +def l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0): """ Return the lowest bound for C such that for C in (l1_min_C, infinity) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 10975a6f8e4a2..77110da119a02 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.deprecation import deprecated @@ -177,9 +178,9 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - - def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, - C=1.0, multi_class='ovr', fit_intercept=True, + @_deprecate_positional_args + def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, + tol=1e-4, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000): self.dual = dual @@ -364,7 +365,8 @@ class LinearSVR(RegressorMixin, LinearModel): various loss functions and regularization regimes. 
""" - def __init__(self, epsilon=0.0, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, *, epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1., dual=True, verbose=0, random_state=None, max_iter=1000): @@ -627,7 +629,8 @@ class SVC(BaseSVC): _impl = 'c_svc' - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', @@ -838,7 +841,8 @@ class NuSVC(BaseSVC): _impl = 'nu_svc' - def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, @@ -992,7 +996,8 @@ class SVR(RegressorMixin, BaseLibSVM): _impl = 'epsilon_svr' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): @@ -1137,7 +1142,8 @@ class NuSVR(RegressorMixin, BaseLibSVM): _impl = 'nu_svr' - def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, + @_deprecate_positional_args + def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=1e-3, cache_size=200, verbose=False, max_iter=-1): @@ -1251,7 +1257,8 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): _impl = 'one_class' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 664a39c790b9b..8454ebf64de1a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -37,11 +37,12 @@ def test_l1_min_c(loss, X_label, Y_label, intercept_label): def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", - l1_min_c, dense_X, Y1, "l2") + l1_min_c, dense_X, Y1, loss="l2") def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): - min_c = l1_min_c(X, y, loss, fit_intercept, intercept_scaling) + min_c = l1_min_c(X, y, loss=loss, fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling) clf = { 'log': LogisticRegression(penalty='l1', solver='liblinear'), @@ -72,4 +73,4 @@ def test_ill_posed_min_c(): def test_unsupported_loss(): with pytest.raises(ValueError): - l1_min_c(dense_X, Y1, 'l1') + l1_min_c(dense_X, Y1, loss='l1') From fbae1edfd8a7c0dda20ed7822c9f88f4bbf4f266 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 04:20:52 -0400 Subject: [PATCH 039/125] API kwonly args in base (#17006) --- sklearn/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index c0328e00d84d0..bf5ee370aa8f1 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -20,6 +20,7 @@ from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils.validation import 
_deprecate_positional_args _DEFAULT_TAGS = { 'non_deterministic': False, @@ -41,7 +42,8 @@ } -def clone(estimator, safe=True): +@_deprecate_positional_args +def clone(estimator, *, safe=True): """Constructs a new estimator with the same parameters. Clone does a deep copy of the model in an estimator From bbedfa0216877e26376acf147c50458710a4fc03 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 23 Apr 2020 11:43:07 +0200 Subject: [PATCH 040/125] use semilogx for a more readable xaxis (#17001) --- .../plot_lasso_model_selection.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 88e83d434a3c6..73fc94fb94600 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -80,14 +80,12 @@ def plot_ic_criterion(model, name, color): - alpha_ = model.alpha_ + EPSILON - alphas_ = model.alphas_ + EPSILON criterion_ = model.criterion_ - plt.plot(-np.log10(alphas_), criterion_, '--', color=color, - linewidth=3, label='%s criterion' % name) - plt.axvline(-np.log10(alpha_), color=color, linewidth=3, + plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color, + linewidth=3, label='%s criterion' % name) + plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3, label='alpha: %s estimate' % name) - plt.xlabel('-log(alpha)') + plt.xlabel(r'$\alpha$') plt.ylabel('criterion') @@ -108,19 +106,17 @@ def plot_ic_criterion(model, name, color): t_lasso_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.alphas_ + EPSILON) - plt.figure() ymin, ymax = 2300, 3800 -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', +plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':') +plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_ + EPSILON), linestyle='--', color='k', +plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k', label='alpha: CV estimate') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) @@ -137,17 +133,15 @@ def plot_ic_criterion(model, name, color): t_lasso_lars_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON) - plt.figure() -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', - label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':') +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', + label='Average across the folds', linewidth=2) +plt.axvline(model.alpha_, linestyle='--', color='k', label='alpha CV') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: Lars (train time: %.2fs)' % t_lasso_lars_cv) From 49d213d2fc8b9676c5ab3238d3756c8397543b05 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 08:19:35 -0400 Subject: [PATCH 041/125] [MRG] API kwonly for neural_network module (#17002) --- sklearn/neural_network/_multilayer_perceptron.py | 8 +++++--- sklearn/neural_network/_rbm.py | 5 +++-- 2 files changed, 8 
insertions(+), 5 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 3ec30336c23c1..f9b8fce5eb0c7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -25,7 +25,7 @@ from ..utils import check_array, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils.multiclass import _check_partial_fit_first_call, unique_labels from ..utils.multiclass import type_of_target from ..utils.optimize import _check_optimize_result @@ -936,7 +936,8 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=200, @@ -1339,7 +1340,8 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 03b69c656b4a3..67e1d68a3607e 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -20,7 +20,7 @@ from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -106,7 +106,8 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Approximations to the Likelihood Gradient. International Conference on Machine Learning (ICML) 2008 """ - def __init__(self, n_components=256, learning_rate=0.1, batch_size=10, + @_deprecate_positional_args + def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None): self.n_components = n_components self.learning_rate = learning_rate From 946fddec7b62215191524c3f950a41fe944d014c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 10:19:45 -0400 Subject: [PATCH 042/125] BUG Fix instability issue of ARDRegression (with speedup) (#16849) --- doc/whats_new/v0.23.rst | 6 ++ sklearn/externals/_scipy_linalg.py | 118 ----------------------- sklearn/linear_model/_bayes.py | 59 ++++++++---- sklearn/linear_model/tests/test_bayes.py | 48 ++++++--- sklearn/utils/fixes.py | 8 -- 5 files changed, 83 insertions(+), 156 deletions(-) delete mode 100644 sklearn/externals/_scipy_linalg.py diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 3b8af7ccf416d..6719e511a583d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -303,6 +303,12 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. 
:pr:`16266` by :user:`Rushabh Vasani `. +- |Fix| |Efficiency| :class:`linear_model.ARDRegression` is more stable and + much faster when `n_samples > n_features`. It can now scale to hundreds of + thousands of samples. The stability fix might imply changes in the number + of non-zero coefficients and in the predicted output. :pr:`16849` by + `Nicolas Hug`_. + - |Enhancement| :class:`linear_model.LassoLars` and :class:`linear_model.Lars` now support a `jitter` parameter that adds random noise to the target. This might help with stability in some edge diff --git a/sklearn/externals/_scipy_linalg.py b/sklearn/externals/_scipy_linalg.py deleted file mode 100644 index 70a6ff5a0c623..0000000000000 --- a/sklearn/externals/_scipy_linalg.py +++ /dev/null @@ -1,118 +0,0 @@ -# This should remained pinned to version 1.2 and not updated like other -# externals. -"""Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import numpy as np -import scipy.linalg.decomp as decomp - - -def pinvh(a, cond=None, rcond=None, lower=True, return_rank=False, - check_finite=True): - """ - Compute the (Moore-Penrose) pseudo-inverse of a Hermitian matrix. - - Copied in from scipy==1.2.2, in order to preserve the default choice of the - `cond` and `above_cutoff` values which determine which values of the matrix - inversion lie below threshold and are so set to zero. Changes in scipy 1.3 - resulted in a smaller default threshold and thus slower convergence of - dependent algorithms in some cases (see Sklearn github issue #14055). - - Calculate a generalized inverse of a Hermitian or real symmetric matrix - using its eigenvalue decomposition and including all eigenvalues with - 'large' absolute value. - - Parameters - ---------- - a : (N, N) array_like - Real symmetric or complex hermetian matrix to be pseudo-inverted - cond, rcond : float or None - Cutoff for 'small' eigenvalues. - Singular values smaller than rcond * largest_eigenvalue are considered - zero. - - If None or -1, suitable machine precision is used. 
- lower : bool, optional - Whether the pertinent array data is taken from the lower or upper - triangle of a. (Default: lower) - return_rank : bool, optional - if True, return the effective rank of the matrix - check_finite : bool, optional - Whether to check that the input matrix contains only finite numbers. - Disabling may give a performance gain, but may result in problems - (crashes, non-termination) if the inputs do contain infinities or NaNs. - - Returns - ------- - B : (N, N) ndarray - The pseudo-inverse of matrix `a`. - rank : int - The effective rank of the matrix. Returned if return_rank == True - - Raises - ------ - LinAlgError - If eigenvalue does not converge - - Examples - -------- - >>> from scipy.linalg import pinvh - >>> a = np.random.randn(9, 6) - >>> a = np.dot(a, a.T) - >>> B = pinvh(a) - >>> np.allclose(a, np.dot(a, np.dot(B, a))) - True - >>> np.allclose(B, np.dot(B, np.dot(a, B))) - True - - """ - a = decomp._asarray_validated(a, check_finite=check_finite) - s, u = decomp.eigh(a, lower=lower, check_finite=False) - - if rcond is not None: - cond = rcond - if cond in [None, -1]: - t = u.dtype.char.lower() - factor = {'f': 1E3, 'd': 1E6} - cond = factor[t] * np.finfo(t).eps - - # For Hermitian matrices, singular values equal abs(eigenvalues) - above_cutoff = (abs(s) > cond * np.max(abs(s))) - psigma_diag = 1.0 / s[above_cutoff] - u = u[:, above_cutoff] - - B = np.dot(u * psigma_diag, np.conjugate(u).T) - - if return_rank: - return B, len(psigma_diag) - else: - return B diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index eb12a271695cd..c69ebc1ce4307 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -12,7 +12,7 @@ from ._base import LinearModel, _rescale_data from ..base import RegressorMixin from ..utils.extmath import fast_logdet -from ..utils.fixes import pinvh +from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight from ..utils.validation import _deprecate_positional_args @@ -554,27 +554,16 @@ def fit(self, X, y): self.scores_ = list() coef_old_ = None - # Compute sigma and mu (using Woodbury matrix identity) - def update_sigma(X, alpha_, lambda_, keep_lambda, n_samples): - sigma_ = pinvh(np.eye(n_samples) / alpha_ + - np.dot(X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1]), - X[:, keep_lambda].T)) - sigma_ = np.dot(sigma_, X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1])) - sigma_ = - np.dot(np.reshape(1. / lambda_[keep_lambda], [-1, 1]) * - X[:, keep_lambda].T, sigma_) - sigma_.flat[::(sigma_.shape[1] + 1)] += 1. 
/ lambda_[keep_lambda] - return sigma_ - def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): coef_[keep_lambda] = alpha_ * np.dot( sigma_, np.dot(X[:, keep_lambda].T, y)) return coef_ + update_sigma = (self._update_sigma if n_samples >= n_features + else self._update_sigma_woodbury) # Iterative procedure of ARDRegression for iter_ in range(self.n_iter): - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) # Update alpha and lambda @@ -606,9 +595,15 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): break coef_old_ = np.copy(coef_) - # update sigma and mu using updated parameters from the last iteration - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) - coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + if not keep_lambda.any(): + break + + if keep_lambda.any(): + # update sigma and mu using updated params from the last iteration + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + else: + sigma_ = np.array([]).reshape(0, 0) self.coef_ = coef_ self.alpha_ = alpha_ @@ -617,6 +612,34 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): self._set_intercept(X_offset_, y_offset_, X_scale_) return self + def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples < n_features and will invert + # a matrix of shape (n_samples, n_samples) making use of the + # woodbury formula: + # https://en.wikipedia.org/wiki/Woodbury_matrix_identity + n_samples = X.shape[0] + X_keep = X[:, keep_lambda] + inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) + sigma_ = pinvh( + np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T) + ) + sigma_ = np.dot(sigma_, X_keep * inv_lambda) + sigma_ = - np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) + sigma_[np.diag_indices(sigma_.shape[1])] += 1. / lambda_[keep_lambda] + return sigma_ + + def _update_sigma(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples >= n_features and will + # invert a matrix of shape (n_features, n_features) + X_keep = X[:, keep_lambda] + gram = np.dot(X_keep.T, X_keep) + eye = np.eye(gram.shape[0]) + sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram + sigma_ = pinvh(sigma_inv) + return sigma_ + def predict(self, X, return_std=False): """Predict using the linear model. diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index e1922a010514f..ff3ac13c2d7f6 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py @@ -7,6 +7,8 @@ import numpy as np from scipy.linalg import pinvh +import pytest + from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -159,7 +161,7 @@ def test_std_bayesian_ridge_ard_with_constant_input(): # Test BayesianRidge and ARDRegression standard dev. for edge case of # constant target vector # The standard dev. 
should be relatively small (< 0.01 is tested here) - n_samples = 4 + n_samples = 10 n_features = 5 random_state = check_random_state(42) constant_value = random_state.rand() @@ -181,9 +183,9 @@ def test_update_of_sigma_in_ard(): y = np.array([0, 0]) clf = ARDRegression(n_iter=1) clf.fit(X, y) - # With the inputs above, ARDRegression prunes one of the two coefficients - # in the first iteration. Hence, the expected shape of `sigma_` is (1, 1). - assert clf.sigma_.shape == (1, 1) + # With the inputs above, ARDRegression prunes both of the two coefficients + # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0). + assert clf.sigma_.shape == (0, 0) # Ensure that no error is thrown at prediction stage clf.predict(X, return_std=True) @@ -200,22 +202,19 @@ def test_toy_ard_object(): assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2) -def test_ard_accuracy_on_easy_problem(): +@pytest.mark.parametrize('seed', range(100)) +@pytest.mark.parametrize('n_samples, n_features', ((10, 100), (100, 10))) +def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features): # Check that ARD converges with reasonable accuracy on an easy problem # (Github issue #14055) - # This particular seed seems to converge poorly in the failure-case - # (scipy==1.3.0, sklearn==0.21.2) - seed = 45 X = np.random.RandomState(seed=seed).normal(size=(250, 3)) y = X[:, 1] - regressor = ARDRegression(n_iter=600) + regressor = ARDRegression() regressor.fit(X, y) abs_coef_error = np.abs(1 - regressor.coef_[1]) - # Expect an accuracy of better than 1E-4 in most cases - - # Failure-case produces 0.16! - assert abs_coef_error < 0.01 + assert abs_coef_error < 1e-10 def test_return_std(): @@ -248,3 +247,28 @@ def f_noise(X, noise_mult): m2.fit(X, y) y_mean2, y_std2 = m2.predict(X_test, return_std=True) assert_array_almost_equal(y_std2, noise_mult, decimal=decimal) + + +@pytest.mark.parametrize('seed', range(10)) +def test_update_sigma(seed): + # make sure the two update_sigma() helpers are equivalent. The woodbury + # formula is used when n_samples < n_features, and the other one is used + # otherwise. + + rng = np.random.RandomState(seed) + + # set n_samples == n_features to avoid instability issues when inverting + # the matrices. Using the woodbury formula would be unstable when + # n_samples > n_features + n_samples = n_features = 10 + X = rng.randn(n_samples, n_features) + alpha = 1 + lmbda = np.arange(1, n_features + 1) + keep_lambda = np.array([True] * n_features) + + reg = ARDRegression() + + sigma = reg._update_sigma(X, alpha, lmbda, keep_lambda) + sigma_woodbury = reg._update_sigma_woodbury(X, alpha, lmbda, keep_lambda) + + np.testing.assert_allclose(sigma, sigma_woodbury) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 622c102fbbd0b..6635c7345aec5 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -42,14 +42,6 @@ def _parse_version(version_string): # mypy error: Name 'lobpcg' already defined (possibly by an import) from ..externals._lobpcg import lobpcg # type: ignore # noqa -if sp_version >= (1, 3): - # Preserves earlier default choice of pinvh cutoff `cond` value. - # Can be removed once issue #14055 is fully addressed. 
- from ..externals._scipy_linalg import pinvh -else: - # mypy error: Name 'pinvh' already defined (possibly by an import) - from scipy.linalg import pinvh # type: ignore # noqa - def _object_dtype_isnan(X): return X != X From a93b15f19bf7db8027e04705603161c336fb1454 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 23 Apr 2020 16:27:35 +0200 Subject: [PATCH 043/125] ENH Poisson loss for HistGradientBoostingRegressor (#16692) --- doc/modules/ensemble.rst | 5 +- doc/whats_new/v0.23.rst | 5 ++ .../_hist_gradient_boosting/_loss.pyx | 33 +++++++++++- .../gradient_boosting.py | 23 +++++--- .../ensemble/_hist_gradient_boosting/loss.py | 54 ++++++++++++++++++- .../tests/test_gradient_boosting.py | 42 +++++++++++++++ .../tests/test_loss.py | 39 ++++++++++++-- 7 files changed, 184 insertions(+), 17 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 1416b9d3a6045..e731ece0bdb20 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -952,8 +952,9 @@ controls the number of iterations of the boosting process:: >>> clf.score(X_test, y_test) 0.8965 -Available losses for regression are 'least_squares' and -'least_absolute_deviation', which is less sensitive to outliers. For +Available losses for regression are 'least_squares', +'least_absolute_deviation', which is less sensitive to outliers, and +'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and 'categorical_crossentropy' is used for multiclass classification. By default the loss is 'auto' and will select the appropriate loss depending on diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6719e511a583d..844cdf0360f73 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,6 +215,11 @@ Changelog to obtain the input to the meta estimator. :pr:`16539` by :user:`Bill DeRose `. +- |Feature| Added additional option `loss="poisson"` to + :class:`ensemble.HistGradientBoostingRegressor`, which adds Poisson deviance + with log-link useful for modeling count data. + :pr:`16692` by :user:`Christian Lorentzen ` + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 821a81a48fcf3..64480911439e5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -10,7 +10,7 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from libc.math cimport exp +from libc.math cimport exp, log from .common cimport Y_DTYPE_C from .common cimport G_H_DTYPE_C @@ -27,7 +27,7 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - # Note: a more correct exp is 2 * (raw_predictions - y_true) + # Note: a more correct expression is 2 * (raw_predictions - y_true) # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. 
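# Concretely: for the half squared error l(raw) = (y_true - raw)**2 / 2 the
# exact gradient is (raw - y_true) and the hessian is 1. The full loss would
# give 2 * (raw - y_true) and 2; that common factor cancels in the leaf value
# -sum(gradients) / (sum(hessians) + l2_regularization) whenever the
# regularization is zero, and otherwise only rescales its effect.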
gradients[i] = raw_predictions[i] - y_true[i] @@ -87,6 +87,35 @@ def _update_gradients_least_absolute_deviation( gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 +def _update_gradients_hessians_poisson( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + Y_DTYPE_C y_pred + + n_samples = raw_predictions.shape[0] + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. + y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) + hessians[i] = y_pred + else: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. + y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) * sample_weight[i] + hessians[i] = y_pred * sample_weight[i] + + def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6087adb0b6575..8287cda367a10 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -758,11 +758,13 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'least_squares', 'least_absolute_deviation'}, \ + loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \ optional (default='least_squares') The loss function to use in the boosting process. Note that the - "least squares" loss actually implements an "half least squares loss" - to simplify the computation of the gradient. + "least squares" and "poisson" losses actually implement + "half least squares loss" and "half poisson deviance" to simplify the + computation of the gradient. Furthermore, "poisson" loss internally + uses a log-link and requires ``y >= 0`` learning_rate : float, optional (default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -868,7 +870,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('least_squares', 'least_absolute_deviation') + _VALID_LOSSES = ('least_squares', 'least_absolute_deviation', + 'poisson') @_deprecate_positional_args def __init__(self, loss='least_squares', *, learning_rate=0.1, @@ -902,14 +905,20 @@ def predict(self, X): y : ndarray, shape (n_samples,) The predicted values. 
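Because of the log-link, ``predict`` returns values on the original (count)
scale and is always positive. A short usage sketch of the new loss on a
synthetic, illustrative dataset (the experimental import is still required in
this release)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 3))
    y = rng.poisson(lam=np.exp(X @ np.array([0.5, -0.2, 1.0])))  # counts, y >= 0

    reg = HistGradientBoostingRegressor(loss='poisson', random_state=0)
    reg.fit(X, y)
    reg.predict(X[:5])   # positive predictions thanks to exp(raw_predictions)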
""" - # Return raw predictions after converting shape - # (n_samples, 1) to (n_samples,) - return self._raw_predict(X).ravel() + check_is_fitted(self) + # Return inverse link of raw predictions after converting + # shape (n_samples, 1) to (n_samples,) + return self.loss_.inverse_link_function(self._raw_predict(X).ravel()) def _encode_y(self, y): # Just convert y to the expected dtype self.n_trees_per_iteration_ = 1 y = y.astype(Y_DTYPE, copy=False) + if self.loss == 'poisson': + # Ensure y >= 0 and sum(y) > 0 + if not (np.all(y >= 0) and np.sum(y) > 0): + raise ValueError("loss='poisson' requires non-negative y and " + "sum(y) > 0.") return y def _get_loss(self, sample_weight): diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c7884a25a9c41..f256408bf01fb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod import numpy as np -from scipy.special import expit, logsumexp +from scipy.special import expit, logsumexp, xlogy from .common import Y_DTYPE from .common import G_H_DTYPE @@ -19,11 +19,13 @@ from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ._loss import _update_gradients_hessians_poisson from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): self.hessians_are_constant = hessians_are_constant @@ -153,6 +155,7 @@ class LeastSquares(BaseLoss): the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -195,6 +198,7 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -265,6 +269,51 @@ def update_leaves_values(self, grower, y_true, raw_predictions, # Note that the regularization is ignored here +class Poisson(BaseLoss): + """Poisson deviance loss with log-link, for regression. + + For a given sample x_i, Poisson deviance loss is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i)) + - y_true_i + exp(raw_pred_i)) + + This actually computes half the Poisson deviance to simplify + the computation of the gradients. + """ + + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + + inverse_link_function = staticmethod(np.exp) + + def pointwise_loss(self, y_true, raw_predictions): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + # TODO: For speed, we could remove the constant xlogy(y_true, y_true) + # Advantage of this form: minimum of zero at raw_predictions = y_true. 
+ loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1) + + np.exp(raw_predictions)) + return loss + + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + y_pred = np.average(y_train, weights=sample_weight) + eps = np.finfo(y_train.dtype).eps + y_pred = np.clip(y_pred, eps, None) + return np.log(y_pred) + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions, sample_weight): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + gradients = gradients.reshape(-1) + hessians = hessians.reshape(-1) + _update_gradients_hessians_poisson(gradients, hessians, + y_true, raw_predictions, + sample_weight) + + class BinaryCrossEntropy(BaseLoss): """Binary cross-entropy loss, for binary classification. @@ -372,5 +421,6 @@ def predict_proba(self, raw_predictions): 'least_squares': LeastSquares, 'least_absolute_deviation': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, - 'categorical_crossentropy': CategoricalCrossEntropy + 'categorical_crossentropy': CategoricalCrossEntropy, + 'poisson': Poisson, } diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6fc412942d180..dfed16dafca39 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -2,10 +2,13 @@ import pytest from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_low_rank_matrix from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import make_pipeline +from sklearn.metrics import mean_poisson_deviance +from sklearn.dummy import DummyRegressor # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -194,6 +197,45 @@ def test_least_absolute_deviation(): assert gbdt.score(X, y) > .9 +@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])]) +def test_poisson_y_positive(y): + # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0. + err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0." + gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_poisson(): + # For Poisson distributed target, Poisson loss should give better results + # than least squares measured in Poisson deviance as metric. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 100 + X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features, + random_state=rng) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. 
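One detail worth noting for the comparison below: ``mean_poisson_deviance`` accepts zeros in ``y_true`` but requires strictly positive predictions, which is why the least-squares predictions are clipped further down in this test. A hedged illustration::

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 3.0])   # zeros in the target are fine
    y_pred = np.array([0.1, 1.2, 2.5])   # predictions must be strictly positive
    mean_poisson_deviance(y_true, y_pred)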
+ coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, + random_state=rng) + gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng) + gbdt_ls = HistGradientBoostingRegressor(loss='least_squares', + random_state=rng) + gbdt_pois.fit(X_train, y_train) + gbdt_ls.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) + # least_squares might produce non-positive predictions => clip + metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, + None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + assert metric_pois < metric_ls + assert metric_pois < metric_dummy + + def test_binning_train_validation_are_separated(): # Make sure training and validation data are binned separately. # See issue 13926 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 915dc300e4760..7fc6ab9097873 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -52,6 +52,9 @@ def get_hessians(y_true, raw_predictions): # ('binary_crossentropy', 0.3, 0), ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), + ('poisson', 12., 1.), + ('poisson', 0., 2.), + ('poisson', -22., 10.), ]) @pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') @@ -76,10 +79,11 @@ def fprime(x): def fprime2(x): return get_hessians(y_true, x) - optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) + optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2, + maxiter=70, tol=2e-8) assert np.allclose(loss.inverse_link_function(optimum), y_true) assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) - assert np.allclose(get_gradients(y_true, optimum), 0) + assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7) @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ @@ -87,6 +91,7 @@ def fprime2(x): ('least_absolute_deviation', 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), + ('poisson', 0, 1), ]) @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Need 64 bits float precision for numerical checks') @@ -100,6 +105,8 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): n_samples = 100 if loss in ('least_squares', 'least_absolute_deviation'): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif loss in ('poisson'): + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal( @@ -114,7 +121,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): # Approximate gradients # For multiclass loss, we should only change the predictions of one tree - # (here the first), hence the use of offset[:, 0] += eps + # (here the first), hence the use of offset[0, :] += eps # As a softmax is computed, offsetting the whole array by a constant would # have no effect on the probabilities, and thus on the loss eps = 1e-9 @@ -164,6 +171,27 @@ def test_baseline_least_absolute_deviation(): assert baseline_prediction == pytest.approx(np.median(y_train)) +def test_baseline_poisson(): + rng = np.random.RandomState(0) + + loss = 
_LOSSES['poisson'](sample_weight=None) + y_train = rng.poisson(size=100).astype(np.float64) + # Sanity check, make sure at least one sample is non-zero so we don't take + # log(0) + assert y_train.sum() > 0 + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert np.isscalar(baseline_prediction) + assert baseline_prediction.dtype == y_train.dtype + assert_all_finite(baseline_prediction) + # Make sure baseline prediction produces the log of the mean of all targets + assert_almost_equal(np.log(y_train.mean()), baseline_prediction) + + # Test baseline for y_true = 0 + y_train.fill(0.) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert_all_finite(baseline_prediction) + + def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) @@ -215,7 +243,8 @@ def test_baseline_categorical_crossentropy(): ('least_squares', 'regression'), ('least_absolute_deviation', 'regression'), ('binary_crossentropy', 'classification'), - ('categorical_crossentropy', 'classification') + ('categorical_crossentropy', 'classification'), + ('poisson', 'poisson_regression'), ]) @pytest.mark.parametrize('sample_weight', ['ones', 'random']) def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): @@ -232,6 +261,8 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): if problem == 'regression': y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif problem == 'poisson_regression': + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) From 36934921c5c6a39eb09ee80493483ff0520146d8 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 18:06:36 +0200 Subject: [PATCH 044/125] DOC Add link and fix typo in nearest neighbours estimators (#17017) --- sklearn/neighbors/_classification.py | 8 ++++---- sklearn/neighbors/_regression.py | 4 ++-- sklearn/neighbors/_unsupervised.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 0580b710afd44..e223476d3107b 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -71,10 +71,10 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None @@ -303,10 +303,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. 
X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. outlier_label : {manual label, 'most_frequent'}, default=None diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index a00d83c98102b..cce218062a3d5 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -253,10 +253,10 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 6faafeee9ffcd..923a465b1d31b 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -43,10 +43,10 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. p : int, default=2 From 2592edddb6eb326a65311f081e668d93ab044703 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 12:12:07 -0400 Subject: [PATCH 045/125] MNT Fixes DataConversionWarning doctest in pypy (#16965) --- sklearn/exceptions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 0083632418c8b..1b71050813d2b 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -98,10 +98,9 @@ class DataConversionWarning(UserWarning): ... Y = validation.column_or_1d(Y,warn=True) ... except ValueError: ... pass - ... print(repr(w[-1].message)) - DataConversionWarning('A column-vector y was passed when a - 1d array was expected. Please change the shape of y to - (n_samples, ), for example using ravel().') + ... print(w[-1].message) + A column-vector y was passed when a 1d array was expected. Please change + the shape of y to (n_samples, ), for example using ravel(). .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
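Relating to the ``sparse graph`` wording introduced in the neighbors patch above: with ``metric='precomputed'`` an estimator can be given either a square distance matrix or a precomputed sparse graph whose stored entries are the only candidate neighbors. A hedged sketch::

    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 3))
    y = (X[:, 0] > 0).astype(int)

    # square, precomputed distance matrix
    D = pairwise_distances(X)
    clf = KNeighborsClassifier(n_neighbors=3, metric='precomputed').fit(D, y)
    clf.predict(D[:2])  # rows hold distances from the query points to the training points

    # sparse graph: only the stored ("nonzero") entries are candidate neighbors
    G = kneighbors_graph(X, n_neighbors=5, mode='distance')
    clf_sparse = KNeighborsClassifier(n_neighbors=3, metric='precomputed').fit(G, y)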
From e54cd3c0617e3485baa19e2c69332da55b363636 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 13:10:49 -0400 Subject: [PATCH 046/125] API Deprecate positional arguments in preprocessing (#16996) * API Deprecate positional arguments in preprocessing * CLN Address comments * CLN Uses classes as a keyword only argument * BUG Fix --- sklearn/calibration.py | 4 +- sklearn/metrics/_ranking.py | 4 +- sklearn/metrics/tests/test_classification.py | 2 +- sklearn/preprocessing/_data.py | 54 ++++++++++++------- sklearn/preprocessing/_discretization.py | 4 +- sklearn/preprocessing/_encoders.py | 7 ++- .../preprocessing/_function_transformer.py | 5 +- sklearn/preprocessing/_label.py | 13 +++-- sklearn/preprocessing/tests/test_data.py | 10 ++-- sklearn/preprocessing/tests/test_label.py | 10 ++-- 10 files changed, 71 insertions(+), 42 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 8a719d49bd6de..31df362ddb009 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -329,7 +329,7 @@ def fit(self, X, y, sample_weight=None): self.label_encoder_.fit(self.classes) self.classes_ = self.label_encoder_.classes_ - Y = label_binarize(y, self.classes_) + Y = label_binarize(y, classes=self.classes_) df, idx_pos_class = self._preproc(X) self.calibrators_ = [] @@ -574,7 +574,7 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, if len(labels) > 2: raise ValueError("Only binary classification is supported. " "Provided labels %s." % labels) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] if strategy == 'quantile': # Determine bin edges by distribution of data quantiles = np.linspace(0, 1, n_bins + 1) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e525539c0d706..ad867efb8bfa3 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -383,7 +383,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, multi_class, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] return _average_binary_score(partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, @@ -489,7 +489,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, y_score, average=average) else: # ovr is same as multi-label - y_true_multilabel = label_binarize(y_true, classes) + y_true_multilabel = label_binarize(y_true, classes=classes) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index be6364e63b2cd..ca56e79299adb 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -654,7 +654,7 @@ def test_matthews_corrcoef(): y_true_inv = ["b" if i == "a" else "a" for i in y_true] assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1) - y_true_inv2 = label_binarize(y_true, ["a", "b"]) + y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) y_true_inv2 = np.where(y_true_inv2, 'a', 'b') assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c95351db9d985..f9af3dbac6d0d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -29,7 +29,7 @@ mean_variance_axis, 
incr_mean_variance_axis, min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, - FLOAT_DTYPES) + FLOAT_DTYPES, _deprecate_positional_args) from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -78,7 +78,8 @@ def _handle_zeros_in_scale(scale, copy=True): return scale -def scale(X, axis=0, with_mean=True, with_std=True, copy=True): +@_deprecate_positional_args +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis Center to the mean and component wise scale to unit variance. @@ -291,7 +292,8 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): `. """ - def __init__(self, feature_range=(0, 1), copy=True): + @_deprecate_positional_args + def __init__(self, feature_range=(0, 1), *, copy=True): self.feature_range = feature_range self.copy = copy @@ -435,7 +437,8 @@ def _more_tags(self): return {'allow_nan': True} -def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): +@_deprecate_positional_args +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -626,7 +629,8 @@ class StandardScaler(TransformerMixin, BaseEstimator): `. """ # noqa - def __init__(self, copy=True, with_mean=True, with_std=True): + @_deprecate_positional_args + def __init__(self, *, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std self.copy = copy @@ -908,7 +912,8 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): `. """ - def __init__(self, copy=True): + @_deprecate_positional_args + def __init__(self, *, copy=True): self.copy = copy def _reset(self): @@ -1024,7 +1029,8 @@ def _more_tags(self): return {'allow_nan': True} -def maxabs_scale(X, axis=0, copy=True): +@_deprecate_positional_args +def maxabs_scale(X, *, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. This estimator scales each feature individually such @@ -1172,8 +1178,8 @@ class RobustScaler(TransformerMixin, BaseEstimator): https://en.wikipedia.org/wiki/Median https://en.wikipedia.org/wiki/Interquartile_range """ - - def __init__(self, with_centering=True, with_scaling=True, + @_deprecate_positional_args + def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): self.with_centering = with_centering self.with_scaling = with_scaling @@ -1282,7 +1288,8 @@ def _more_tags(self): return {'allow_nan': True} -def robust_scale(X, axis=0, with_centering=True, with_scaling=True, +@_deprecate_positional_args +def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): """Standardize a dataset along any axis @@ -1433,7 +1440,8 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` """ - def __init__(self, degree=2, interaction_only=False, include_bias=True, + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): self.degree = degree self.interaction_only = interaction_only @@ -1638,7 +1646,8 @@ def transform(self, X): return XP -def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): +@_deprecate_positional_args +def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). Read more in the :ref:`User Guide `. 
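The practical effect of ``_deprecate_positional_args`` on these preprocessing helpers, as a hedged sketch (the warning itself is emitted by the decorator and is not shown in this diff)::

    import numpy as np
    from sklearn.preprocessing import minmax_scale, StandardScaler

    X = np.arange(6, dtype=float).reshape(3, 2)

    # preferred style: everything after the data is passed by keyword
    minmax_scale(X, feature_range=(0, 1), axis=0)
    StandardScaler(with_mean=True, with_std=True).fit_transform(X)

    # passing the now keyword-only arguments positionally, e.g.
    # minmax_scale(X, (0, 1), 0), triggers a deprecation warning during the
    # transition period and is slated to become an error later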
@@ -1797,7 +1806,8 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize: Equivalent function without the estimator API. """ - def __init__(self, norm='l2', copy=True): + @_deprecate_positional_args + def __init__(self, norm='l2', *, copy=True): self.norm = norm self.copy = copy @@ -1833,7 +1843,8 @@ def _more_tags(self): return {'stateless': True} -def binarize(X, threshold=0.0, copy=True): +@_deprecate_positional_args +def binarize(X, *, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix Read more in the :ref:`User Guide `. @@ -1931,7 +1942,8 @@ class Binarizer(TransformerMixin, BaseEstimator): binarize: Equivalent function without the estimator API. """ - def __init__(self, threshold=0.0, copy=True): + @_deprecate_positional_args + def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy @@ -2228,7 +2240,8 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): `. """ - def __init__(self, n_quantiles=1000, output_distribution='uniform', + @_deprecate_positional_args + def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, copy=True): self.n_quantiles = n_quantiles @@ -2560,7 +2573,8 @@ def _more_tags(self): return {'allow_nan': True} -def quantile_transform(X, axis=0, n_quantiles=1000, +@_deprecate_positional_args +def quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), @@ -2764,7 +2778,8 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). """ - def __init__(self, method='yeo-johnson', standardize=True, copy=True): + @_deprecate_positional_args + def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): self.method = method self.standardize = standardize self.copy = copy @@ -3034,7 +3049,8 @@ def _more_tags(self): return {'allow_nan': True} -def power_transform(X, method='yeo-johnson', standardize=True, copy=True): +@_deprecate_positional_args +def power_transform(X, method='yeo-johnson', *, standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. 
This is useful for diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 67641601e06f5..581765a81361e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -16,6 +16,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -115,7 +116,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [ 0.5, 3.5, -1.5, 1.5]]) """ - def __init__(self, n_bins=5, encode='onehot', strategy='quantile'): + @_deprecate_positional_args + def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'): self.n_bins = n_bins self.encode = encode self.strategy = strategy diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index aa3d8d9dabca8..c8f8ba6781400 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,6 +8,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._label import _encode, _encode_check_unknown @@ -292,7 +293,8 @@ class OneHotEncoder(_BaseEncoder): [1., 0., 1., 0.]]) """ - def __init__(self, categories='auto', drop=None, sparse=True, + @_deprecate_positional_args + def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'): self.categories = categories self.sparse = sparse @@ -653,7 +655,8 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) """ - def __init__(self, categories='auto', dtype=np.float64): + @_deprecate_positional_args + def __init__(self, *, categories='auto', dtype=np.float64): self.categories = categories self.dtype = dtype diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 9cf365ebb3cdf..21dd40365f5a0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -2,6 +2,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import _allclose_dense_sparse +from ..utils.validation import _deprecate_positional_args def _identity(X): @@ -78,7 +79,9 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): array([[0. , 0.6931...], [1.0986..., 1.3862...]]) """ - def __init__(self, func=None, inverse_func=None, validate=False, + + @_deprecate_positional_args + def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None): self.func = func diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 43b6ac642284c..88fad3670cb01 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -21,6 +21,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target @@ -396,7 +397,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. 
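With ``classes`` now keyword-only in ``label_binarize`` (matching the updated call sites in calibration and metrics earlier in this patch), a call looks like this hedged sketch::

    from sklearn.preprocessing import label_binarize

    label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    # -> array([[1], [0], [0], [1]]): one column, encoding the positive class 'yes'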
""" - def __init__(self, neg_label=0, pos_label=1, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) @@ -483,7 +485,7 @@ def transform(self, y): raise ValueError("The object was not fitted with multilabel" " input.") - return label_binarize(y, self.classes_, + return label_binarize(y, classes=self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output) @@ -541,7 +543,9 @@ def _more_tags(self): return {'X_types': ['1dlabels']} -def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): +@_deprecate_positional_args +def label_binarize(y, *, classes, neg_label=0, pos_label=1, + sparse_output=False): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are @@ -851,7 +855,8 @@ class MultiLabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. """ - def __init__(self, classes=None, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7999df083631c..f79703610bee5 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2295,7 +2295,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): - power_transform(X_with_negatives, 'box-cox') + power_transform(X_with_negatives, method='box-cox') with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) @@ -2304,7 +2304,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): - power_transform(np.zeros(X_2d.shape), 'box-cox') + power_transform(np.zeros(X_2d.shape), method='box-cox') @pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), @@ -2432,7 +2432,7 @@ def test_power_transformer_fit_transform(method, standardize): if method == 'box-cox': X = np.abs(X) - pt = PowerTransformer(method, standardize) + pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @@ -2449,7 +2449,7 @@ def test_power_transformer_copy_True(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=True) + pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) @@ -2477,7 +2477,7 @@ def test_power_transformer_copy_False(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=False) + pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 887fa90c98d61..505c57cb5f1c1 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -178,7 +178,7 @@ def test_label_binarizer_errors(): with 
pytest.raises(ValueError): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) with pytest.raises(ValueError): - label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3]) + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) @pytest.mark.parametrize( @@ -509,13 +509,13 @@ def check_binarized_results(y, classes, pos_label, neg_label, expected): for sparse_output in [True, False]: if ((pos_label == 0 or neg_label != 0) and sparse_output): with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=neg_label, + label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) continue # check label_binarize - binarized = label_binarize(y, classes, neg_label=neg_label, + binarized = label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) assert_array_equal(toarray(binarized), expected) @@ -576,7 +576,7 @@ def test_label_binarize_multiclass(): check_binarized_results(y, classes, pos_label, neg_label, expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) @@ -595,7 +595,7 @@ def test_label_binarize_multilabel(): expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) From 923b13ceda1d1d26a1013d0e326734a1dc58bd46 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 13:11:45 -0400 Subject: [PATCH 047/125] API Adds defaults to Display Objects (#16933) --- doc/conf.py | 1 + doc/visualizations.rst | 1 + examples/plot_display_object_visualization.py | 92 +++++++++++++++++++ sklearn/metrics/_plot/confusion_matrix.py | 16 +++- .../metrics/_plot/precision_recall_curve.py | 29 +++--- sklearn/metrics/_plot/roc_curve.py | 25 +++-- .../_plot/tests/test_plot_confusion_matrix.py | 15 +++ .../_plot/tests/test_plot_precision_recall.py | 20 ++++ .../_plot/tests/test_plot_roc_curve.py | 18 ++++ 9 files changed, 192 insertions(+), 25 deletions(-) create mode 100644 examples/plot_display_object_visualization.py diff --git a/doc/conf.py b/doc/conf.py index c3ab17d3e73af..a13ed14216de4 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -314,6 +314,7 @@ def __call__(self, directory): }, # avoid generating too many cross links 'inspect_global_variables': False, + 'remove_config_comments': True, } diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 47d826602b62f..e50a9a90a0b84 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -60,6 +60,7 @@ values of the curves. * :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` * :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_plot_display_object_visualization.py` Available Plotting Utilities ============================ diff --git a/examples/plot_display_object_visualization.py b/examples/plot_display_object_visualization.py new file mode 100644 index 0000000000000..32ea3ef2d1120 --- /dev/null +++ b/examples/plot_display_object_visualization.py @@ -0,0 +1,92 @@ +""" +=================================== +Visualizations with Display Objects +=================================== + +.. 
currentmodule:: sklearn.metrics + +In this example, we will construct display objects, +:class:`ConfusionMatrixDisplay`, :class:`RocCurveDisplay`, and +:class:`PrecisionRecallDisplay` directly from their respective metrics. This +is an alternative to using their corresponding plot functions when +a model's predictions are already computed or expensive to compute. Note that +this is advanced usage, and in general we recommend using their respective +plot functions. +""" +print(__doc__) + +############################################################################## +# Load Data and train model +# ------------------------- +# For this example, we load a blood transfusion service center data set from +# `OpenML `. This is a binary classification +# problem where the target is whether an individual donated blood. Then the +# data is split into a train and test dataset and a logistic regression is +# fitted wtih the train dataset. +from sklearn.datasets import fetch_openml +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +X, y = fetch_openml(data_id=1464, return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) + +clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0)) +clf.fit(X_train, y_train) + +############################################################################## +# Create :class:`ConfusionMatrixDisplay` +############################################################################## +# With the fitted model, we compute the predictions of the model on the test +# dataset. These predictions are used to compute the confustion matrix which +# is plotted with the :class:`ConfusionMatrixDisplay` +from sklearn.metrics import confusion_matrix +from sklearn.metrics import ConfusionMatrixDisplay + +y_pred = clf.predict(X_test) +cm = confusion_matrix(y_test, y_pred) + +cm_display = ConfusionMatrixDisplay(cm).plot() + + +############################################################################## +# Create :class:`RocCurveDisplay` +############################################################################## +# The roc curve requires either the probabilities or the non-thresholded +# decision values from the estimator. Since the logistic regression provides +# a decision function, we will use it to plot the roc curve: +from sklearn.metrics import roc_curve +from sklearn.metrics import RocCurveDisplay +y_score = clf.decision_function(X_test) + +fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1]) +roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot() + +############################################################################## +# Create :class:`PrecisionRecallDisplay` +############################################################################## +# Similarly, the precision recall curve can be plotted using `y_score` from +# the prevision sections. 
+from sklearn.metrics import precision_recall_curve +from sklearn.metrics import PrecisionRecallDisplay + +prec, recall, _ = precision_recall_curve(y_test, y_score, + pos_label=clf.classes_[1]) +pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot() + +############################################################################## +# Combining the display objects into a single plot +############################################################################## +# The display objects store the computed values that were passed as arguments. +# This allows for the visualizations to be easliy combined using matplotlib's +# API. In the following example, we place the displays next to each other in a +# row. + +# sphinx_gallery_thumbnail_number = 4 +import matplotlib.pyplot as plt +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) + +roc_display.plot(ax=ax1) +pr_display.plot(ax=ax2) +plt.show() diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 8916b523fc273..861a2558a3cef 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -21,8 +21,9 @@ class ConfusionMatrixDisplay: confusion_matrix : ndarray of shape (n_classes, n_classes) Confusion matrix. - display_labels : ndarray of shape (n_classes,) - Display labels for plot. + display_labels : ndarray of shape (n_classes,), default=None + Display labels for plot. If None, display labels are set from 0 to + `n_classes - 1`. Attributes ---------- @@ -39,7 +40,7 @@ class ConfusionMatrixDisplay: figure_ : matplotlib Figure Figure containing the confusion matrix. """ - def __init__(self, confusion_matrix, display_labels): + def __init__(self, confusion_matrix, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels @@ -108,11 +109,16 @@ def plot(self, include_values=True, cmap='viridis', ha="center", va="center", color=color) + if self.display_labels is None: + display_labels = np.arange(n_classes) + else: + display_labels = self.display_labels + fig.colorbar(self.im_, ax=ax) ax.set(xticks=np.arange(n_classes), yticks=np.arange(n_classes), - xticklabels=self.display_labels, - yticklabels=self.display_labels, + xticklabels=display_labels, + yticklabels=display_labels, ylabel="True label", xlabel="Predicted label") diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index bfec9276f83be..10dd14e938984 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -23,11 +23,11 @@ class PrecisionRecallDisplay: recall : ndarray Recall values. - average_precision : float - Average precision. + average_precision : float, default=None + Average precision. If None, the average precision is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, then the estimator name is not shown. Attributes ---------- @@ -41,7 +41,8 @@ class PrecisionRecallDisplay: Figure containing the curve. 
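Because ``average_precision`` and ``estimator_name`` become optional here (and ``roc_auc``/``estimator_name`` below for ``RocCurveDisplay``), a display can be built from the bare curve values; a hedged sketch that assumes matplotlib is available::

    import numpy as np
    from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

    precision = np.array([1.0, 0.6, 0.3])
    recall = np.array([0.0, 0.5, 1.0])
    PrecisionRecallDisplay(precision=precision, recall=recall).plot()

    fpr = tpr = np.array([0.0, 0.5, 1.0])
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot()  # with no AUC and no name, the legend is simply omitted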
""" - def __init__(self, precision, recall, average_precision, estimator_name): + def __init__(self, precision, recall, + average_precision=None, estimator_name=None): self.precision = precision self.recall = recall self.average_precision = average_precision @@ -78,16 +79,22 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - "label": "{} (AP = {:0.2f})".format(name, - self.average_precision), - "drawstyle": "steps-post" - } + line_kwargs = {"drawstyle": "steps-post"} + if self.average_precision is not None and name is not None: + line_kwargs["label"] = (f"{name} (AP = " + f"{self.average_precision:0.2f})") + elif self.average_precision is not None: + line_kwargs["label"] = (f"AP = " + f"{self.average_precision:0.2f}") + elif name is not None: + line_kwargs["label"] = name line_kwargs.update(**kwargs) self.line_, = ax.plot(self.recall, self.precision, **line_kwargs) ax.set(xlabel="Recall", ylabel="Precision") - ax.legend(loc='lower left') + + if "label" in line_kwargs: + ax.legend(loc='lower left') self.ax_ = ax self.figure_ = ax.figure diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index d786ac6659d41..0881646e8a1af 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -22,11 +22,11 @@ class RocCurveDisplay: tpr : ndarray True positive rate. - roc_auc : float - Area under ROC curve. + roc_auc : float, default=None + Area under ROC curve. If None, the roc_auc score is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. Attributes ---------- @@ -54,7 +54,7 @@ class RocCurveDisplay: >>> plt.show() # doctest: +SKIP """ - def __init__(self, fpr, tpr, roc_auc, estimator_name): + def __init__(self, fpr, tpr, roc_auc=None, estimator_name=None): self.fpr = fpr self.tpr = tpr self.roc_auc = roc_auc @@ -88,15 +88,22 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - 'label': "{} (AUC = {:0.2f})".format(name, self.roc_auc) - } + line_kwargs = {} + if self.roc_auc is not None and name is not None: + line_kwargs["label"] = f"{name} (AUC = {self.roc_auc:0.2f})" + elif self.roc_auc is not None: + line_kwargs["label"] = f"AUC = {self.roc_auc:0.2f}" + elif name is not None: + line_kwargs["label"] = name + line_kwargs.update(**kwargs) self.line_ = ax.plot(self.fpr, self.tpr, **line_kwargs)[0] ax.set_xlabel("False Positive Rate") ax.set_ylabel("True Positive Rate") - ax.legend(loc='lower right') + + if "label" in line_kwargs: + ax.legend(loc='lower right') self.ax_ = ax self.figure_ = ax.figure diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index b8a24ae15f1e5..6a0b880ebabb1 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -280,3 +280,18 @@ def test_confusion_matrix_standard_format(pyplot): # Values are have two dec places max, (e.g 100 becomes 1e+02) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['0.1', '10', '1e+02', '0.53'] + + +@pytest.mark.parametrize("display_labels, expected_labels", [ + (None, ["0", "1"]), + (["cat", "dog"], ["cat", "dog"]), +]) +def test_default_labels(pyplot, display_labels, expected_labels): + cm = np.array([[10, 0], [12, 120]]) + disp = ConfusionMatrixDisplay(cm, 
display_labels=display_labels).plot() + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(x_ticks, expected_labels) + assert_array_equal(y_ticks, expected_labels) diff --git a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py index f22b112e96dc7..48305a93d0b3f 100644 --- a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py +++ b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py @@ -4,6 +4,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import plot_precision_recall_curve +from sklearn.metrics import PrecisionRecallDisplay from sklearn.metrics import average_precision_score from sklearn.metrics import precision_recall_curve from sklearn.datasets import make_classification @@ -170,3 +171,22 @@ def test_plot_precision_recall_curve_estimator_name_multiple_calls(pyplot): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "average_precision, estimator_name, expected_label", + [ + (0.9, None, "AP = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AP = 0.80)"), + ] +) +def test_default_labels(pyplot, average_precision, estimator_name, + expected_label): + prec = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + disp = PrecisionRecallDisplay(prec, recall, + average_precision=average_precision, + estimator_name=estimator_name) + disp.plot() + assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 699387ff4cfa3..1aa34bdca7279 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py +++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -4,6 +4,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import plot_roc_curve +from sklearn.metrics import RocCurveDisplay from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_curve, auc @@ -150,3 +151,20 @@ def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "roc_auc, estimator_name, expected_label", + [ + (0.9, None, "AUC = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AUC = 0.80)") + ] +) +def test_default_labels(pyplot, roc_auc, estimator_name, + expected_label): + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + disp = RocCurveDisplay(fpr, tpr, roc_auc=roc_auc, + estimator_name=estimator_name).plot() + assert disp.line_.get_label() == expected_label From 7844d1c2d78ed746f96e3acd38cf47e0d037b58d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 19:19:52 +0200 Subject: [PATCH 048/125] TST Replace boston in histgradboost test_predictor (#16918) --- .../tests/test_predictor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 7df1e616445fc..3c837844f29e3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -1,5 +1,5 @@ import numpy as np -from sklearn.datasets import 
load_boston +from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score import pytest @@ -12,8 +12,9 @@ @pytest.mark.parametrize('n_bins', [200, 256]) -def test_boston_dataset(n_bins): - X, y = load_boston(return_X_y=True) +def test_regression_dataset(n_bins): + X, y = make_regression(n_samples=500, n_features=10, n_informative=5, + random_state=42) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42) @@ -24,8 +25,8 @@ def test_boston_dataset(n_bins): gradients = -y_train.astype(G_H_DTYPE) hessians = np.ones(1, dtype=G_H_DTYPE) - min_samples_leaf = 8 - max_leaf_nodes = 31 + min_samples_leaf = 10 + max_leaf_nodes = 30 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, @@ -34,8 +35,8 @@ def test_boston_dataset(n_bins): predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) - assert r2_score(y_train, predictor.predict(X_train)) > 0.85 - assert r2_score(y_test, predictor.predict(X_test)) > 0.70 + assert r2_score(y_train, predictor.predict(X_train)) > 0.82 + assert r2_score(y_test, predictor.predict(X_test)) > 0.67 @pytest.mark.parametrize('threshold, expected_predictions', [ From 88ba943684477aec39fb7eeba0b0d5dfa51cb2c8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 24 Apr 2020 03:29:15 +1000 Subject: [PATCH 049/125] MNT Remove redundant doctest ELLIPSIS annotations (#16992) --- sklearn/metrics/_ranking.py | 20 ++++++++++---------- sklearn/svm/_classes.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index ad867efb8bfa3..a9e45310f330d 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -1239,22 +1239,22 @@ def dcg_score(y_true, y_score, k=None, >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict scores for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> dcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores) 9.49... >>> # we can set k to truncate the sum; only top k answers contribute - >>> dcg_score(true_relevance, scores, k=2) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=2) 5.63... >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average true >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 - >>> dcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=1) 7.5 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> dcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 5.0 """ @@ -1387,29 +1387,29 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict some scores (relevance) for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.69... >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.49... >>> # we can set k to truncate the sum; only top k answers contribute. 
- >>> ndcg_score(true_relevance, scores, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=4) 0.35... >>> # the normalization takes k into account so a perfect answer >>> # would still get 1.0 - >>> ndcg_score(true_relevance, true_relevance, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, true_relevance, k=4) 1.0 >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average (normalized) >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 - >>> ndcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=1) 0.75 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> ndcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 0.5 """ diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 77110da119a02..f8b30e070711e 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1251,7 +1251,7 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): >>> clf = OneClassSVM(gamma='auto').fit(X) >>> clf.predict(X) array([-1, 1, 1, 1, -1]) - >>> clf.score_samples(X) # doctest: +ELLIPSIS + >>> clf.score_samples(X) array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) """ From e392bfd6933ad1202bb269b404d18c74da01b19e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 23 Apr 2020 20:17:00 +0200 Subject: [PATCH 050/125] MNT Add pre-comit configuration (#16957) Co-Authored-By: Nicolas Hug --- .pre-commit-config.yaml | 22 ++++++++++++++++++++++ doc/developers/contributing.rst | 25 +++++++++++++++++-------- 2 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..aa8df3c3cbc87 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.8 + hooks: + - id: flake8 + types: [file, python] + # only check for unused imports for now, as long as + # the code is not fully PEP8 compatible + args: [--select=F401] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + args: + - --ignore-missing-imports + files: sklearn/ diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 99c59ec3392c6..33ab3fcecb887 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -248,19 +248,28 @@ modifying code and submitting a PR: and start making changes. Always use a feature branch. It's good practice to never work on the ``master`` branch! -9. Develop the feature on your feature branch on your computer, using Git to - do the version control. When you're done editing, add changed files using - ``git add`` and then ``git commit``:: +9. (**Optional**) Install `pre-commit `_ to + run code style checks before each commit:: - $ git add modified_files - $ git commit + $ pip install pre-commit + $ pre-commit install - to record your changes in Git, then push the changes to your GitHub - account with:: + pre-commit checks can be disabled for a particular commit with + `git commit -n`. + +10. 
Develop the feature on your feature branch on your computer, using Git to + do the version control. When you're done editing, add changed files using + ``git add`` and then ``git commit``:: + + $ git add modified_files + $ git commit + + to record your changes in Git, then push the changes to your GitHub + account with:: $ git push -u origin my_feature -10. Follow `these +11. Follow `these `_ instructions to create a pull request from your fork. This will send an email to the committers. You may want to consider sending an email to the From 94d8911310b7ec9cb6be2752d42b0cbd4c003c93 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 21:06:03 +0200 Subject: [PATCH 051/125] DOC Fix typo in ensemble.rst (#16999) --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index e731ece0bdb20..bff08f542ce11 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1437,7 +1437,7 @@ any other regressor or classifier, exposing a `predict`, `predict_proba`, and >>> print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred))) R2 score: 0.81 -Note that it is also possible to get the output of the stacked outputs of the +Note that it is also possible to get the output of the stacked `estimators` using the `transform` method:: >>> reg.transform(X_test[:5]) From facd1177bd7bae219cb34d1c39a4cafc6e6c1f3d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Fri, 24 Apr 2020 04:40:29 +0200 Subject: [PATCH 052/125] TST Replace boston dataset in test_permutation_importance.py (#17020) --- sklearn/inspection/tests/test_permutation_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index c13638b2fc0c7..2b381e9a20b1a 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -4,7 +4,7 @@ from numpy.testing import assert_allclose from sklearn.compose import ColumnTransformer -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.datasets import load_iris from sklearn.datasets import make_classification from sklearn.datasets import make_regression @@ -33,7 +33,7 @@ def test_permutation_importance_correlated_feature_regression(n_jobs): rng = np.random.RandomState(42) n_repeats = 5 - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) y_with_little_noise = ( y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) From 0d04de2c76cb3f63acd9e1927c19a0c2d6da0266 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 05:44:59 -0400 Subject: [PATCH 053/125] API kwonly for naive_bayes (#17003) * wkonly * pep8 --- sklearn/naive_bayes.py | 16 +++++++++++----- sklearn/tests/test_naive_bayes.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6ef3895ffdb60..247d9eea763c6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,6 +32,7 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight +from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB'] @@ -177,7 +178,8 @@ class labels known to the classifier [1] """ - 
def __init__(self, priors=None, var_smoothing=1e-9): + @_deprecate_positional_args + def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing @@ -745,7 +747,8 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior @@ -847,7 +850,8 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha self.fit_prior = fit_prior @@ -961,7 +965,8 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha self.binarize = binarize @@ -1072,7 +1077,8 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 1f0f9347a188c..1106684998f75 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -122,7 +122,7 @@ def test_gnb_priors_sum_isclose(): priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, 0.11, 0.0]) Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - clf = GaussianNB(priors) + clf = GaussianNB(priors=priors) # smoke test for issue #9633 clf.fit(X, Y) From 361bf75544f7a0315f4388dffcc3279cdde50bec Mon Sep 17 00:00:00 2001 From: Stephen Marsh Date: Fri, 24 Apr 2020 09:25:39 -0400 Subject: [PATCH 054/125] DOC Fix link to user guide (#16989) --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 180c48fc99762..ef8890556ed1d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1384,7 +1384,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, will also compute training scores and is merely a utility for plotting the results. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- From 2587a033808506cd1e46b0cba18fae5ae829f4ab Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 10:28:56 -0400 Subject: [PATCH 055/125] [MRG] DOC fix ref for ParameterSampler (#16983) --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d283dc2f0b483..80cb269d4a0c7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -189,7 +189,7 @@ class ParameterSampler: It is highly recommended to use continuous distributions for continuous parameters. - Read more in the :ref:`User Guide `. 
+ Read more in the :ref:`User Guide `. Parameters ---------- From 89993d2f57f668d0fba7995e764ff656710cb67f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 24 Apr 2020 18:11:37 +0200 Subject: [PATCH 056/125] Update the URL of valgrind-python.supp in the doc (#17029) --- doc/developers/tips.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index b26d68ecfbe02..4c11c24684352 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -248,8 +248,8 @@ code. Follow these steps: $> valgrind -v --suppressions=valgrind-python.supp python my_test_script.py .. _valgrind: http://valgrind.org -.. _`README.valgrind`: https://svn.python.org/projects/python/trunk/Misc/README.valgrind -.. _`valgrind-python.supp`: https://svn.python.org/projects/python/trunk/Misc/valgrind-python.supp +.. _`README.valgrind`: https://github.com/python/cpython/blob/master/Misc/README.valgrind +.. _`valgrind-python.supp`: https://github.com/python/cpython/blob/master/Misc/valgrind-python.supp The result will be a list of all the memory-related errors, which reference From 501ee56192154f3fee09f173faece092ae94dd46 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Fri, 24 Apr 2020 18:50:19 +0200 Subject: [PATCH 057/125] TST Replace Boston dataset in test_impute (#17025) --- sklearn/impute/tests/test_impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 58c71660b401d..960f671915e6a 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -16,7 +16,7 @@ # make IterativeImputer available from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor @@ -947,7 +947,7 @@ def test_iterative_imputer_early_stopping(): def test_iterative_imputer_catch_warning(): # check that we catch a RuntimeWarning due to a division by zero when a # feature is constant in the dataset - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) n_samples, n_features = X.shape # simulate that a feature only contain one category during fit From 2955d9f0af7785ed353d6ad23740f62852fcf988 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 14:22:09 -0400 Subject: [PATCH 058/125] API kwonly for neighbors module (#17004) --- sklearn/manifold/_locally_linear.py | 4 ++-- sklearn/manifold/_t_sne.py | 4 ++-- sklearn/neighbors/_classification.py | 7 ++++-- sklearn/neighbors/_graph.py | 23 +++++++++++-------- sklearn/neighbors/_kde.py | 4 +++- sklearn/neighbors/_lof.py | 4 +++- sklearn/neighbors/_nca.py | 4 +++- sklearn/neighbors/_nearest_centroid.py | 4 +++- sklearn/neighbors/_regression.py | 7 ++++-- sklearn/neighbors/_unsupervised.py | 6 +++-- sklearn/neighbors/tests/test_kde.py | 4 ++-- sklearn/neighbors/tests/test_neighbors.py | 8 +++---- sklearn/semi_supervised/_label_propagation.py | 2 +- 13 files changed, 51 insertions(+), 30 deletions(-) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7b46d51df718d..c2d1ffbae9361 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -97,7 +97,7 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): sklearn.neighbors.kneighbors_graph 
sklearn.neighbors.radius_neighbors_graph """ - knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X) + knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X) X = knn._fit_X n_samples = knn.n_samples_fit_ ind = knn.kneighbors(X, return_distance=False)[:, 1:] @@ -647,7 +647,7 @@ def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, self.n_jobs = n_jobs def _fit_transform(self, X): - self.nbrs_ = NearestNeighbors(self.n_neighbors, + self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 53558f6051283..136a32cd86e73 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -450,8 +450,8 @@ def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): np.fill_diagonal(dist_X, np.inf) ind_X = np.argsort(dist_X, axis=1) # `ind_X[i]` is the index of sorted distances between i and other samples - ind_X_embedded = NearestNeighbors(n_neighbors).fit(X_embedded).kneighbors( - return_distance=False) + ind_X_embedded = NearestNeighbors(n_neighbors=n_neighbors).fit( + X_embedded).kneighbors(return_distance=False) # We build an inverted index of neighbors in the input space: For sample i, # we define `inverted_index[i]` as the inverted index of sorted distances: diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index e223476d3107b..331eb7821a511 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -20,6 +20,7 @@ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, @@ -142,7 +143,8 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -374,7 +376,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index d217999196950..6bf8da3f4ef5e 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -9,7 +9,7 @@ from ._base import UnsupervisedMixin from ._unsupervised import NearestNeighbors from ..base import TransformerMixin -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args def _check_params(X, metric, p, metric_params): @@ -37,8 +37,10 @@ def _query_include_self(X, include_self, mode): return X -def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, n_jobs=None): +@_deprecate_positional_args +def kneighbors_graph(X, n_neighbors, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of k-Neighbors for points in X Read more in 
the :ref:`User Guide `. @@ -103,7 +105,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', radius_neighbors_graph """ if not isinstance(X, KNeighborsMixin): - X = NearestNeighbors(n_neighbors, metric=metric, p=p, + X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X) else: _check_params(X, metric, p, metric_params) @@ -112,9 +114,10 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) -def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, - n_jobs=None): +@_deprecate_positional_args +def radius_neighbors_graph(X, radius, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of Neighbors for points in X Neighborhoods are restricted the points at a distance lower than @@ -281,7 +284,8 @@ class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin, ... KNeighborsTransformer(n_neighbors=5, mode='distance'), ... Isomap(neighbors_algorithm='precomputed')) """ - def __init__(self, mode='distance', n_neighbors=5, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(KNeighborsTransformer, self).__init__( @@ -422,7 +426,8 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), ... DBSCAN(min_samples=30, metric='precomputed')) """ - def __init__(self, mode='distance', radius=1., algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', radius=1., algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(RadiusNeighborsTransformer, self).__init__( diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 91a97e2810baa..684e07947cddd 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator from ..utils import check_array, check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -89,7 +90,8 @@ class KernelDensity(BaseEstimator): >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ - def __init__(self, bandwidth=1.0, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None): self.algorithm = algorithm diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index f3b141bf499e5..dfdb89237f516 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -11,6 +11,7 @@ from ..base import OutlierMixin from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils import check_array __all__ = ["LocalOutlierFactor"] @@ -163,7 +164,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). LOF: identifying density-based local outliers. In ACM sigmod record. 
""" - def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, + @_deprecate_positional_args + def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="auto", novelty=False, n_jobs=None): super().__init__( diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 1017f5cf12606..9705c9050f6c7 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -23,6 +23,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -161,7 +162,8 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): """ - def __init__(self, n_components=None, init='auto', warm_start=False, + @_deprecate_positional_args + def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, verbose=0, random_state=None): self.n_components = n_components diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index bf00d8b8f88d2..62f74940100e7 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets @@ -85,7 +86,8 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): """ - def __init__(self, metric='euclidean', shrink_threshold=None): + @_deprecate_positional_args + def __init__(self, metric='euclidean', *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index cce218062a3d5..845aacbfd4248 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, @@ -139,7 +140,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, weights='uniform', + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -307,7 +309,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 923a465b1d31b..7e120d7587b66 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -3,6 +3,7 @@ from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin from ._base 
import UnsupervisedMixin +from ..utils.validation import _deprecate_positional_args class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, @@ -78,7 +79,7 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, >>> from sklearn.neighbors import NearestNeighbors >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] - >>> neigh = NearestNeighbors(2, 0.4) + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) >>> neigh.fit(samples) NearestNeighbors(...) @@ -105,7 +106,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, radius=1.0, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None): super().__init__( diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index e17e8e575f728..cff7ffafe5acd 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -74,7 +74,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): for kernel in ['gaussian', 'tophat']: # draw a tophat sample - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) samp = kde.sample(100) assert X.shape == samp.shape @@ -91,7 +91,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) assert_raises(NotImplementedError, kde.sample, 100) # non-regression test: used to return a scalar diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 88e32669777a1..d62b998052656 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1245,9 +1245,9 @@ def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) X = np.random.RandomState(42).rand(20, 2) - nbrs1 = neighbors.NearestNeighbors(3, algorithm='auto', + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=custom_metric) - nbrs2 = neighbors.NearestNeighbors(3, algorithm='brute', + nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric=custom_metric) nbrs1.fit(X) @@ -1339,7 +1339,7 @@ def test_non_euclidean_kneighbors(): nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric, mode='connectivity', include_self=True).toarray() - nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X) + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph @@ -1351,7 +1351,7 @@ def test_non_euclidean_kneighbors(): assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, - X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan') + X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index ccc6b889f41f6..efa9eb2255ce3 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -131,7 +131,7 @@ def 
_get_kernel(self, X, y=None): return rbf_kernel(X, y, gamma=self.gamma) elif self.kernel == "knn": if self.nn_fit is None: - self.nn_fit = NearestNeighbors(self.n_neighbors, + self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs).fit(X) if y is None: return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X, From 02309ffbdaae45af75b0e87946aab9aedb6b3634 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 24 Apr 2020 23:24:24 +0200 Subject: [PATCH 059/125] API kwonly args in manifold, metrics, mixture, model_selection, multclass, multioutput (#16982) --- sklearn/cluster/_optics.py | 2 +- sklearn/feature_selection/_rfe.py | 2 +- sklearn/manifold/_isomap.py | 5 +- sklearn/manifold/_locally_linear.py | 15 +++-- sklearn/manifold/_mds.py | 11 ++-- sklearn/manifold/_spectral_embedding.py | 8 ++- sklearn/manifold/_t_sne.py | 7 ++- sklearn/manifold/tests/test_isomap.py | 2 +- sklearn/manifold/tests/test_locally_linear.py | 2 +- sklearn/metrics/_classification.py | 60 ++++++++++++------- sklearn/metrics/_plot/confusion_matrix.py | 9 ++- .../metrics/_plot/precision_recall_curve.py | 10 ++-- sklearn/metrics/_plot/roc_curve.py | 10 ++-- .../_plot/tests/test_plot_confusion_matrix.py | 8 ++- .../_plot/tests/test_plot_roc_curve.py | 2 +- sklearn/metrics/_ranking.py | 30 +++++++--- sklearn/metrics/_regression.py | 31 ++++++---- sklearn/metrics/_scorer.py | 7 ++- sklearn/metrics/cluster/_bicluster.py | 4 +- sklearn/metrics/cluster/_supervised.py | 22 ++++--- sklearn/metrics/cluster/_unsupervised.py | 7 ++- sklearn/metrics/pairwise.py | 33 ++++++---- sklearn/metrics/tests/test_classification.py | 7 ++- sklearn/metrics/tests/test_pairwise.py | 2 +- sklearn/metrics/tests/test_score_objects.py | 16 ++--- sklearn/mixture/_bayesian_mixture.py | 5 +- sklearn/mixture/_gaussian_mixture.py | 5 +- sklearn/model_selection/_search.py | 23 ++++--- sklearn/model_selection/_split.py | 55 ++++++++++------- sklearn/model_selection/_validation.py | 21 ++++--- sklearn/model_selection/tests/test_split.py | 4 +- .../model_selection/tests/test_validation.py | 19 +++--- sklearn/multiclass.py | 12 ++-- sklearn/multioutput.py | 21 ++++--- 34 files changed, 302 insertions(+), 175 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index c8ca3ec569a88..7f54a318d3d49 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -529,7 +529,7 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, # in the dict params _params['p'] = p dists = pairwise_distances(P, np.take(X, unproc, axis=0), - metric, n_jobs=None, + metric=metric, n_jobs=None, **_params).ravel() rdists = np.maximum(dists, core_distances_[point_index]) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 6d9bb8c463df6..7421bc50b7625 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -506,7 +506,7 @@ def fit(self, X, y, groups=None): ) # Initialization - cv = check_cv(self.cv, y, is_classifier(self.estimator)) + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) scorer = check_scoring(self.estimator, scoring=self.scoring) n_features = X.shape[1] diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index f26db5cc2028d..3229522d21c6e 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -8,6 +8,7 @@ from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted +from 
..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -122,8 +123,8 @@ class Isomap(TransformerMixin, BaseEstimator): .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric framework for nonlinear dimensionality reduction. Science 290 (5500) """ - - def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None): diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index c2d1ffbae9361..a1db0c43daccb 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -14,6 +14,7 @@ from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -183,10 +184,11 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) +@_deprecate_positional_args def locally_linear_embedding( - X, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6, - max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, - random_state=None, n_jobs=None): + X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', + tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4, + modified_tol=1E-12, random_state=None, n_jobs=None): """Perform a Locally Linear Embedding analysis on the data. Read more in the :ref:`User Guide `. @@ -628,8 +630,8 @@ class LocallyLinearEmbedding(TransformerMixin, dimensionality reduction via tangent space alignment. Journal of Shanghai Univ. 
8:406 (2004) """ - - def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3, eigen_solver='auto', tol=1E-6, max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, neighbors_algorithm='auto', random_state=None, n_jobs=None): @@ -656,7 +658,8 @@ def _fit_transform(self, X): self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( - self.nbrs_, self.n_neighbors, self.n_components, + X=self.nbrs_, n_neighbors=self.n_neighbors, + n_components=self.n_components, eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, method=self.method, hessian_tol=self.hessian_tol, modified_tol=self.modified_tol, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..0314007264689 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -14,6 +14,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from ..utils.validation import _deprecate_positional_args def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -129,9 +130,10 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, return X, stress, it + 1 -def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, - n_jobs=None, max_iter=300, verbose=0, eps=1e-3, random_state=None, - return_n_iter=False): +@_deprecate_positional_args +def smacof(dissimilarities, *, metric=True, n_components=2, init=None, + n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3, + random_state=None, return_n_iter=False): """Computes multidimensional scaling using the SMACOF algorithm. The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a @@ -357,7 +359,8 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - def __init__(self, n_components=2, metric=True, n_init=4, + @_deprecate_positional_args + def __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean"): self.n_components = n_components diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index caac2236e1dd6..0c8bb4902c99a 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -20,6 +20,7 @@ from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph, NearestNeighbors +from ..utils.validation import _deprecate_positional_args def _graph_connected_component(graph, node_id): @@ -132,7 +133,8 @@ def _set_diag(laplacian, value, norm_laplacian): return laplacian -def spectral_embedding(adjacency, n_components=8, eigen_solver=None, +@_deprecate_positional_args +def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True): """Project the sample on the first eigenvectors of the graph Laplacian. 
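The manifold hunks above make most constructor and function parameters keyword-only via the
``_deprecate_positional_args`` decorator. A minimal usage sketch, with illustrative random data
that is not taken from the patch, of how the decorated signatures are meant to be called::

    import numpy as np
    from sklearn.manifold import Isomap, locally_linear_embedding

    X = np.random.RandomState(0).rand(20, 3)

    # Parameters after the bare ``*`` are keyword-only; positional use goes
    # through the deprecation decorator rather than failing outright.
    embedding = Isomap(n_neighbors=5, n_components=2).fit_transform(X)

    # The functional form keeps only ``X`` positional.
    Y, reconstruction_error = locally_linear_embedding(
        X, n_neighbors=5, n_components=2)
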
@@ -440,8 +442,8 @@ class SpectralEmbedding(BaseEstimator): Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 """ - - def __init__(self, n_components=2, affinity="nearest_neighbors", + @_deprecate_positional_args + def __init__(self, n_components=2, *, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None): self.n_components = n_components diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 136a32cd86e73..d94bf777399f5 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -19,6 +19,7 @@ from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative +from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils @@ -396,7 +397,8 @@ def _gradient_descent(objective, p0, it, n_iter, return p, error, i -def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): +@_deprecate_positional_args +def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): r"""Expresses to what extent the local structure is retained. The trustworthiness is within [0, 1]. It is defined as @@ -632,7 +634,8 @@ class TSNE(BaseEstimator): # Control the number of iterations between progress checks _N_ITER_CHECK = 50 - def __init__(self, n_components=2, perplexity=30.0, + @_deprecate_positional_args + def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 18133719bf85a..9007772674a99 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -91,7 +91,7 @@ def test_transform(): X, y = datasets.make_s_curve(n_samples, random_state=0) # Compute isomap embedding - iso = manifold.Isomap(n_components, 2) + iso = manifold.Isomap(n_components=n_components, n_neighbors=2) X_iso = iso.fit_transform(X) # Re-embed a noisy version of the points diff --git a/sklearn/manifold/tests/test_locally_linear.py b/sklearn/manifold/tests/test_locally_linear.py index 0968c5052a1b7..952da3ef41163 100644 --- a/sklearn/manifold/tests/test_locally_linear.py +++ b/sklearn/manifold/tests/test_locally_linear.py @@ -131,7 +131,7 @@ def test_singular_matrix(): M = np.ones((10, 3)) f = ignore_warnings with pytest.raises(ValueError): - f(manifold.locally_linear_embedding(M, 2, 1, + f(manifold.locally_linear_embedding(M, n_neighbors=2, n_components=1, method='standard', eigen_solver='arpack')) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a916bbe1dd955..90e1935e62f06 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -37,6 +37,7 @@ from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -121,7 +122,8 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): return sample_score.sum() -def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def 
accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: @@ -193,7 +195,8 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, +@_deprecate_positional_args +def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None): """Compute confusion matrix to evaluate the accuracy of a classification. @@ -330,7 +333,8 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, return cm -def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False): """Compute a confusion matrix for each class or sample @@ -533,7 +537,9 @@ def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) -def cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None): +@_deprecate_positional_args +def cohen_kappa_score(y1, y2, *, labels=None, weights=None, + sample_weight=None): r"""Cohen's kappa: a statistic that measures inter-annotator agreement. This function computes Cohen's kappa [1]_, a score that expresses the level @@ -613,7 +619,8 @@ class labels [2]_. return 1 - k -def jaccard_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None): """Jaccard similarity coefficient score @@ -752,7 +759,8 @@ def jaccard_score(y_true, y_pred, labels=None, pos_label=1, return np.average(jaccard, weights=weights) -def matthews_corrcoef(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): """Compute the Matthews correlation coefficient (MCC) The Matthews correlation coefficient is used in machine learning as a @@ -839,7 +847,8 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None): return mcc -def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): """Zero-one classification loss. If normalize is ``True``, return the fraction of misclassifications @@ -909,7 +918,8 @@ def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): return n_samples - score -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F1 score, also known as balanced F-score or F-measure @@ -1027,13 +1037,14 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', and ``UndefinedMetricWarning`` will be raised. This behavior can be modified with ``zero_division``. 
""" - return fbeta_score(y_true, y_pred, 1, labels=labels, + return fbeta_score(y_true, y_pred, beta=1, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight, zero_division=zero_division) -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, +@_deprecate_positional_args +def fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F-beta score @@ -1256,7 +1267,8 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): return labels -def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, +@_deprecate_positional_args +def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), @@ -1488,7 +1500,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return precision, recall, f_score, true_sum -def precision_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the precision @@ -1607,7 +1620,8 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, return p -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the recall @@ -1724,7 +1738,8 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', return r -def balanced_accuracy_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): """Compute the balanced accuracy @@ -1801,7 +1816,8 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None, return score -def classification_report(y_true, y_pred, labels=None, target_names=None, +@_deprecate_positional_args +def classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division="warn"): """Build a text report showing the main classification metrics. @@ -1999,7 +2015,8 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def hamming_loss(y_true, y_pred, *, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. @@ -2090,7 +2107,8 @@ def hamming_loss(y_true, y_pred, sample_weight=None): raise ValueError("{0} is not supported".format(y_type)) -def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, +@_deprecate_positional_args +def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None): """Log loss, aka logistic loss or cross-entropy loss. 
@@ -2215,7 +2233,8 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, return _weighted_sum(loss, sample_weight, normalize) -def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): +@_deprecate_positional_args +def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): """Average hinge loss (non-regularized) In binary class case, assuming labels in y_true are encoded with +1 and -1, @@ -2292,7 +2311,7 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) + >>> hinge_loss(y_true, pred_decision, labels=labels) 0.56... """ check_consistent_length(y_true, pred_decision, sample_weight) @@ -2336,7 +2355,8 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): return np.average(losses, weights=sample_weight) -def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): +@_deprecate_positional_args +def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score. The smaller the Brier score, the better, hence the naming with "loss". diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 861a2558a3cef..c858ac3950f86 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -4,6 +4,7 @@ from .. import confusion_matrix from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -40,11 +41,12 @@ class ConfusionMatrixDisplay: figure_ : matplotlib Figure Figure containing the confusion matrix. """ - def __init__(self, confusion_matrix, display_labels=None): + def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - def plot(self, include_values=True, cmap='viridis', + @_deprecate_positional_args + def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None): """Plot visualization. @@ -130,7 +132,8 @@ def plot(self, include_values=True, cmap='viridis', return self -def plot_confusion_matrix(estimator, X, y_true, labels=None, +@_deprecate_positional_args +def plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index 10dd14e938984..bb2a91c198c41 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -4,6 +4,7 @@ from .. import precision_recall_curve from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -40,15 +41,15 @@ class PrecisionRecallDisplay: figure_ : matplotlib Figure Figure containing the curve. """ - - def __init__(self, precision, recall, + def __init__(self, precision, recall, *, average_precision=None, estimator_name=None): self.precision = precision self.recall = recall self.average_precision = average_precision self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization. 
Extra keyword arguments will be passed to matplotlib's `plot`. @@ -101,7 +102,8 @@ def plot(self, ax=None, name=None, **kwargs): return self -def plot_precision_recall_curve(estimator, X, y, +@_deprecate_positional_args +def plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method="auto", name=None, ax=None, **kwargs): """Plot Precision Recall Curve for binary classifiers. diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index 0881646e8a1af..21af0aa388b07 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -4,6 +4,7 @@ from .base import _check_classifer_response_method from ...utils import check_matplotlib_support from ...base import is_classifier +from ...utils.validation import _deprecate_positional_args class RocCurveDisplay: @@ -53,14 +54,14 @@ class RocCurveDisplay: >>> display.plot() # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ - - def __init__(self, fpr, tpr, roc_auc=None, estimator_name=None): + def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None): self.fpr = fpr self.tpr = tpr self.roc_auc = roc_auc self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization Extra keyword arguments will be passed to matplotlib's ``plot``. @@ -110,7 +111,8 @@ def plot(self, ax=None, name=None, **kwargs): return self -def plot_roc_curve(estimator, X, y, sample_weight=None, +@_deprecate_positional_args +def plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method="auto", name=None, ax=None, **kwargs): """Plot Receiver operating characteristic (ROC) curve. diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index 6a0b880ebabb1..e65c12904b757 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -267,15 +267,17 @@ def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, def test_confusion_matrix_standard_format(pyplot): cm = np.array([[10000000, 0], [123456, 12345678]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should be shown as whole numbers 'd', # except the first number which should be shown as 1e+07 (longer length) - # and the last number will be showns as 1.2e+07 (longer length) + # and the last number will be shown as 1.2e+07 (longer length) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['1e+07', '0', '123456', '1.2e+07'] cm = np.array([[0.1, 10], [100, 0.525]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should now formatted as '.2g', since there's a float in # Values are have two dec places max, (e.g 100 becomes 1e+02) test = [t.get_text() for t in plotted_text.ravel()] diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 1aa34bdca7279..50e69ad41af8d 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py +++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -165,6 +165,6 @@ def test_default_labels(pyplot, roc_auc, estimator_name, expected_label): fpr = np.array([0, 0.5, 1]) tpr = 
np.array([0, 0.5, 1]) - disp = RocCurveDisplay(fpr, tpr, roc_auc=roc_auc, + disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=estimator_name).plot() assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index a9e45310f330d..2ac226bfe0299 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -31,6 +31,7 @@ from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..preprocessing._label import _encode @@ -101,7 +102,8 @@ def auc(x, y): return area -def average_precision_score(y_true, y_score, average="macro", pos_label=1, +@_deprecate_positional_args +def average_precision_score(y_true, y_score, *, average="macro", pos_label=1, sample_weight=None): """Compute average precision (AP) from prediction scores @@ -243,7 +245,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, +@_deprecate_positional_args +def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -594,7 +597,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): return fps, tps, y_score[threshold_idxs] -def precision_recall_curve(y_true, probas_pred, pos_label=None, +@_deprecate_positional_args +def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds @@ -683,7 +687,8 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None, return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] -def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, +@_deprecate_positional_args +def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True): """Compute Receiver operating characteristic (ROC) @@ -813,7 +818,9 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, return fpr, tpr, thresholds -def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_average_precision_score(y_true, y_score, *, + sample_weight=None): """Compute ranking-based average precision Label ranking average precision (LRAP) is the average over each ground @@ -899,7 +906,8 @@ def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): return out -def coverage_error(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def coverage_error(y_true, y_score, *, sample_weight=None): """Coverage error measure Compute how far we need to go through the ranked scores to cover all @@ -958,7 +966,8 @@ def coverage_error(y_true, y_score, sample_weight=None): return np.average(coverage, weights=sample_weight) -def label_ranking_loss(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_loss(y_true, y_score, *, sample_weight=None): """Compute Ranking loss measure Compute the average number of label pairs that are incorrectly ordered @@ -1163,7 +1172,8 @@ def 
_check_dcg_target_type(y_true): supported_fmt, y_type)) -def dcg_score(y_true, y_score, k=None, +@_deprecate_positional_args +def dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False): """Compute Discounted Cumulative Gain. @@ -1320,7 +1330,9 @@ def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): return gain -def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): +@_deprecate_positional_args +def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, + ignore_ties=False): """Compute Normalized Discounted Cumulative Gain. Sum the true scores ranked in the order induced by the predicted scores, diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6026a5293806a..afbb469072cf5 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -29,6 +29,7 @@ from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning @@ -117,7 +118,8 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): return y_type, y_true, y_pred, multioutput -def mean_absolute_error(y_true, y_pred, +@_deprecate_positional_args +def mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean absolute error regression loss @@ -188,7 +190,8 @@ def mean_absolute_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -def mean_squared_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True): """Mean squared error regression loss @@ -263,7 +266,8 @@ def mean_squared_error(y_true, y_pred, return mse if squared else np.sqrt(mse) -def mean_squared_log_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean squared logarithmic error regression loss @@ -326,10 +330,12 @@ def mean_squared_log_error(y_true, y_pred, "targets contain negative values.") return mean_squared_error(np.log1p(y_true), np.log1p(y_pred), - sample_weight, multioutput) + sample_weight=sample_weight, + multioutput=multioutput) -def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): +@_deprecate_positional_args +def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average'): """Median absolute error regression loss Median absolute error output is non-negative floating point. The best value @@ -392,7 +398,8 @@ def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): return np.average(output_errors, weights=multioutput) -def explained_variance_score(y_true, y_pred, +@_deprecate_positional_args +def explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Explained variance regression score function @@ -484,7 +491,8 @@ def explained_variance_score(y_true, y_pred, return np.average(output_scores, weights=avg_weights) -def r2_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"): """R^2 (coefficient of determination) regression score function. 
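The ranking and regression metrics receive the same treatment. A minimal sketch (toy values,
not from the patch) of the keyword-only regression signatures shown above::

    from sklearn.metrics import mean_squared_error, r2_score

    y_true = [3.0, -0.5, 2.0, 7.0]
    y_pred = [2.5, 0.0, 2.0, 8.0]

    # ``sample_weight``, ``multioutput`` and ``squared`` follow the bare ``*``;
    # ``squared=False`` returns the root mean squared error.
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred, multioutput='uniform_average')
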
@@ -655,7 +663,8 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): +@_deprecate_positional_args +def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -719,7 +728,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): return np.average(dev, weights=sample_weight) -def mean_poisson_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with @@ -756,7 +766,8 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): ) -def mean_gamma_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 3df175c2ca306..400e92c158ca8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -43,6 +43,7 @@ from .cluster import fowlkes_mallows_score from ..utils.multiclass import type_of_target +from ..utils.validation import _deprecate_positional_args from ..base import is_regressor @@ -371,7 +372,8 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scoring(estimator, scoring=None, allow_none=False): +@_deprecate_positional_args +def check_scoring(estimator, scoring=None, *, allow_none=False): """Determine scorer from user options. A TypeError will be thrown if the estimator cannot be scored. @@ -528,7 +530,8 @@ def _check_multimetric_scoring(estimator, scoring=None): return scorers, True -def make_scorer(score_func, greater_is_better=True, needs_proba=False, +@_deprecate_positional_args +def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py index f8d8d18e9f6b0..ac0d0a454a74a 100644 --- a/sklearn/metrics/cluster/_bicluster.py +++ b/sklearn/metrics/cluster/_bicluster.py @@ -2,6 +2,7 @@ from scipy.optimize import linear_sum_assignment from ...utils.validation import check_consistent_length, check_array +from ...utils.validation import _deprecate_positional_args __all__ = ["consensus_score"] @@ -44,7 +45,8 @@ def _pairwise_similarity(a, b, similarity): return result -def consensus_score(a, b, similarity="jaccard"): +@_deprecate_positional_args +def consensus_score(a, b, *, similarity="jaccard"): """The similarity of two sets of biclusters. Similarity between individual biclusters is computed. 
Then the diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 973c45a908bf1..8a0fdcacb67f1 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ from ._expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array, check_consistent_length +from ...utils.validation import _deprecate_positional_args from ...utils.fixes import _astype_copy_false @@ -77,7 +78,8 @@ def _generalized_average(U, V, average_method): "'arithmetic', or 'max'") -def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): +@_deprecate_positional_args +def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -241,7 +243,8 @@ def adjusted_rand_score(labels_true, labels_pred): return (sum_comb - prod_comb) / (mean_comb - prod_comb) -def homogeneity_completeness_v_measure(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of @@ -463,7 +466,8 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] -def v_measure_score(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score` with @@ -563,7 +567,8 @@ def v_measure_score(labels_true, labels_pred, beta=1.0): beta=beta)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None): +@_deprecate_positional_args +def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of @@ -649,7 +654,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): return np.clip(mi.sum(), 0.0, None) -def adjusted_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Adjusted Mutual Information between two clusterings. @@ -770,7 +776,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, return ami -def normalized_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Normalized Mutual Information between two clusterings. @@ -870,7 +877,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, return nmi -def fowlkes_mallows_score(labels_true, labels_pred, sparse=False): +@_deprecate_positional_args +def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. 
The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index 8841df701c69f..9e2ef713b352e 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,6 +16,7 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder +from ...utils.validation import _deprecate_positional_args def check_number_of_labels(n_labels, n_samples): @@ -34,7 +35,8 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -def silhouette_score(X, labels, metric='euclidean', sample_size=None, +@_deprecate_positional_args +def silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -147,7 +149,8 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs): return intra_clust_dists, inter_clust_dists -def silhouette_samples(X, labels, metric='euclidean', **kwds): +@_deprecate_positional_args +def silhouette_samples(X, labels, *, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 25646acb49ea7..9d4107ebd66d6 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,7 @@ from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils._mask import _get_mask +from ..utils.validation import _deprecate_positional_args from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -58,7 +59,8 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, +@_deprecate_positional_args +def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False): """ Set X and Y appropriately and checks inputs @@ -192,7 +194,8 @@ def check_paired_arrays(X, Y): # Pairwise distances -def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, +@_deprecate_positional_args +def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the @@ -313,7 +316,8 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def nan_euclidean_distances(X, Y=None, squared=False, +@_deprecate_positional_args +def nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True): """Calculate the euclidean distances in the presence of missing values. @@ -503,7 +507,8 @@ def _argmin_min_reduce(dist, start): return indices, values -def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. 
@@ -589,7 +594,8 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", return indices, values -def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -659,7 +665,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, + return pairwise_distances_argmin_min(X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs)[0] @@ -711,7 +717,8 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric('haversine').pairwise(X, Y) -def manhattan_distances(X, Y=None, sum_over_features=True): +@_deprecate_positional_args +def manhattan_distances(X, Y=None, *, sum_over_features=True): """ Compute the L1 distances between the vectors in X and Y. With sum_over_features equal to False it returns the componentwise @@ -908,7 +915,8 @@ def paired_cosine_distances(X, Y): 'cityblock': paired_manhattan_distances} -def paired_distances(X, Y, metric="euclidean", **kwds): +@_deprecate_positional_args +def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Computes the paired distances between X and Y. @@ -1444,7 +1452,8 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): return {} -def pairwise_distances_chunked(X, Y=None, reduce_func=None, +@_deprecate_positional_args +def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds): """Generate a distance matrix chunk by chunk with optional reduction @@ -1606,7 +1615,8 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, yield D_chunk -def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, +@_deprecate_positional_args +def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1820,7 +1830,8 @@ def kernel_metrics(): } -def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, +@_deprecate_positional_args +def pairwise_kernels(X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds): """Compute the kernel between arrays X and optional array Y. 
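The change applied throughout these diffs follows one pattern: each public signature gains a bare `*` so the trailing arguments become keyword-only, and the function is wrapped with the private `_deprecate_positional_args` helper so that callers who still pass those arguments positionally get a deprecation warning rather than an immediate TypeError. The helper itself is not shown in the patch; the sketch below is only an illustration of how such a decorator can be written with the standard-library `inspect` and `warnings` modules, and its name, warning text and edge-case handling are assumptions, not scikit-learn's actual implementation.

import warnings
from functools import wraps
from inspect import Parameter, signature


def deprecate_positional_args(func):
    """Warn when arguments declared after ``*`` are passed positionally."""
    sig = signature(func)
    positional = [name for name, param in sig.parameters.items()
                  if param.kind in (Parameter.POSITIONAL_ONLY,
                                    Parameter.POSITIONAL_OR_KEYWORD)]
    keyword_only = [name for name, param in sig.parameters.items()
                    if param.kind == Parameter.KEYWORD_ONLY]

    @wraps(func)
    def wrapper(*args, **kwargs):
        n_extra = len(args) - len(positional)
        if n_extra > 0:
            # Values that spill past the positional parameters correspond,
            # in order, to the first keyword-only names: warn, then forward
            # them as keywords so the old call style keeps working.
            extra_names = keyword_only[:n_extra]
            warnings.warn(
                "Passing {} as positional arguments is deprecated; pass "
                "them as keyword arguments instead.".format(
                    ", ".join(extra_names)),
                FutureWarning)
            kwargs.update(zip(extra_names, args[len(positional):]))
            args = args[:len(positional)]
        return func(*args, **kwargs)

    return wrapper

With a decorator of this shape, a call like pairwise_distances_argmin(X, Y, 0) would still return the same result but emit a FutureWarning, while pairwise_distances_argmin(X, Y, axis=0) stays silent.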
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index ca56e79299adb..1f959d95ce844 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -638,7 +638,7 @@ def test_matthews_corrcoef_against_jurman(): for k in range(N) ]) mcc_jurman = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight) + mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) assert_almost_equal(mcc_ours, mcc_jurman, 10) @@ -725,7 +725,8 @@ def test_matthews_corrcoef_multiclass(): y_true = [0, 0, 1, 1, 2] y_pred = [1, 1, 0, 0, 2] sample_weight = [1, 1, 1, 1, 0] - assert_almost_equal(matthews_corrcoef(y_true, y_pred, sample_weight), -1) + assert_almost_equal(matthews_corrcoef(y_true, y_pred, + sample_weight=sample_weight), -1) # For the zero vector case, the corrcoef cannot be calculated and should # result in a RuntimeWarning @@ -734,7 +735,7 @@ def test_matthews_corrcoef_multiclass(): sample_weight = [1, 1, 0, 0] mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', matthews_corrcoef, y_true, y_pred, - sample_weight) + sample_weight=sample_weight) # But will output 0 assert_almost_equal(mcc, 0.) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index fdff2c4c3959e..f2c7a307571bc 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -350,7 +350,7 @@ def test_pairwise_kernels_filter_param(): assert_array_almost_equal(K, K2) with pytest.raises(TypeError): - pairwise_kernels(X, Y, "rbf", **params) + pairwise_kernels(X, Y, metric="rbf", **params) @pytest.mark.parametrize('metric, func', PAIRED_DISTANCES.items()) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 64e88f37ed2bc..189d36ae88328 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -190,11 +190,11 @@ def check_scoring_validator_for_single_metric_usecases(scoring_validator): with pytest.raises(TypeError, match=pattern): scoring_validator(estimator) - scorer = scoring_validator(estimator, "accuracy") + scorer = scoring_validator(estimator, scoring="accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() - scorer = scoring_validator(estimator, "accuracy") + scorer = scoring_validator(estimator, scoring="accuracy") assert isinstance(scorer, _PredictScorer) # Test the allow_none parameter for check_scoring alone @@ -274,11 +274,11 @@ def test_check_scoring_gridsearchcv(): # slightly redundant non-regression test. 
grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3) - scorer = check_scoring(grid, "f1") + scorer = check_scoring(grid, scoring="f1") assert isinstance(scorer, _PredictScorer) pipe = make_pipeline(LinearSVC()) - scorer = check_scoring(pipe, "f1") + scorer = check_scoring(pipe, scoring="f1") assert isinstance(scorer, _PredictScorer) # check that cross_val_score definitely calls the scorer @@ -544,13 +544,13 @@ def test_scorer_memmap_input(name): def test_scoring_is_not_metric(): with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), f1_score) + check_scoring(LogisticRegression(), scoring=f1_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), roc_auc_score) + check_scoring(LogisticRegression(), scoring=roc_auc_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(Ridge(), r2_score) + check_scoring(Ridge(), scoring=r2_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(KMeans(), cluster_module.adjusted_rand_score) + check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score) def test_deprecated_scorer(): diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index c68fa260faee3..648fb8d903d38 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,6 +15,7 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _log_dirichlet_norm(dirichlet_concentration): @@ -307,8 +308,8 @@ class BayesianGaussianMixture(BaseMixture): inference for Dirichlet process mixtures". Bayesian analysis 1.1 `_ """ - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=None, diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 277f65f929eac..596e66f6a4e64 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -11,6 +11,7 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array from ..utils.extmath import row_norms +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -585,8 +586,8 @@ class GaussianMixture(BaseMixture): BayesianGaussianMixture : Gaussian mixture model fit with a variational inference. 
""" - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, n_components=1, *, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 80cb269d4a0c7..920900db20fe7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -34,6 +34,7 @@ from ..utils import check_random_state from ..utils.random import sample_without_replacement from ..utils.validation import indexable, check_is_fitted, _check_fit_params +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -234,7 +235,8 @@ class ParameterSampler: ... {'b': 1.038159, 'a': 2}] True """ - def __init__(self, param_distributions, n_iter, random_state=None): + @_deprecate_positional_args + def __init__(self, param_distributions, n_iter, *, random_state=None): if not isinstance(param_distributions, (Mapping, Iterable)): raise TypeError('Parameter distribution is not a dict or ' 'a list ({!r})'.format(param_distributions)) @@ -400,9 +402,11 @@ class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, estimator, scoring=None, n_jobs=None, iid='deprecated', - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=True): + @_deprecate_positional_args + def __init__(self, estimator, *, scoring=None, n_jobs=None, + iid='deprecated', refit=True, cv=None, verbose=0, + pre_dispatch='2*n_jobs', error_score=np.nan, + return_train_score=True): self.scoring = scoring self.estimator = estimator @@ -620,7 +624,8 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") - def fit(self, X, y=None, groups=None, **fit_params): + @_deprecate_positional_args + def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. 
Parameters @@ -1160,7 +1165,8 @@ class GridSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_grid"] - def __init__(self, estimator, param_grid, scoring=None, + @_deprecate_positional_args + def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False): @@ -1492,8 +1498,9 @@ class RandomizedSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_distributions"] - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - n_jobs=None, iid='deprecated', refit=True, + @_deprecate_positional_args + def __init__(self, estimator, param_distributions, *, n_iter=10, + scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False): diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e728533c3b5cf..edcb9b375ae79 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -24,6 +24,7 @@ from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import type_of_target from ..base import _pprint @@ -50,7 +51,6 @@ class BaseCrossValidator(metaclass=ABCMeta): Implementations must define `_iter_test_masks` or `_iter_test_indices`. """ - def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -270,7 +270,8 @@ class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): """Base class for KFold, GroupKFold, and StratifiedKFold""" @abstractmethod - def __init__(self, n_splits, shuffle, random_state): + @_deprecate_positional_args + def __init__(self, n_splits, *, shuffle, random_state): if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' '%s of type %s was passed.' @@ -426,10 +427,11 @@ class KFold(_BaseKFold): RepeatedKFold: Repeats K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) @@ -633,9 +635,10 @@ class StratifiedKFold(_BaseKFold): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _make_test_folds(self, X, y=None): rng = check_random_state(self.random_state) @@ -787,7 +790,8 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)``, where ``n_samples`` is the number of samples. 
""" - def __init__(self, n_splits=5, max_train_size=None): + @_deprecate_positional_args + def __init__(self, n_splits=5, *, max_train_size=None): super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size @@ -1099,7 +1103,8 @@ class _RepeatedSplits(metaclass=ABCMeta): Constructor parameters for cv. Must not contain random_state and shuffle. """ - def __init__(self, cv, n_repeats=10, random_state=None, **cvargs): + @_deprecate_positional_args + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") @@ -1226,9 +1231,11 @@ class RepeatedKFold(_RepeatedSplits): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - KFold, n_repeats, random_state, n_splits=n_splits) + KFold, n_repeats=n_repeats, + random_state=random_state, n_splits=n_splits) class RepeatedStratifiedKFold(_RepeatedSplits): @@ -1280,15 +1287,17 @@ class RepeatedStratifiedKFold(_RepeatedSplits): -------- RepeatedKFold: Repeats K-Fold n times. """ - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - StratifiedKFold, n_repeats, random_state, n_splits=n_splits) + StratifiedKFold, n_repeats=n_repeats, random_state=random_state, + n_splits=n_splits) class BaseShuffleSplit(metaclass=ABCMeta): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): self.n_splits = n_splits self.test_size = test_size @@ -1421,7 +1430,8 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 4 1] TEST: [5 2] TRAIN: [3 5 1] TEST: [2 4] """ - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1510,8 +1520,8 @@ class GroupShuffleSplit(ShuffleSplit): TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' - - def __init__(self, n_splits=5, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1626,8 +1636,8 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [4 1 0] TEST: [2 3 5] TRAIN: [0 5 1] TEST: [3 4 2] """ - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1959,7 +1969,8 @@ def split(self, X=None, y=None, groups=None): yield train, test -def check_cv(cv=5, y=None, classifier=False): +@_deprecate_positional_args +def check_cv(cv=5, y=None, *, classifier=False): """Input checker utility for building a cross-validator Parameters diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef8890556ed1d..9618bf2fe2e09 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -25,6 +25,7 @@ 
_message_with_time) from ..utils.validation import _check_fit_params from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import _safe_split from ..metrics import check_scoring from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer @@ -37,7 +38,8 @@ 'permutation_test_score', 'learning_curve', 'validation_curve'] -def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, +@_deprecate_positional_args +def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=np.nan): @@ -266,8 +268,9 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, return ret -def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, - n_jobs=None, verbose=0, fit_params=None, +@_deprecate_positional_args +def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, + cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): """Evaluate a score by cross-validation @@ -618,7 +621,8 @@ def _score(estimator, X_test, y_test, scorer): return scores -def cross_val_predict(estimator, X, y=None, groups=None, cv=None, +@_deprecate_positional_args +def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): """Generate cross-validated estimates for each input data point @@ -948,7 +952,8 @@ def _check_is_permutation(indices, n_samples): return True -def permutation_test_score(estimator, X, y, groups=None, cv=None, +@_deprecate_positional_args +def permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): """Evaluate the significance of a cross-validated score with permutations @@ -1088,7 +1093,8 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) -def learning_curve(estimator, X, y, groups=None, +@_deprecate_positional_args +def learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False, @@ -1372,7 +1378,8 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, return np.array(ret).T -def validation_curve(estimator, X, y, param_name, param_range, groups=None, +@_deprecate_positional_args +def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch="all", verbose=0, error_score=np.nan): """Validation curve. 
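Because `groups`, `scoring`, `cv` and the validation-curve parameters are now keyword-only (positional use is only tolerated through the deprecation decorator), downstream call sites are updated to name those arguments explicitly, which is what the test changes that follow do. A small before/after illustration, with made-up data and parameter values rather than anything taken from the patches:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, validation_curve
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, random_state=0)
clf = SVC(kernel='linear', random_state=0)

# Deprecated once the signatures above take effect (scoring and cv are
# passed positionally):
#     cross_val_score(clf, X, y, None, 'accuracy', 5)

# Keyword style that stays warning-free:
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
train_scores, test_scores = validation_curve(
    clf, X, y, param_name='C', param_range=[0.1, 1.0, 10.0], cv=3)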
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 0205eb8901699..3b984745420f1 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -960,7 +960,7 @@ def test_repeated_kfold_determinstic_split(): def test_get_n_splits_for_repeated_kfold(): n_splits = 3 n_repeats = 4 - rkf = RepeatedKFold(n_splits, n_repeats) + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rkf.get_n_splits() @@ -968,7 +968,7 @@ def test_get_n_splits_for_repeated_kfold(): def test_get_n_splits_for_repeated_stratified_kfold(): n_splits = 3 n_repeats = 4 - rskf = RepeatedStratifiedKFold(n_splits, n_repeats) + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rskf.get_n_splits() diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 67b66b6a91431..579726043f099 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -370,8 +370,8 @@ def test_cross_validate(): for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): # It's okay to evaluate regression metrics on classification too - mse_scorer = check_scoring(est, 'neg_mean_squared_error') - r2_scorer = check_scoring(est, 'r2') + mse_scorer = check_scoring(est, scoring='neg_mean_squared_error') + r2_scorer = check_scoring(est, scoring='r2') train_mse_scores = [] test_mse_scores = [] train_r2_scores = [] @@ -1251,7 +1251,8 @@ def test_validation_curve_cv_splits_consistency(): X, y = make_classification(n_samples=100, random_state=0) scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples)) # The OneTimeSplitter is a non-re-entrant cv splitter. Unless, the @@ -1262,7 +1263,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits, shuffle=True)) # For scores2, compare the 1st and 2nd parameter's scores @@ -1272,7 +1274,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits)) # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. @@ -1679,9 +1682,9 @@ def test_warn_trace(msg): failing_clf, X, y, cv=3, error_score='unvalid-string') assert_raise_message(ValueError, error_message, validation_curve, - failing_clf, X, y, 'parameter', - [FailingClassifier.FAILING_PARAMETER], cv=3, - error_score='unvalid-string') + failing_clf, X, y, param_name='parameter', + param_range=[FailingClassifier.FAILING_PARAMETER], + cv=3, error_score='unvalid-string') assert failing_clf.score() == 0. 
# FailingClassifier coverage diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 96ec40743fe2c..f6c80a1f5f2ab 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -48,6 +48,7 @@ from .utils.validation import _num_samples from .utils.validation import check_is_fitted from .utils.validation import check_X_y, check_array +from .utils.validation import _deprecate_positional_args from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -201,7 +202,8 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, array([2, 0, 1]) """ - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -519,8 +521,8 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Indices of samples used when training the estimators. ``None`` when ``estimator`` does not have ``_pairwise`` attribute. """ - - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -760,8 +762,8 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) 2008. """ - - def __init__(self, estimator, code_size=1.5, random_state=None, + @_deprecate_positional_args + def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.estimator = estimator self.code_size = code_size diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8f94a0ae634da..b348dd0f78d09 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method from .utils.validation import (check_is_fitted, has_fit_parameter, - _check_fit_params) + _check_fit_params, _deprecate_positional_args) from .utils.multiclass import check_classification_targets from .utils import deprecated @@ -64,7 +64,8 @@ def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, class _MultiOutputEstimator(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta): @abstractmethod - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -245,9 +246,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): >>> clf.predict(X[[0]]) array([[176..., 35..., 57...]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) @if_delegate_has_method('estimator') def partial_fit(self, X, y, sample_weight=None): @@ -315,9 +316,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> clf.predict(X[-2:]) array([[1, 1, 0], [1, 1, 1]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) def fit(self, X, Y, sample_weight=None, **fit_params): """Fit the model to data matrix X and targets Y. 
@@ -414,7 +415,9 @@ def _more_tags(self): class _BaseChain(BaseEstimator, metaclass=ABCMeta): - def __init__(self, base_estimator, order=None, cv=None, random_state=None): + @_deprecate_positional_args + def __init__(self, base_estimator, *, order=None, cv=None, + random_state=None): self.base_estimator = base_estimator self.order = order self.cv = cv From be134a298f9c56af665d0575ebaae81eefef818b Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 25 Apr 2020 17:47:46 +0200 Subject: [PATCH 060/125] TST Replace Bostond dataset in test_iforest (#17031) --- sklearn/ensemble/tests/test_iforest.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 3593bc0422ff7..aeb384ab44503 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -21,7 +21,7 @@ from sklearn.ensemble import IsolationForest from sklearn.ensemble._iforest import _average_path_length from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import load_diabetes, load_iris from sklearn.utils import check_random_state from sklearn.metrics import roc_auc_score @@ -37,12 +37,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] def test_iforest(): @@ -63,8 +63,8 @@ def test_iforest(): def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) @@ -157,8 +157,8 @@ def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = IsolationForest(n_jobs=3, @@ -226,8 +226,8 @@ def test_max_samples_consistency(): def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) clf = IsolationForest(max_features=0.8) clf.fit(X_train, y_train) From 923d8879e22f340dd0cd4e7fd855e1642e139fa2 Mon Sep 17 00:00:00 2001 From: HaoYin Date: Sun, 26 Apr 2020 02:08:10 +0800 Subject: [PATCH 061/125] DOC Fix a typo in comment (#17037) --- sklearn/decomposition/_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 527f78d34bbb5..617bf8541d830 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -236,7 +236,7 @@ def _fit_transform(self, K): # if v is an eigenvector of K # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' # if u is an eigenvector of Phi(X)Phi(X)' - # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)'Phi(X) # # At this stage our self.alphas_ (the v) have norm 1, we need to scale # them so that eigenvectors in kernel feature space (the u) have norm=1 From fc6ee00b0accceeec48cc5b606e713514b481617 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sun, 26 Apr 2020 04:22:33 +0200 Subject: [PATCH 062/125] MNT Fix 'clf' variable naming in test_forest (#16929) --- sklearn/ensemble/tests/test_forest.py | 108 +++++++++++++------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 8144a095cec3a..775ed851d5a53 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -163,17 +163,17 @@ def check_boston_criterion(name, criterion): # Check consistency on dataset boston house prices. 
ForestRegressor = FOREST_REGRESSORS[name] - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.94, ("Failed with max_features=None, criterion %s " "and score = %f" % (criterion, score)) - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, max_features=6, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.95, ("Failed with max_features=6, criterion %s " "and score = %f" % (criterion, score)) @@ -682,10 +682,10 @@ def test_distribution(): y = rng.rand(1000) n_trees = 500 - clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) + reg = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -713,10 +713,10 @@ def test_distribution(): X[:, 1] = np.random.randint(0, 3, 1000) y = rng.rand(1000) - clf = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) + reg = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -1065,25 +1065,25 @@ def check_warm_start(name, random_state=42): # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf_ws = None + est_ws = None for n_estimators in [5, 10]: - if clf_ws is None: - clf_ws = ForestEstimator(n_estimators=n_estimators, + if est_ws is None: + est_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: - clf_ws.set_params(n_estimators=n_estimators) - clf_ws.fit(X, y) - assert len(clf_ws) == n_estimators + est_ws.set_params(n_estimators=n_estimators) + est_ws.fit(X, y) + assert len(est_ws) == n_estimators - clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, + est_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) - clf_no_ws.fit(X, y) + est_no_ws.fit(X, y) - assert (set([tree.random_state for tree in clf_ws]) == - set([tree.random_state for tree in clf_no_ws])) + assert (set([tree.random_state for tree in est_ws]) == + set([tree.random_state for tree in est_no_ws])) - assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), + assert_array_equal(est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name)) @@ -1096,17 +1096,17 @@ def check_warm_start_clear(name): # Test if fit clears state and grows a new forest when warm_start==False. 
X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, random_state=2) - clf_2.fit(X, y) # inits state - clf_2.set_params(warm_start=False, random_state=1) - clf_2.fit(X, y) # clears old state and equals clf + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False, random_state=1) + est_2.fit(X, y) # clears old state and equals est - assert_array_almost_equal(clf_2.apply(X), clf.apply(X)) + assert_array_almost_equal(est_2.apply(X), est.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1118,10 +1118,10 @@ def check_warm_start_smaller_n_estimators(name): # Test if warm start second fit with smaller n_estimators raises error. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) - clf.fit(X, y) - clf.set_params(n_estimators=4) - assert_raises(ValueError, clf.fit, X, y) + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=4) + assert_raises(ValueError, est.fit, X, y) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1134,20 +1134,20 @@ def check_warm_start_equal_n_estimators(name): # same forest and raises a warning. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf_2.fit(X, y) - # Now clf_2 equals clf. + est_2.fit(X, y) + # Now est_2 equals est. - clf_2.set_params(random_state=2) - assert_warns(UserWarning, clf_2.fit, X, y) + est_2.set_params(random_state=2) + assert_warns(UserWarning, est_2.fit, X, y) # If we had fit the trees again we would have got a different forest as we # changed the random state. - assert_array_equal(clf.apply(X), clf_2.apply(X)) + assert_array_equal(est.apply(X), est_2.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1160,31 +1160,31 @@ def check_warm_start_oob(name): X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. - clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, + est = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=True) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=False) - clf_2.fit(X, y) + est_2.fit(X, y) - clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15) - clf_2.fit(X, y) + est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) + est_2.fit(X, y) - assert hasattr(clf_2, 'oob_score_') - assert clf.oob_score_ == clf_2.oob_score_ + assert hasattr(est_2, 'oob_score_') + assert est.oob_score_ == est_2.oob_score_ # Test that oob_score is computed even if we don't need to train # additional trees. 
- clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, + est_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, random_state=1, bootstrap=True, oob_score=False) - clf_3.fit(X, y) - assert not hasattr(clf_3, 'oob_score_') + est_3.fit(X, y) + assert not hasattr(est_3, 'oob_score_') - clf_3.set_params(oob_score=True) - ignore_warnings(clf_3.fit)(X, y) + est_3.set_params(oob_score=True) + ignore_warnings(est_3.fit)(X, y) - assert clf.oob_score_ == clf_3.oob_score_ + assert est.oob_score_ == est_3.oob_score_ @pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) From e8a42aee5d5b3f2d25f3dddbc0c55c96da03e230 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Sun, 26 Apr 2020 14:48:36 +0300 Subject: [PATCH 063/125] CI Remove pin now that Pillow issue is fixed (#17043) --- build_tools/azure/install.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d1849a940d96c..f30db7f0ae08a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -98,9 +98,6 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed - python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1" - python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps From 28c08d06f95f2ed8843bd7b04a53ca8710ca7971 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Apr 2020 14:23:29 +0200 Subject: [PATCH 064/125] DOC fix typos in cross validation user guide (#17042) --- doc/modules/cross_validation.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e2de690658a25..ed014cea6f2ff 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -323,11 +323,11 @@ The following cross-validators can be used in such cases. While i.i.d. data is a common assumption in machine learning theory, it rarely holds in practice. If one knows that the samples have been generated using a -time-dependent process, it's safer to -use a :ref:`time-series aware cross-validation scheme ` -Similarly if we know that the generative process has a group structure -(samples from collected from different subjects, experiments, measurement -devices) it safer to use :ref:`group-wise cross-validation `. +time-dependent process, it is safer to +use a :ref:`time-series aware cross-validation scheme `. +Similarly, if we know that the generative process has a group structure +(samples collected from different subjects, experiments, measurement +devices), it is safer to use :ref:`group-wise cross-validation `. K-fold @@ -535,14 +535,14 @@ folds: each set contains approximately the same percentage of samples of each target class as the complete set. Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from -two unbalanced classes. We show the number of samples in each class and compare with +two unbalanced classes. We show the number of samples in each class and compare with :class:`KFold`. 
>>> from sklearn.model_selection import StratifiedKFold, KFold >>> import numpy as np >>> X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5)) - >>> skf = StratifiedKFold(n_splits=3) - >>> for train, test in skf.split(X, y): + >>> skf = StratifiedKFold(n_splits=3) + >>> for train, test in skf.split(X, y): ... print('train - {} | test - {}'.format( ... np.bincount(y[train]), np.bincount(y[test]))) train - [30 3] | test - [15 2] @@ -556,7 +556,7 @@ two unbalanced classes. We show the number of samples in each class and compare train - [28 5] | test - [17] train - [34] | test - [11 5] -We can see that :class:`StratifiedKFold` preserves the class ratios +We can see that :class:`StratifiedKFold` preserves the class ratios (approximately 1 / 10) in both train and test dataset. Here is a visualization of the cross-validation behavior. From 4755ae76d2df10bbf41bc93fb7083b0142ef1044 Mon Sep 17 00:00:00 2001 From: Ekaterina Borovikova Date: Sun, 26 Apr 2020 15:29:36 +0200 Subject: [PATCH 065/125] DOC Add versionchanged and versionadded for v0.20 (#16199) Co-Authored-By: Adrin Jalali Co-Authored-By: Chiara Marmo Co-Authored-By: Nicolas Hug --- sklearn/cluster/_agglomerative.py | 3 +++ sklearn/cluster/_kmeans.py | 4 ++++ sklearn/compose/_target.py | 2 ++ sklearn/covariance/_elliptic_envelope.py | 2 ++ sklearn/covariance/_graph_lasso.py | 12 ++++++++++++ sklearn/datasets/_base.py | 10 ++++++++++ sklearn/datasets/_openml.py | 2 ++ sklearn/datasets/_samples_generator.py | 3 +++ sklearn/dummy.py | 2 ++ sklearn/ensemble/_iforest.py | 3 +++ sklearn/ensemble/_voting.py | 1 - sklearn/feature_extraction/text.py | 2 ++ sklearn/feature_selection/_rfe.py | 4 ++++ sklearn/linear_model/_stochastic_gradient.py | 12 ++++++++++++ sklearn/manifold/_t_sne.py | 2 ++ sklearn/metrics/_classification.py | 4 ++++ sklearn/metrics/_ranking.py | 4 +++- sklearn/metrics/cluster/_unsupervised.py | 2 ++ sklearn/model_selection/_search.py | 10 ++++++++++ sklearn/model_selection/_validation.py | 15 ++++++++++++++- sklearn/multiclass.py | 3 +++ sklearn/multioutput.py | 8 ++++++++ sklearn/naive_bayes.py | 4 ++++ sklearn/neighbors/_kde.py | 2 ++ sklearn/neighbors/_lof.py | 4 ++++ sklearn/pipeline.py | 8 ++++++++ sklearn/preprocessing/_discretization.py | 3 +++ sklearn/preprocessing/_encoders.py | 4 +++- sklearn/svm/_classes.py | 2 ++ sklearn/utils/_show_versions.py | 5 ++++- 30 files changed, 137 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 182ae4b481116..92246141d6fe8 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -736,6 +736,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. + .. versionadded:: 0.20 + Added the 'single' option + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b185983c4b0f9..21a604bed3eb5 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -946,6 +946,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. If None, all observations are assigned equal weight. + .. versionadded:: 0.20 + Returns ------- self @@ -1587,6 +1589,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. 
If None, all observations are assigned equal weight (default: None). + .. versionadded:: 0.20 + Returns ------- self diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index d8c062ed423a2..1d6695a808d81 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -42,6 +42,8 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- regressor : object, default=None diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 801611943f350..354c0f8998968 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -67,6 +67,8 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + raw_location_ : ndarray of shape (n_features,) The raw robust estimated location before correction and re-weighting. diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 1d0d93db75101..35a398741bc15 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -84,6 +84,9 @@ def graphical_lasso(emp_cov, alpha, cov_init=None, mode='cd', tol=1e-4, Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + graph_lasso has been renamed to graphical_lasso + Parameters ---------- emp_cov : ndarray of shape (n_features, n_features) @@ -283,6 +286,9 @@ class GraphicalLasso(EmpiricalCovariance): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLasso has been renamed to GraphicalLasso + Parameters ---------- alpha : float, default=0.01 @@ -509,6 +515,9 @@ class GraphicalLassoCV(GraphicalLasso): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLassoCV has been renamed to GraphicalLassoCV + Parameters ---------- alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 @@ -563,6 +572,9 @@ class GraphicalLassoCV(GraphicalLasso): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If verbose is True, the objective function and duality gap are printed at each iteration. diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index d481288133991..9737a5f67891a 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -439,6 +439,8 @@ def load_iris(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -551,6 +553,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -665,6 +669,9 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): The names of the dataset columns. target_names: list The names of target classes. + + .. versionadded:: 0.20 + frame: DataFrame of shape (1797, 65) Only present when `as_frame=True`. DataFrame with `data` and `target`. @@ -885,6 +892,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename: str The path to the location of the target. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. 
versionadded:: 0.18 @@ -961,6 +970,7 @@ def load_boston(return_X_y=False): The physical location of boston csv dataset. .. versionadded:: 0.20 + DESCR : str The full description of the dataset. feature_names : ndarray diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index cef0e6cb1f411..26260a27ec883 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -526,6 +526,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 62ef492f42f5e..fe0ac680ecd79 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -745,6 +745,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, If array-like, each element of the sequence indicates the number of samples per cluster. + .. versionchanged:: v0.20 + one can now pass an array-like to the ``n_samples`` parameter + n_features : int, optional (default=2) The number of features for each sample. diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 37e9145f7536c..17b2c6cfd2e5d 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -561,6 +561,8 @@ def predict(self, X, return_std=False): Whether to return the standard deviation of posterior prediction. All zeros in this case. + .. versionadded:: 0.20 + Returns ------- y : array-like of shape (n_samples,) or (n_samples, n_outputs) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 9cec1c08efc9e..0c1bec9ebfb65 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -146,6 +146,8 @@ class IsolationForest(OutlierMixin, BaseBagging): is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + estimators_features_ : list of arrays The subset of drawn features for each base estimator. @@ -391,6 +393,7 @@ def score_samples(self, X): The lower, the more abnormal. """ # code structure from ForestClassifier/predict_proba + check_is_fitted(self) # Check data diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 8d2bbbe8c2b8a..b044cb68e5151 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -161,7 +161,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. - .. versionadded:: 0.20 classes_ : array-like of shape (n_predictions,) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 27c5eb437805b..661f638b000fc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1375,6 +1375,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. + .. versionadded:: 0.20 + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7421bc50b7625..aedcd94943bc4 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -374,6 +374,8 @@ class RFECV(RFE): feature count and ``min_features_to_select`` isn't divisible by ``step``. + .. 
versionadded:: 0.20 + cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -497,6 +499,8 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + .. versionadded:: 0.20 """ tags = self._get_tags() X, y = self._validate_data( diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 3bedd8a26674b..1606a7ff35adb 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -847,6 +847,9 @@ class SGDClassifier(BaseSGDClassifier): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by @@ -863,6 +866,7 @@ class SGDClassifier(BaseSGDClassifier): improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -870,11 +874,13 @@ class SGDClassifier(BaseSGDClassifier): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. @@ -1446,6 +1452,9 @@ class SGDRegressor(BaseSGDRegressor): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. @@ -1462,6 +1471,7 @@ class SGDRegressor(BaseSGDRegressor): epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -1469,11 +1479,13 @@ class SGDRegressor(BaseSGDRegressor): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d94bf777399f5..eef67d5460e22 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -439,6 +439,8 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): documentation of argument metric in sklearn.pairwise.pairwise_distances for a list of available metrics. + .. 
versionadded:: 0.20 + Returns ------- trustworthiness : float diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 90e1935e62f06..b8a1a8e5e22b4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1751,6 +1751,8 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- y_true : 1d array-like @@ -1849,6 +1851,8 @@ def classification_report(y_true, y_pred, *, labels=None, target_names=None, output_dict : bool (default = False) If True, return output as dict + .. versionadded:: 0.20 + zero_division : "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division. If set to "warn", this acts as 0, but warnings are also raised. diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 2ac226bfe0299..18d948214bbec 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -848,6 +848,8 @@ def label_ranking_average_precision_score(y_true, y_score, *, sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.20 + Returns ------- score : float @@ -1031,7 +1033,7 @@ def label_ranking_loss(y_true, y_score, *, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index 9e2ef713b352e..ce5563c4763d3 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -312,6 +312,8 @@ def davies_bouldin_score(X, labels): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- X : array-like, shape (``n_samples``, ``n_features``) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 920900db20fe7..e44511a9394b4 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -914,6 +914,9 @@ class GridSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=n_jobs Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1136,6 +1139,8 @@ class GridSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the left out @@ -1250,6 +1255,9 @@ class RandomizedSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=None Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1457,6 +1465,8 @@ class RandomizedSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. 
versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the held-out diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9618bf2fe2e09..dd204ad4a57d0 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -144,12 +144,16 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return_estimator : bool, default=False Whether to return the estimators fitted on each split. + .. versionadded:: 0.20 + error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : dict of float arrays of shape (n_splits,) @@ -359,6 +363,8 @@ def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : array of float, shape=(len(list(cv)),) @@ -495,7 +501,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) + for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights @@ -813,6 +819,9 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, X : array-like of shape (n_samples, n_features) The data to fit. + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. @@ -1197,6 +1206,8 @@ def learning_curve(estimator, X, y, *, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + return_times : bool, default=False Whether to return the fit and score times. @@ -1461,6 +1472,8 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- train_scores : array of shape (n_ticks, n_cv_folds) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index f6c80a1f5f2ab..1f0bfaf6517b4 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -165,6 +165,9 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of `n_classes` estimators diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index b348dd0f78d09..815c1cbd67757 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -230,6 +230,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): using `n_jobs>1` can result in slower performance due to the overhead of spawning processes. + .. 
versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of ``n_output`` estimators @@ -296,6 +299,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- classes_ : array, shape = (n_classes,) @@ -697,6 +703,8 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- base_estimator : estimator diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 247d9eea763c6..e631bb3dcd599 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -140,6 +140,8 @@ class GaussianNB(_BaseNB): Portion of the largest variance of all features that is added to variances for calculation stability. + .. versionadded:: 0.20 + Attributes ---------- class_count_ : ndarray of shape (n_classes,) @@ -785,6 +787,8 @@ class ComplementNB(_BaseDiscreteNB): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- alpha : float, default=1.0 diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 684e07947cddd..1a967e301b357 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -148,6 +148,8 @@ def fit(self, X, y=None, sample_weight=None): sample_weight : array_like, shape (n_samples,), optional List of sample weights attached to the data X. + .. versionadded:: 0.20 + Returns ------- self : object diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index dfdb89237f516..2d456ff3e620f 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -117,6 +117,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, that you should only use predict, decision_function and score_samples on new unseen data and not on the training set. + .. versionadded:: 0.20 + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -148,6 +150,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, case, the offset is defined in such a way we obtain the expected number of outliers in training. + .. versionadded:: 0.20 + Examples -------- >>> import numpy as np diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 477354107e133..8e2a539786557 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -395,6 +395,8 @@ def predict(self, X, **predict_params): transformations in the pipeline are not propagated to the final estimator. + .. versionadded:: 0.20 + Returns ------- y_pred : array-like @@ -773,6 +775,9 @@ class FeatureUnion(TransformerMixin, _BaseComposition): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + transformer_weights : dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. @@ -1018,6 +1023,9 @@ def make_union(*transformers, **kwargs): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. 
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 581765a81361e..fa7d574e65ccd 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -25,6 +25,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- n_bins : int or array-like, shape (n_features,) (default=5) @@ -114,6 +116,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [-0.5, 2.5, -2.5, -0.5], [ 0.5, 3.5, -1.5, 0.5], [ 0.5, 3.5, -1.5, 1.5]]) + """ @_deprecate_positional_args diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c8f8ba6781400..3b0e43c151e0c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -186,6 +186,8 @@ class OneHotEncoder(_BaseEncoder): The used categories can be found in the ``categories_`` attribute. + .. versionadded:: 0.20 + drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ default=None Specifies a methodology to use to drop one of the categories per @@ -603,7 +605,7 @@ class OrdinalEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. - .. versionchanged:: 0.20.1 + .. versionadded:: 0.20 Parameters ---------- diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index f8b30e070711e..5ff6e74825e50 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1241,6 +1241,8 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): The offset is the opposite of `intercept_` and is provided for consistency with other outlier detection algorithms. + .. versionadded:: 0.20 + fit_status_ : int 0 if correctly fitted, 1 otherwise (will raise warning) diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index 53bcf2f35269d..e9c7c687c5aaa 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -73,7 +73,10 @@ def get_version(module): def show_versions(): - """Print useful debugging information""" + """Print useful debugging information" + + .. versionadded:: 0.20 + """ sys_info = _get_sys_info() deps_info = _get_deps_info() From 81c1e9d1f265cfb32e155817ae14ef41521eea22 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 26 Apr 2020 12:05:16 -0400 Subject: [PATCH 066/125] API kwonly for utils (#17007) --- sklearn/utils/__init__.py | 20 +++-- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 ++--- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 ++++++++++++----------- 6 files changed, 94 insertions(+), 74 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index aac6e292a198a..afde7614070fd 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,7 +29,8 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar) + check_symmetric, check_scalar, + _deprecate_positional_args) from .. import get_config @@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis) + return _safe_indexing(X, indices, axis=axis) -def _safe_indexing(X, indices, axis=0): +def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. 
warning:: @@ -684,7 +685,8 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -def safe_sqr(X, copy=True): +@_deprecate_positional_args +def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters @@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize): return -def gen_batches(n, batch_size, min_batch_size=0): +@_deprecate_positional_args +def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size @@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0): yield slice(start, n) -def gen_even_slices(n, n_packs, n_samples=None): +@_deprecate_positional_args +def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -def get_chunk_n_rows(row_bytes, max_n_rows=None, - working_memory=None): +@_deprecate_positional_args +def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 5f785cb36df45..8c64e33e1d0d4 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,6 +4,8 @@ import numpy as np +from .validation import _deprecate_positional_args + def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y): return weight -def compute_sample_weight(class_weight, y, indices=None): +@_deprecate_positional_args +def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3301ac977b4b9..8e471d5fdf577 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -83,7 +85,8 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 487da5b431be0..067b12cc32f28 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. 
# Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(4)) + sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. / 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -237,15 +239,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, range(4)) + compute_sample_weight("ni", y, indices=range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, range(4)) + compute_sample_weight("ni", y_, indices=range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, range(4)) + compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b178ccc148d9d..418f037936c64 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, False) is not X + assert as_float_array(X, copy=False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if 
valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type, min_val, max_val) + check_scalar(x, "test_name", target_type=target_type, + min_val=min_val, max_val=max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..1082ad7337dee 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,6 +36,44 @@ warnings.simplefilter('ignore', NonBLASDotWarning) +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -def assert_all_finite(X, allow_nan=False): +@_deprecate_positional_args +def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -def as_float_array(X, copy=True, force_all_finite=True): +@_deprecate_positional_args +def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. 
The new dtype will be np.float32 or np.float64, depending on the original @@ -113,9 +153,9 @@ def as_float_array(X, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, - copy=copy, force_all_finite=force_all_finite, - ensure_2d=False) + return check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, copy=copy, + force_all_finite=force_all_finite, ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -349,7 +389,8 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -def check_array(array, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_array(array, *, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -620,7 +661,8 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -732,8 +774,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, - dtype=None) + y = check_array(y, accept_sparse='csr', force_all_finite=True, + ensure_2d=False, dtype=None) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) @@ -745,7 +787,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, return X, y -def column_or_1d(y, warn=False): +@_deprecate_positional_args +def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -825,7 +868,8 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -def check_symmetric(array, tol=1E-10, raise_warning=True, +@_deprecate_positional_args +def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -881,7 +925,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): +@_deprecate_positional_args +def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -974,7 +1019,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, min_val=None, max_val=None): +def check_scalar(x, name, target_type, *, min_val=None, max_val=None): """Validate scalar parameters type and value. 
Parameters @@ -1268,44 +1313,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. - - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From f621d8a39287dbe831dd9e6469934593598b6ed3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 18:30:09 +0200 Subject: [PATCH 067/125] Revert "API kwonly for utils (#17007)" (#17045) --- sklearn/utils/__init__.py | 20 ++--- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 +++-- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 +++++++++++------------ 6 files changed, 74 insertions(+), 94 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index afde7614070fd..aac6e292a198a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,8 +29,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar, - _deprecate_positional_args) + check_symmetric, check_scalar) from .. import get_config @@ -315,10 +314,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis=axis) + return _safe_indexing(X, indices, axis) -def _safe_indexing(X, indices, *, axis=0): +def _safe_indexing(X, indices, axis=0): """Return rows, items or columns of X using indices. .. warning:: @@ -685,8 +684,7 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -@_deprecate_positional_args -def safe_sqr(X, *, copy=True): +def safe_sqr(X, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters @@ -725,8 +723,7 @@ def _chunk_generator(gen, chunksize): return -@_deprecate_positional_args -def gen_batches(n, batch_size, *, min_batch_size=0): +def gen_batches(n, batch_size, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. 
The last slice may contain less than batch_size elements, when batch_size @@ -775,8 +772,7 @@ def gen_batches(n, batch_size, *, min_batch_size=0): yield slice(start, n) -@_deprecate_positional_args -def gen_even_slices(n, n_packs, *, n_samples=None): +def gen_even_slices(n, n_packs, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -961,8 +957,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -@_deprecate_positional_args -def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): +def get_chunk_n_rows(row_bytes, max_n_rows=None, + working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 8c64e33e1d0d4..5f785cb36df45 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,8 +4,6 @@ import numpy as np -from .validation import _deprecate_positional_args - def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -71,8 +69,7 @@ def compute_class_weight(class_weight, classes, y): return weight -@_deprecate_positional_args -def compute_sample_weight(class_weight, y, *, indices=None): +def compute_sample_weight(class_weight, y, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 8e471d5fdf577..3301ac977b4b9 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,9 +27,7 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange( - check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] - ) + return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) _FN_UNIQUE_LABELS = { @@ -85,8 +83,7 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, - accept_sparse=['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 067b12cc32f28..487da5b431be0 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,41 +192,39 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. # Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, indices=range(4)) + sample_weight = compute_sample_weight("balanced", y, range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. 
/ 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, - indices=[0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, - indices=[0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -239,15 +237,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, indices=range(4)) + compute_sample_weight("ni", y, range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, indices=range(4)) + compute_sample_weight("ni", y_, range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) + compute_sample_weight({1: 2, 2: 1}, y, range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 418f037936c64..b178ccc148d9d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, copy=False) is not X + assert as_float_array(X, False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,8 +912,7 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type=target_type, - min_val=min_val, max_val=max_val) + check_scalar(x, "test_name", target_type, min_val, max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1082ad7337dee..953584fff0f8a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,44 +36,6 @@ warnings.simplefilter('ignore', NonBLASDotWarning) -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. 
- - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -105,8 +67,7 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -@_deprecate_positional_args -def assert_all_finite(X, *, allow_nan=False): +def assert_all_finite(X, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -118,8 +79,7 @@ def assert_all_finite(X, *, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -@_deprecate_positional_args -def as_float_array(X, *, copy=True, force_all_finite=True): +def as_float_array(X, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. The new dtype will be np.float32 or np.float64, depending on the original @@ -153,9 +113,9 @@ def as_float_array(X, *, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, copy=copy, - force_all_finite=force_all_finite, ensure_2d=False) + return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, + copy=copy, force_all_finite=force_all_finite, + ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -389,8 +349,7 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -@_deprecate_positional_args -def check_array(array, *, accept_sparse=False, accept_large_sparse=True, +def check_array(array, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -661,8 +620,7 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -@_deprecate_positional_args -def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, +def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -774,8 +732,8 @@ def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, accept_sparse='csr', force_all_finite=True, - ensure_2d=False, dtype=None) + y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, + dtype=None) else: y 
= column_or_1d(y, warn=True) _assert_all_finite(y) @@ -787,8 +745,7 @@ def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, return X, y -@_deprecate_positional_args -def column_or_1d(y, *, warn=False): +def column_or_1d(y, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -868,8 +825,7 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -@_deprecate_positional_args -def check_symmetric(array, *, tol=1E-10, raise_warning=True, +def check_symmetric(array, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -925,8 +881,7 @@ def check_symmetric(array, *, tol=1E-10, raise_warning=True, return array -@_deprecate_positional_args -def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): +def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -1019,7 +974,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, *, min_val=None, max_val=None): +def check_scalar(x, name, target_type, min_val=None, max_val=None): """Validate scalar parameters type and value. Parameters @@ -1313,6 +1268,44 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From a14953aca9cb0c1b0e33b2c31ec5666919b95871 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 27 Apr 2020 02:52:47 +1000 Subject: [PATCH 068/125] ENH buffer openml stream rather than reading all at once (#16084) --- doc/whats_new/v0.23.rst | 4 + sklearn/datasets/_openml.py | 233 +++++++++++++++----------- sklearn/datasets/tests/test_openml.py | 21 ++- 3 files changed, 154 insertions(+), 104 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 844cdf0360f73..2bd44dcd54486 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -123,6 +123,10 @@ Changelog :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski `. 
+- |Efficiency| :func:`datasets.fetch_openml` has reduced memory usage because + it no longer stores the full dataset text stream in memory. :pr:`16084` by + `Joel Nothman`_. + - |Feature| :func:`datasets.fetch_california_housing` now supports heterogeneous data using pandas by setting `as_frame=True`. :pr:`15950` by :user:`Stephanie Andrews ` and diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 26260a27ec883..10f40dc8906bf 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -9,6 +9,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from functools import partial from urllib.request import urlopen, Request @@ -44,11 +45,11 @@ def _retry_with_clean_cache(openml_path, data_home): """ def decorator(f): @wraps(f) - def wrapper(): + def wrapper(*args, **kw): if data_home is None: - return f() + return f(*args, **kw) try: - return f() + return f(*args, **kw) except HTTPError: raise except Exception: @@ -56,7 +57,7 @@ def wrapper(): local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): os.unlink(local_path) - return f() + return f(*args, **kw) return wrapper return decorator @@ -217,7 +218,7 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -225,8 +226,8 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arff : dict + As obtained from liac-arff object. col_slice_x : list The column indices that are sliced from the original array to return @@ -241,6 +242,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): X : np.array or scipy.sparse.csr_matrix y : np.array """ + arff_data = arff['data'] if isinstance(arff_data, Generator): if shape[0] == -1: count = -1 @@ -300,7 +302,8 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): Returns ------- - dataframe : pandas DataFrame + result : tuple + tuple with the resulting dataframe """ pd = check_pandas_support('fetch_openml with as_frame=True') @@ -327,7 +330,7 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): if dtype == 'category': dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) - return df + return (df, ) def _get_data_info_by_name(name, version, data_home): @@ -448,27 +451,119 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): - # Accesses an ARFF file on the OpenML server. Documentation: - # https://www.openml.org/api_data_docs#!/data/get_download_id - # encode_nominal argument is to ensure unit testing, do not alter in - # production! 
- url = _DATA_FILE.format(file_id) +def _load_arff_response(url, data_home, return_type, encode_nominal, + parse_arff): + """Load arff data with url and parses arff response with parse_arff""" + response = _open_openml_url(url, data_home) - @_retry_with_clean_cache(url, data_home) - def _arff_load(): - with closing(_open_openml_url(url, data_home)) as response: - if sparse is True: - return_type = _arff.COO - else: - return_type = _arff.DENSE_GEN + with closing(response): + # Note that if the data is dense, no reading is done until the data + # generator is iterated. + arff = _arff.load((line.decode('utf-8') for line in response), + return_type=return_type, + encode_nominal=encode_nominal) + return parse_arff(arff) + + +def _download_data_to_bunch(url, sparse, data_home, *, + as_frame, features_list, data_columns, + target_columns, shape): + """Download OpenML ARFF and convert to Bunch of data + """ + # NB: this function is long in order to handle retry for any failure + # during the streaming parse of the ARFF. + + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} + + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_columns) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_columns] - arff_file = _arff.loads(response.read().decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) - return arff_file + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. '.format(feat['name'], nr_missing)) - return _arff_load() + # Access an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id + + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + frame = nominal_attributes = None + if as_frame: + columns = data_columns + target_columns + parse_arff = partial(_convert_arff_data_dataframe, columns=columns, + features_dict=features_dict) + + def postprocess(frame): + X = frame[data_columns] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] + else: + y = None + return X, y, frame, nominal_attributes + else: + def parse_arff(arff): + X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape) + # nominal attributes is a dict mapping from the attribute name to + # the possible values. 
Includes also the target column (which will + # be popped off below, before it will be packed in the Bunch + # object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_columns} + return X, y, nominal_attributes + + def postprocess(X, y, nominal_attributes): + is_classification = {col_name in nominal_attributes + for col_name in target_columns} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_columns) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is ' + 'not currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + return X, y, frame, nominal_attributes + + out = _retry_with_clean_cache(url, data_home)( + _load_arff_response)(url, data_home, + return_type=return_type, + encode_nominal=not as_frame, + parse_arff=parse_arff) + X, y, frame, nominal_attributes = postprocess(*out) + + return Bunch(data=X, target=y, frame=frame, + categories=nominal_attributes, + feature_names=data_columns, + target_names=target_columns) def _verify_target_data_type(features_dict, target_columns): @@ -705,25 +800,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) - # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} - - # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_columns) - - col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_columns] - - col_slice_x = [int(features_dict[col_name]['index']) - for col_name in data_columns] - for col_idx in col_slice_y: - feat = features_list[col_idx] - nr_missing = int(feat['number_of_missing_values']) - if nr_missing > 0: - raise ValueError('Target column {} has {} missing values. ' - 'Missing values are not supported for target ' - 'columns. 
'.format(feat['name'], nr_missing)) - # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes @@ -734,66 +810,21 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not as_frame) + url = _DATA_FILE.format(data_description['file_id']) + bunch = _download_data_to_bunch(url, return_sparse, data_home, + as_frame=as_frame, + features_list=features_list, shape=shape, + target_columns=target_columns, + data_columns=data_columns) + + if return_X_y: + return bunch.data, bunch.target description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - nominal_attributes = None - frame = None - if as_frame: - columns = data_columns + target_columns - frame = _convert_arff_data_dataframe(arff, columns, features_dict) - X = frame[data_columns] - if len(target_columns) >= 2: - y = frame[target_columns] - elif len(target_columns) == 1: - y = frame[target_columns[0]] - else: - y = None - else: - # nominal attributes is a dict mapping from the attribute name to the - # possible values. Includes also the target column (which will be - # popped off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_columns} - - X, y = _convert_arff_data(arff['data'], col_slice_x, - col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_columns} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([ - np.take( - np.asarray(nominal_attributes.pop(col_name), dtype='O'), - y[:, i:i + 1].astype(int, copy=False)) - for i, col_name in enumerate(target_columns) - ]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None - - if return_X_y: - return X, y - - bunch = Bunch( - data=X, target=y, frame=frame, feature_names=data_columns, - target_names=target_columns, + bunch.update( DESCR=description, details=data_description, - categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index f9969c75d5c8e..44fe392e42e74 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -12,8 +12,9 @@ from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets._openml import (_open_openml_url, + _arff, + _DATA_FILE, _get_data_description_by_id, - _download_data_arff, _get_local_path, _retry_with_clean_cache, _feature_to_dtype) @@ -56,8 +57,13 @@ def decode_column(data_bunch, col_idx): if sparse is True: raise ValueError('This test is not intended for sparse data, to keep ' 'code relatively simple') - data_arff = _download_data_arff(data_description['file_id'], - sparse, None, False) + url = _DATA_FILE.format(data_description['file_id']) + with _open_openml_url(url, data_home=None) as f: + data_arff = _arff.load((line.decode('utf-8') for line in f), + return_type=(_arff.COO if sparse + else _arff.DENSE_GEN), + 
encode_nominal=False) + data_downloaded = np.array(list(data_arff['data']), dtype='O') for i in range(len(data_bunch.feature_names)): @@ -176,6 +182,15 @@ def info(self): return {'Content-Encoding': 'gzip'} return {} + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) From a35b892522499bfe7a0e5fdfdfbd15752e63fbb0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 26 Apr 2020 14:37:07 -0400 Subject: [PATCH 069/125] Fix Mypy issue in _openml.py (#17047) --- doc/developers/contributing.rst | 4 +++- sklearn/datasets/_openml.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 33ab3fcecb887..c886119e908c1 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -431,7 +431,9 @@ You can check for common programming errors with the following tools: mypy --ignore-missing-import sklearn - must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, + must not produce new errors in your pull request. Using `# type: ignore` + annotation can be a workaround for a few cases that are not supported by + mypy, in particular, - when importing C or Cython modules - on properties with decorators diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 10f40dc8906bf..4040641a17574 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -506,7 +506,7 @@ def _download_data_to_bunch(url, sparse, data_home, *, parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): + def postprocess(frame): # type:ignore X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -527,7 +527,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, nominal_attributes): + def postprocess(X, y, nominal_attributes): # type:ignore is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: From 7ede0285db17bc64719749255027032651388edf Mon Sep 17 00:00:00 2001 From: Alex Liang Date: Sun, 26 Apr 2020 16:04:23 -0400 Subject: [PATCH 070/125] DOC add detail about flip_y parameter in make_classification (#17049) --- sklearn/datasets/_samples_generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index fe0ac680ecd79..7642b6fb7dc59 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -102,7 +102,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification - task harder. + task harder. Note that the default setting flip_y > 0 might lead + to less than n_classes in y in some cases. class_sep : float, optional (default=1.0) The factor multiplying the hypercube size. 
Larger values spread From f624f4e48781927cb3c8d140b1f033fcdb809d67 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 15:21:31 +0200 Subject: [PATCH 071/125] DOC cleaning up to 0.23/whats new (#17015) --- doc/whats_new.rst | 1 + doc/whats_new/v0.23.rst | 225 +++++++++++++++++++++-------------- doc/whats_new/v0.24.rst | 55 +++++++++ maint_tools/whats_missing.sh | 2 +- 4 files changed, 194 insertions(+), 89 deletions(-) create mode 100644 doc/whats_new/v0.24.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7b84374bd5146..66f2a3818cec8 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,6 +12,7 @@ on libraries.io to be notified when new versions are released. .. toctree:: :maxdepth: 1 + Version 0.24 Version 0.23 Version 0.22 Version 0.21 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 2bd44dcd54486..4357845885e3f 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -22,14 +22,44 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, - and :class:`ensemble.IsolationForest`. |Fix| - -- Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, +- |Fix| :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, + and :class:`ensemble.IsolationForest`. +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` and + ``algorithm="full"``. +- |Fix| :class:`cluster.Birch` +- |Fix| :func:`compose.ColumnTransformer.get_feature_names` +- |Fix| :func:`compose.ColumnTransformer.fit` +- |Fix| :func:`datasets.make_multilabel_classification` +- |Fix| :class:`decomposition.PCA` with `n_components='mle'` +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` with float32 dtype input. +- |Fix| :func:`decomposition.KernelPCA.inverse_transform` +- |API| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegrerssor` +- |Fix| ``estimator_samples_`` in :class:`ensemble.BaggingClassifier`, + :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` +- |Fix| :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` with `sample_weight` +- |Fix| :class:`gaussian_process.GaussianProcessRegressor` +- |Fix| :class:`linear_model.RANSACRegressor` with ``sample_weight``. +- |Fix| :class:`linear_model.RidgeClassifierCV` +- |Fix| :func:`metrics.mean_squared_error` with `squared` and + `multioutput='raw_values'`. +- |Fix| :func:`metrics.mutual_info_score` with negative scores. +- |Fix| :func:`metrics.confusion_matrix` with zero length `y_true` and `y_pred` +- |Fix| :class:`neural_network.MLPClassifier` +- |Fix| :class:`preprocessing.StandardScaler` with `partial_fit` and sparse + input. +- |Fix| :class:`preprocessing.Normalizer` with norm='max' +- |Fix| Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`. 
- |Efficiency| |Fix| +- |Fix| :class:`tree.DecisionTreeClassifier`, :class:`tree.ExtraTreeClassifier` and + :class:`ensemble.GradientBoostingClassifier` as well as ``predict`` method of + :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeRegressor`, and + :class:`ensemble.GradientBoostingRegressor` and read-only float32 input in + ``predict``, ``decision_path`` and ``predict_proba``. Details are listed in the changelog below. @@ -53,19 +83,29 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more - more memory efficient implementation of single linkage clustering. - :pr:`11514` by :user:`Leland McInnes `. -- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with - ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by - :user:`Erich Schubert `. - - |Efficiency| :class:`cluster.Birch` implementation of the predict method avoids high memory footprint by calculating the distances matrix using a chunked scheme. :pr:`16149` by :user:`Jeremie du Boisberranger ` and :user:`Alex Shacked `. +- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more + optimized implementation. Parallelism is now over the data instead of over + initializations allowing better scalability. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.KMeans` now supports sparse data when + `solver = "elkan"`. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more + memory efficient implementation of single linkage clustering. + :pr:`11514` by :user:`Leland McInnes `. + +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with + ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by + :user:`Erich Schubert `. + - |Fix| Fixed a bug in :class:`cluster.Birch` where the `n_clusters` parameter could not have a `np.int64` type. :pr:`16484` by :user:`Jeremie du Boisberranger `. @@ -81,47 +121,28 @@ Changelog deprecated. It has no effect. :pr:`11950` by :user:`Jeremie du Boisberranger `. -- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more - optimized implementation. Parallelism is now over the data instead of over - initializations allowing better scalability. :pr:`11950` by - :user:`Jeremie du Boisberranger `. - -- |Enhancement| :class:`cluster.KMeans` now supports sparse data when - `solver = "elkan"`. :pr:`11950` by - :user:`Jeremie du Boisberranger `. - :mod:`sklearn.compose` ...................... -- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now - returns correct results when one of the transformer steps applies on an - empty list of columns :pr:`15963` by `Roman Yurchak`_. - - |Efficiency| :class:`compose.ColumnTransformer` is now faster when working with dataframes and strings are used to specific subsets of data for transformers. :pr:`16431` by `Thomas Fan`_. -- |Fix| :func:`compose.ColumnTransformer.fit` will error when selecting - a column name that is not unique in the dataframe. :pr:`16431` by - `Thomas Fan`_. - - |Enhancement| :class:`compose.ColumnTransformer` method ``get_feature_names`` now supports `'passthrough'` columns, with the feature name being either the column name for a dataframe, or `'xi'` for column index `i`. :pr:`14048` by :user:`Lewis Ball `. -:mod:`sklearn.datasets` -....................... 
+- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now + returns correct results when one of the transformer steps applies on an + empty list of columns :pr:`15963` by `Roman Yurchak`_. -- |Enhancement| Added ``return_centers`` parameter in - :func:`datasets.make_blobs`, which can be used to return - centers for each cluster. - :pr:`15709` by :user:`` and - :user:`Venkatachalam N `. +- |Fix| :func:`compose.ColumnTransformer.fit` will error when selecting + a column name that is not unique in the dataframe. :pr:`16431` by + `Thomas Fan`_. -- |Enhancement| Functions :func:`datasets.make_circles` and - :func:`datasets.make_moons` now accept two-element tuple. - :pr:`15707` by :user:`Maciej J Mikulski `. +:mod:`sklearn.datasets` +....................... - |Efficiency| :func:`datasets.fetch_openml` has reduced memory usage because it no longer stores the full dataset text stream in memory. :pr:`16084` by @@ -138,6 +159,16 @@ Changelog ``DataFrame`` by setting `as_frame=True`. :pr:`15980` by :user:`wconnell` and :user:`Reshama Shaikh `. +- |Enhancement| Added ``return_centers`` parameter in + :func:`datasets.make_blobs`, which can be used to return + centers for each cluster. + :pr:`15709` by :user:`` and + :user:`Venkatachalam N `. + +- |Enhancement| Functions :func:`datasets.make_circles` and + :func:`datasets.make_moons` now accept two-element tuple. + :pr:`15707` by :user:`Maciej J Mikulski `. + - |Fix| :func:`datasets.make_multilabel_classification` now generates `ValueError` for arguments `n_classes < 1` OR `length < 1`. :pr:`16006` by :user:`Rushabh Vasani `. @@ -145,20 +176,23 @@ Changelog :mod:`sklearn.decomposition` ............................ +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` now preserves float32 dtype. + :pr:`16280` by :user:`Jeremie du Boisberranger `. + +- |Enhancement| :func:`TruncatedSVD.transform` is now faster on given sparse + ``csc`` matrices. :pr:`16837` by :user:`wornbb`. + - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` - |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly handles small eigenvalues, and does not infer 0 as the correct number of - components. :pr: `4441` by :user:`Lisa Schwetlick `, and + components. :pr:`16224` by :user:`Lisa Schwetlick `, and :user:`Gelavizh Ahmadi ` and :user:`Marija Vlajic Wheeler ` and :pr:`16841` by `Nicolas Hug`_. -- |Enhancement| :class:`decomposition.NMF` and - :func:`decomposition.non_negative_factorization` now preserves float32 dtype. - :pr:`16280` by :user:`Jeremie du Boisberranger `. - - |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now applies the correct inverse transform to the transformed data. :pr:`16655` by :user:`Lewis Ball `. @@ -174,9 +208,22 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor` now support :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_. +- |Feature| Early stopping in + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` is now determined with a + new `early_stopping` parameter instead of `n_iter_no_change`. Default value + is 'auto', which enables early stopping if there are at least 10,000 + samples in the training set. :pr:`14516` by :user:`Johann Faouzi + `. 
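As a rough illustration of the entry above (an aside, not part of the diff itself), the
new ``early_stopping`` parameter can also be forced on or off instead of relying on the
``'auto'`` heuristic. The sketch below assumes the 0.23 API, where the estimator is
still behind the experimental import::

    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # Always use early stopping: hold out 10% of the training data and stop
    # once the validation score has not improved for 10 iterations.
    clf = HistGradientBoostingClassifier(early_stopping=True,
                                         validation_fraction=0.1,
                                         n_iter_no_change=10)

With ``early_stopping=False`` the model trains for the full ``max_iter`` iterations
regardless of the training set size.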
+ +- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support monotonic + constraints, useful when features are supposed to have a positive/negative + effect on the target. :pr:`15582` by `Nicolas Hug`_. + - |API| Added boolean `verbose` flag to classes: :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. - :pr:`15991` by :user:`Sam Bail `, + :pr:`16069` by :user:`Sam Bail `, :user:`Hanna Bruce MacDonald `, :user:`Reshama Shaikh `, and :user:`Chiara Marmo `. @@ -191,20 +238,7 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor`. The depth now corresponds to the number of edges to go from the root to the deepest leaf. Stumps (trees with one split) are now allowed. - :pr: `16182` by :user:`Santhosh B ` - -- |Feature| Early stopping in - :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` is now determined with a - new `early_stopping` parameter instead of `n_iter_no_change`. Default value - is 'auto', which enables early stopping if there are at least 10,000 - samples in the training set. :pr:`14516` by :user:`Johann Faouzi - `. - -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now support monotonic - constraints, useful when features are supposed to have a positive/negative - effect on the target. :pr:`15582` by `Nicolas Hug`_. + :pr:`16182` by :user:`Santhosh B ` - |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` @@ -278,11 +312,6 @@ Changelog :class:`linear_model:Lasso` for dense feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen `. -- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit - method of :class:`linear_model.RANSACRegressor`, it would not be passed to - the wrapped `base_estimator` during the fitting of the final model. - :pr:`15573` by :user:`Jeremy Alexandre `. - - |Efficiency| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` now does not allocate a potentially large array to store dual coefficients for all hyperparameters @@ -290,6 +319,16 @@ Changelog `store_cv_values` is `True`. :pr:`15652` by :user:`Jérôme Dockès `. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. + +- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit + method of :class:`linear_model.RANSACRegressor`, it would not be passed to + the wrapped `base_estimator` during the fitting of the final model. + :pr:`15773` by :user:`Jeremy Alexandre `. + - |Fix| add `best_score_` attribute to :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`. :pr:`15653` by :user:`Jérôme Dockès `. @@ -299,6 +338,11 @@ Changelog instead of predictions. :pr:`14848` by :user:`Venkatachalam N `. +- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary + iteration when `solver='newton-cg'` by checking for inferior or equal instead + of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. + :pr:`16266` by :user:`Rushabh Vasani `. 
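The monotonic constraints feature noted above lends itself to a short sketch as well
(again an aside, not part of the diff); it assumes the 0.23 interface in which
``monotonic_cst`` takes one value per feature, with ``1`` for a positive effect,
``-1`` for a negative effect and ``0`` for no constraint::

    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Constrain the first feature to push predictions up, the second to push
    # them down, and leave the third feature unconstrained.
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0])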
+ - |API| Deprecated public attributes `standard_coef_`, `standard_intercept_`, `average_coef_`, and `average_intercept_` in :class:`linear_model.SGDClassifier`, @@ -307,31 +351,15 @@ Changelog :class:`linear_model.PassiveAggressiveRegressor`. :pr:`16261` by :user:`Carlos Brandt `. -- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary - iteration when `solver='newton-cg'` by checking for inferior or equal instead - of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. - :pr:`16266` by :user:`Rushabh Vasani `. - - |Fix| |Efficiency| :class:`linear_model.ARDRegression` is more stable and much faster when `n_samples > n_features`. It can now scale to hundreds of thousands of samples. The stability fix might imply changes in the number of non-zero coefficients and in the predicted output. :pr:`16849` by `Nicolas Hug`_. -- |Enhancement| :class:`linear_model.LassoLars` and - :class:`linear_model.Lars` now support a `jitter` parameter that adds - random noise to the target. This might help with stability in some edge - cases. :pr:`15179` by :user:`angelaambroz`. - :mod:`sklearn.metrics` ...................... -- |API| Changed the formatting of values in - :meth:`metrics.ConfusionMatrixDisplay.plot` and - :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' - or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and - `Thomas Fan`_. - - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows its ``reduce_func`` to not have a return value, enabling in-place operations. :pr:`16397` by `Joel Nothman`_. @@ -349,6 +377,12 @@ Changelog the `labels` parameter. :pr:`16442` by `Kyle Parsons `. +- |API| Changed the formatting of values in + :meth:`metrics.ConfusionMatrixDisplay.plot` and + :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + `Thomas Fan`_. + :mod:`sklearn.model_selection` .............................. @@ -398,14 +432,14 @@ Changelog :mod:`sklearn.preprocessing` ............................ -- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at - transforming. :pr:`15762` by `Thomas Fan`_. - - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. +- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at + transforming. :pr:`15762` by `Thomas Fan`_. + - |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. @@ -438,16 +472,16 @@ Changelog number of samples (LibSVM) or the number of features (LibLinear) is large. :pr:`13511` by :user:`Sylvain Marié `. -- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and - `probB_`, are now deprecated as they were not useful. :pr:`15558` by - `Thomas Fan`_. - - |Fix| Fix use of custom kernel not taking float entries such as string kernels in :class:`svm.SVC` and :class:`svm.SVR`. Note that custom kennels are now expected to validate their input where they previously received valid numeric arrays. :pr:`11296` by `Alexandre Gramfort`_ and :user:`Georgi Peev `. +- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and + `probB_`, are now deprecated as they were not useful. :pr:`15558` by + `Thomas Fan`_. 
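The new ``drop='if_binary'`` option of :class:`preprocessing.OneHotEncoder` listed
above can be pictured with a small sketch (not part of the diff; it assumes the 0.23
behaviour of dropping the first category only for features with exactly two
categories)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a', 'x'], ['b', 'y'], ['a', 'z']], dtype=object)
    enc = OneHotEncoder(drop='if_binary', sparse=False).fit(X)
    # The first column is binary and is encoded as a single 0/1 column;
    # the second column keeps one indicator column per category.
    print(enc.transform(X).shape)  # (3, 4)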
+ :mod:`sklearn.tree` ................... @@ -487,14 +521,29 @@ Changelog Miscellaneous ............. +- |Enhancement| ``scikit-learn`` now works with ``mypy`` without errors. + :pr:`16726` by `Roman Yurchak`_. + - |API| Most estimators now expose a `n_features_in_` attribute. This attribute is equal to the number of features passed to the `fit` method. See `SLEP010 `_ - for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_. + for details. :pr:`16112` by `Nicolas Hug`_. - |API| Estimators now have a `requires_y` tags which is False by default except for estimators that inherit from `~sklearn.base.RegressorMixin` or `~sklearn.base.ClassifierMixin`. This tag is used to ensure that a proper error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. + +- |API| Most constructor and function parameters are now expected to be passed + as a keyword and not positional. :issue:`15005` by `Joel Nothman`_, + `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. See `SLEP009 + `_ + for more details. + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst new file mode 100644 index 0000000000000..dd4ab30a7f2ff --- /dev/null +++ b/doc/whats_new/v0.24.rst @@ -0,0 +1,55 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_24: + +Version 0.24.0 +============== + +**In Development** + + +.. include:: changelog_legend.inc + +Put the changes in their relevant module. + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- items +- items + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +.. + Entries should be grouped by module (in alphabetic order) and prefixed with + one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, + |Fix| or |API| (see whats_new.rst for descriptions). + Entries should be ordered by those labels (e.g. |Fix| after |Efficiency|). + Changes not specific to a module should be listed under *Multiple Modules* + or *Miscellaneous*. + Entries should end with: + :pr:`123456` by :user:`Joe Bloggs `. + where 123456 is the *pull request* number, not the issue number. + +:mod:`sklearn.module` +..................... 
+ + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: diff --git a/maint_tools/whats_missing.sh b/maint_tools/whats_missing.sh index 54ce06f8bbcf5..5b2d6b8fd8a01 100755 --- a/maint_tools/whats_missing.sh +++ b/maint_tools/whats_missing.sh @@ -19,7 +19,7 @@ logged_prs() { mentioned_issues() { cat doc/whats_new/v$to_file.rst | - grep -o 'issue:`[0-9]\+`' | + grep -o 'issue:`[0-9]\+`\|pr:`[0-9]\+`' | grep -o '[0-9]\+' } From 76ef8b0ef07f9c03b97d29a51e1543be7720e85a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 09:42:01 -0400 Subject: [PATCH 072/125] API kwonly for utils (#17046) * kwonly for utils * More * fixed some * some more * iwannagohomepls * accept_sparse not kwonly anymore --- sklearn/ensemble/_forest.py | 6 +- .../_univariate_selection.py | 5 +- sklearn/feature_selection/tests/test_base.py | 2 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_stochastic_gradient.py | 4 +- sklearn/manifold/_spectral_embedding.py | 3 +- sklearn/neighbors/_nca.py | 8 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/preprocessing/_data.py | 2 +- sklearn/utils/__init__.py | 20 ++-- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 ++-- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 ++++++++++-------- 15 files changed, 118 insertions(+), 93 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 40a1c2434316c..98d606961c1e1 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -159,9 +159,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, if class_weight == 'subsample': with catch_warnings(): simplefilter('ignore', DeprecationWarning) - curr_sample_weight *= compute_sample_weight('auto', y, indices) + curr_sample_weight *= compute_sample_weight('auto', y, + indices=indices) elif class_weight == 'balanced_subsample': - curr_sample_weight *= compute_sample_weight('balanced', y, indices) + curr_sample_weight *= compute_sample_weight('balanced', y, + indices=indices) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 21a2ddc10a1eb..7e873b3a2b65c 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -146,7 +146,7 @@ def f_classif(X, y): chi2: Chi-squared stats of non-negative features for classification tasks. f_regression: F-value between label/feature for regression tasks. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo']) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo']) args = [X[safe_mask(X, y == k)] for k in np.unique(y)] return f_oneway(*args) @@ -277,7 +277,8 @@ def f_regression(X, y, center=True): SelectPercentile: Select features based on percentile of the highest scores. 
""" - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64) n_samples = X.shape[0] # compute centered values diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index d1aaccde0efa3..9515bdc32c600 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -15,7 +15,7 @@ def __init__(self, step=2): self.step = step def fit(self, X, y=None): - X = check_array(X, 'csc') + X = check_array(X, accept_sparse='csc') self.n_input_feats = X.shape[1] return self diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index f4430c5bcac55..cd57b9612b362 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -131,7 +131,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, if Xy is None: X_sparse = sparse.isspmatrix(X) sparse_center = X_sparse and (fit_intercept or normalize) - X = check_array(X, 'csc', + X = check_array(X, accept_sparse='csc', copy=(copy_X and fit_intercept and not X_sparse)) if not X_sparse: # X can be touched inplace thanks to the above line @@ -435,10 +435,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, # We expect X and y to be already Fortran ordered when bypassing # checks if check_input: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], + X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) - y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False, - ensure_2d=False) + y = check_array(y, accept_sparse='csc', dtype=X.dtype.type, + order='F', copy=False, ensure_2d=False) if Xy is not None: # Xy should be a 1d contiguous array or a 2D C ordered array Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False, @@ -1095,7 +1095,8 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, # Do the ordering and type casting here, as if it is done in the path, # X is copied and a reference is kept here - X_train = check_array(X_train, 'csc', dtype=dtype, order=X_order) + X_train = check_array(X_train, accept_sparse='csc', dtype=dtype, + order=X_order) alphas, coefs, _ = path(X_train, y_train, **path_params) del X_train, y_train diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 1606a7ff35adb..94428f61f1327 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -487,8 +487,8 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init): - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 0c8bb4902c99a..a42c97bb5d6b4 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -301,7 +301,8 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, # matrix to the solver and afterward set it back to the original. 
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) laplacian += diag_shift - ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) + ml = smoothed_aggregation_solver(check_array(laplacian, + accept_sparse='csr')) laplacian -= diag_shift M = ml.aspreconditioner() diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 9705c9050f6c7..8920b2d99ed02 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -308,7 +308,7 @@ def _validate_params(self, X, y): # Check the preferred dimensionality of the projected space if self.n_components is not None: check_scalar( - self.n_components, 'n_components', numbers.Integral, 1) + self.n_components, 'n_components', numbers.Integral, min_val=1) if self.n_components > X.shape[1]: raise ValueError('The preferred dimensionality of the ' @@ -327,9 +327,9 @@ def _validate_params(self, X, y): .format(X.shape[1], self.components_.shape[1])) - check_scalar(self.max_iter, 'max_iter', numbers.Integral, 1) - check_scalar(self.tol, 'tol', numbers.Real, 0.) - check_scalar(self.verbose, 'verbose', numbers.Integral, 0) + check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1) + check_scalar(self.tol, 'tol', numbers.Real, min_val=0.) + check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0) if self.callback is not None: if not callable(self.callback): diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 67e1d68a3607e..fcb4e90772598 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -357,7 +357,7 @@ def fit(self, X, y=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) batch_slices = list(gen_even_slices(n_batches * self.batch_size, - n_batches, n_samples)) + n_batches, n_samples=n_samples)) verbose = self.verbose begin = time.time() for iteration in range(1, self.n_iter + 1): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index f9af3dbac6d0d..cc8776951f114 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1707,7 +1707,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): else: raise ValueError("'%d' is not a supported axis" % axis) - X = check_array(X, sparse_format, copy=copy, + X = check_array(X, accept_sparse=sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES) if axis == 0: X = X.T diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index aac6e292a198a..afde7614070fd 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,7 +29,8 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar) + check_symmetric, check_scalar, + _deprecate_positional_args) from .. import get_config @@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis) + return _safe_indexing(X, indices, axis=axis) -def _safe_indexing(X, indices, axis=0): +def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. warning:: @@ -684,7 +685,8 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -def safe_sqr(X, copy=True): +@_deprecate_positional_args +def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. 
Parameters @@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize): return -def gen_batches(n, batch_size, min_batch_size=0): +@_deprecate_positional_args +def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size @@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0): yield slice(start, n) -def gen_even_slices(n, n_packs, n_samples=None): +@_deprecate_positional_args +def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -def get_chunk_n_rows(row_bytes, max_n_rows=None, - working_memory=None): +@_deprecate_positional_args +def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 5f785cb36df45..8c64e33e1d0d4 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,6 +4,8 @@ import numpy as np +from .validation import _deprecate_positional_args + def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y): return weight -def compute_sample_weight(class_weight, y, indices=None): +@_deprecate_positional_args +def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3301ac977b4b9..8e471d5fdf577 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -83,7 +85,8 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 487da5b431be0..067b12cc32f28 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. 
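    # Illustrative aside, not part of the patch: with `indices` now
    # keyword-only,
    #     compute_sample_weight("balanced", y, range(6))
    # emits a FutureWarning ("Pass indices=... as keyword args. From version
    # 0.25 passing these as positional arguments will result in an error"),
    # whereas
    #     compute_sample_weight("balanced", y, indices=range(6))
    # keeps working silently, which is what the updated assertions below use.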
# Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(4)) + sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. / 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -237,15 +239,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, range(4)) + compute_sample_weight("ni", y, indices=range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, range(4)) + compute_sample_weight("ni", y_, indices=range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, range(4)) + compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b178ccc148d9d..418f037936c64 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, False) is not X + assert as_float_array(X, copy=False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if 
valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type, min_val, max_val) + check_scalar(x, "test_name", target_type=target_type, + min_val=min_val, max_val=max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..8ee18371a3009 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,6 +36,44 @@ warnings.simplefilter('ignore', NonBLASDotWarning) +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -def assert_all_finite(X, allow_nan=False): +@_deprecate_positional_args +def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -def as_float_array(X, copy=True, force_all_finite=True): +@_deprecate_positional_args +def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. 
The new dtype will be np.float32 or np.float64, depending on the original @@ -113,9 +153,9 @@ def as_float_array(X, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, - copy=copy, force_all_finite=force_all_finite, - ensure_2d=False) + return check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, copy=copy, + force_all_finite=force_all_finite, ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -349,7 +389,8 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -def check_array(array, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -620,7 +661,8 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -732,8 +774,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, - dtype=None) + y = check_array(y, accept_sparse='csr', force_all_finite=True, + ensure_2d=False, dtype=None) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) @@ -745,7 +787,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, return X, y -def column_or_1d(y, warn=False): +@_deprecate_positional_args +def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -825,7 +868,8 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -def check_symmetric(array, tol=1E-10, raise_warning=True, +@_deprecate_positional_args +def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -881,7 +925,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): +@_deprecate_positional_args +def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -974,7 +1019,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, min_val=None, max_val=None): +def check_scalar(x, name, target_type, *, min_val=None, max_val=None): """Validate scalar parameters type and value. 
Parameters @@ -1268,44 +1313,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. - - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From 839b356f45fac7724eab739dcc129a0c8f650a23 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 09:45:03 -0400 Subject: [PATCH 073/125] DOC Removes examples from exceptions docstrings (#17040) --- sklearn/exceptions.py | 56 ------------------------------------------- 1 file changed, 56 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 1b71050813d2b..7140b98e53027 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -49,24 +49,6 @@ class ChangedBehaviorWarning(UserWarning): class ConvergenceWarning(UserWarning): """Custom warning to capture convergence problems - Examples - -------- - - >>> import numpy as np - >>> import warnings - >>> from sklearn.cluster import KMeans - >>> from sklearn.exceptions import ConvergenceWarning - >>> warnings.simplefilter("always", ConvergenceWarning) - >>> X = np.asarray([[0, 0], - ... [0, 1], - ... [1, 0], - ... [1, 0]]) # last point is duplicated - >>> with warnings.catch_warnings(record=True) as w: - ... km = KMeans(n_clusters=4).fit(X) - ... print(w[-1].message) - Number of distinct clusters (3) found smaller than n_clusters (4). - Possibly due to duplicate points in X. - .. versionchanged:: 0.18 Moved from sklearn.utils. """ @@ -85,23 +67,6 @@ class DataConversionWarning(UserWarning): implementation's data-type expectations; - passes an input whose shape can be interpreted ambiguously. - Examples - -------- - >>> from sklearn.utils import validation - >>> Y = [[1],[2],[3]] - >>> import warnings - >>> from sklearn.exceptions import DataConversionWarning - >>> warnings.simplefilter('always', DataConversionWarning) - >>> with warnings.catch_warnings(record=True) as w: - ... try: - ... # will trigger warning as Y is a column-vector - ... Y = validation.column_or_1d(Y,warn=True) - ... except ValueError: - ... pass - ... print(w[-1].message) - A column-vector y was passed when a 1d array was expected. Please change - the shape of y to (n_samples, ), for example using ravel(). - .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
""" @@ -139,27 +104,6 @@ class FitFailedWarning(RuntimeWarning): and the cross-validation helper function cross_val_score to warn when there is an error while fitting the estimator. - Examples - -------- - >>> from sklearn.model_selection import GridSearchCV - >>> from sklearn.svm import LinearSVC - >>> from sklearn.exceptions import FitFailedWarning - >>> import warnings - >>> warnings.simplefilter('always', FitFailedWarning) - >>> gs = GridSearchCV(LinearSVC(), {'C': [-1, -2]}, error_score=0, cv=2) - >>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1] - >>> with warnings.catch_warnings(record=True) as w: - ... try: - ... gs.fit(X, y) # This will raise a ValueError since C is < 0 - ... except ValueError: - ... pass - ... print(repr(w[-1].message)) - FitFailedWarning('Estimator fit failed. The score on this train-test - partition for these parameters will be set to 0.000000. - Details:...Traceback (most recent call last):...ValueError: - Penalty term must be positive; got (C=-2)... - - .. versionchanged:: 0.18 Moved from sklearn.cross_validation. """ From 1b119c46937f29b1b29fb8eaaee6910beb7807d0 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 16:12:29 +0200 Subject: [PATCH 074/125] MNT fix generate_authors_table.py (#17011) --- build_tools/generate_authors_table.py | 4 ++-- doc/authors.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index 81e99856c6890..eaad1df75475c 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -11,6 +11,7 @@ import getpass import time from pathlib import Path +from os import path print("user:", file=sys.stderr) user = input() @@ -18,7 +19,7 @@ auth = (user, passwd) LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' -REPO_FOLDER = Path(__file__).parent.parent +REPO_FOLDER = Path(path.abspath(__file__)).parent.parent def get(url): @@ -100,7 +101,6 @@ def get_profile(login): 'Duchesnay': 'Edouard Duchesnay', 'Lars': 'Lars Buitinck', 'MechCoder': 'Manoj Kumar', - 'jeremiedbb': 'Jérémie Du Boisberranger', } if profile["name"] in missing_names: profile["name"] = missing_names[profile["name"]] diff --git a/doc/authors.rst b/doc/authors.rst index 6a03871d67e90..7b5426fe3128d 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -7,7 +7,7 @@

-    <p>Jérémie Du Boisberranger</p>
+    <p>Jérémie du Boisberranger</p>
From 91e942759fd7e16554cb5dc80918dce1f810b7f4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 11:23:15 -0400 Subject: [PATCH 075/125] DOC Make release highlights the first gallery section (#16952) --- doc/conf.py | 5 +++++ doc/developers/plotting.rst | 2 +- doc/modules/compose.rst | 2 +- doc/modules/ensemble.rst | 2 +- doc/modules/isotonic.rst | 4 ++-- doc/modules/kernel_approximation.rst | 6 +++--- doc/modules/kernel_ridge.rst | 8 ++++---- doc/modules/multiclass.rst | 6 +++--- doc/modules/neighbors.rst | 8 ++++---- doc/modules/outlier_detection.rst | 18 +++++++++--------- doc/modules/random_projection.rst | 10 +++++----- doc/modules/tree.rst | 8 ++++---- doc/modules/unsupervised_reduction.rst | 2 +- doc/visualizations.rst | 14 +++++++------- examples/README.txt | 5 ----- examples/miscellaneous/README.txt | 7 +++++++ .../plot_anomaly_comparison.py | 0 .../plot_changed_only_pprint_parameter.py | 0 .../plot_display_object_visualization.py | 0 .../plot_isotonic_regression.py | 0 .../plot_johnson_lindenstrauss_bound.py | 3 ++- .../plot_kernel_approximation.py | 0 .../plot_kernel_ridge_regression.py | 0 .../{ => miscellaneous}/plot_multilabel.py | 0 .../plot_multioutput_face_completion.py | 0 ...lot_partial_dependence_visualization_api.py | 4 ++-- .../plot_roc_curve_visualization_api.py | 0 sklearn/inspection/_plot/partial_dependence.py | 2 +- 28 files changed, 62 insertions(+), 54 deletions(-) create mode 100644 examples/miscellaneous/README.txt rename examples/{ => miscellaneous}/plot_anomaly_comparison.py (100%) rename examples/{ => miscellaneous}/plot_changed_only_pprint_parameter.py (100%) rename examples/{ => miscellaneous}/plot_display_object_visualization.py (100%) rename examples/{ => miscellaneous}/plot_isotonic_regression.py (100%) rename examples/{ => miscellaneous}/plot_johnson_lindenstrauss_bound.py (99%) rename examples/{ => miscellaneous}/plot_kernel_approximation.py (100%) rename examples/{ => miscellaneous}/plot_kernel_ridge_regression.py (100%) rename examples/{ => miscellaneous}/plot_multilabel.py (100%) rename examples/{ => miscellaneous}/plot_multioutput_face_completion.py (100%) rename examples/{ => miscellaneous}/plot_partial_dependence_visualization_api.py (98%) rename examples/{ => miscellaneous}/plot_roc_curve_visualization_api.py (100%) diff --git a/doc/conf.py b/doc/conf.py index a13ed14216de4..1783a676b6d01 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -281,6 +281,11 @@ def __repr__(self): def __call__(self, directory): src_path = os.path.normpath(os.path.join(self.src_dir, directory)) + + # Forces Release Highlights to the top + if os.path.basename(src_path) == "release_highlights": + return "0" + readme = os.path.join(src_path, "README.txt") try: diff --git a/doc/developers/plotting.rst b/doc/developers/plotting.rst index 98af195b56453..7a2f6ebf69415 100644 --- a/doc/developers/plotting.rst +++ b/doc/developers/plotting.rst @@ -50,7 +50,7 @@ attributes:: estimator.__class__.__name__) return viz.plot(ax=ax, name=name, **kwargs) -Read more in :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` +Read more in :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` and the :ref:`User Guide `. 
Plotting with Multiple Axes diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 51a933dcbee47..cd29b14b1f081 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -144,7 +144,7 @@ or by name:: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` * :ref:`sphx_glr_auto_examples_compose_plot_digits_pipe.py` - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index bff08f542ce11..3cf8987fcfd5a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -246,7 +246,7 @@ amount of time (e.g., on large datasets). * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_iris.py` * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References diff --git a/doc/modules/isotonic.rst b/doc/modules/isotonic.rst index 1f54dcfa50bad..8967ef18afcb3 100644 --- a/doc/modules/isotonic.rst +++ b/doc/modules/isotonic.rst @@ -28,6 +28,6 @@ correlation coefficient for predicting to unseen data. The predictions of :class:`IsotonicRegression` thus form a function that is piecewise linear: -.. figure:: ../auto_examples/images/sphx_glr_plot_isotonic_regression_001.png - :target: ../auto_examples/plot_isotonic_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_isotonic_regression_001.png + :target: ../auto_examples/miscellaneous/plot_isotonic_regression.html :align: center diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index 77354d5afaf1d..fb3843c6bc045 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -84,8 +84,8 @@ For a given value of ``n_components`` :class:`RBFSampler` is often less accurate as :class:`Nystroem`. :class:`RBFSampler` is cheaper to compute, though, making use of larger feature spaces more efficient. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_approximation_002.png - :target: ../auto_examples/plot_kernel_approximation.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_approximation_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_approximation.html :scale: 50% :align: center @@ -93,7 +93,7 @@ use of larger feature spaces more efficient. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` .. _additive_chi_kernel_approx: diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index a67733b1ca5a5..286e9d4ac5322 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -35,8 +35,8 @@ However, prediction of 100000 target values is more than three times faster with :class:`~sklearn.svm.SVR` since it has learned a sparse model using only approximately 1/3 of the 100 training datapoints as support vectors. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_001.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. 
figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_001.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center The next figure compares the time for fitting and prediction of @@ -51,8 +51,8 @@ prediction time depends on the parameters :math:`\epsilon` and :math:`C` of the :class:`~sklearn.svm.SVR`; :math:`\epsilon = 0` would correspond to a dense model. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_002.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 606b4246a0b88..1f6556bfa54f3 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -311,15 +311,15 @@ To use this feature, feed the classifier an indicator matrix, in which cell [i, j] indicates the presence of label j in sample i. -.. figure:: ../auto_examples/images/sphx_glr_plot_multilabel_001.png - :target: ../auto_examples/plot_multilabel.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multilabel_001.png + :target: ../auto_examples/miscellaneous/plot_multilabel.html :align: center :scale: 75% .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_multilabel.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multilabel.py` .. _ovo_classification: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 9aa27a53501b8..397fdd1dd9e90 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -230,12 +230,12 @@ which will be used to compute the weights. :scale: 75 The use of multi-output nearest neighbors for regression is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center @@ -245,7 +245,7 @@ the lower half of those faces. * :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`: an example of regression using nearest neighbors. - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`: an example of + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`: an example of multi-output regression using nearest neighbors. diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index c061feb0b1d7c..76bd85f3bb1c8 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -98,8 +98,8 @@ Outlier Factor (LOF) does not show a decision boundary in black as it has no predict method to be applied on new data when it is used for outlier detection. -.. figure:: ../auto_examples/images/sphx_glr_plot_anomaly_comparison_001.png - :target: ../auto_examples/plot_anomaly_comparison.html +.. 
figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_anomaly_comparison_001.png + :target: ../auto_examples/miscellaneous/plot_anomaly_comparison.html :align: center :scale: 50 @@ -109,12 +109,12 @@ The :class:`svm.OneClassSVM` is known to be sensitive to outliers and thus does not perform very well for outlier detection. Finally, :class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. For more details on the different estimators refer to the example -:ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` and the sections -hereunder. +:ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the +sections hereunder. .. topic:: Examples: - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` for a comparison of the :class:`svm.OneClassSVM`, the :class:`ensemble.IsolationForest`, the :class:`neighbors.LocalOutlierFactor` and @@ -270,8 +270,8 @@ allows you to add more trees to an already fitted model:: * See :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py` for an illustration of the use of IsolationForest. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison of :class:`ensemble.IsolationForest` with + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison of :class:`ensemble.IsolationForest` with :class:`neighbors.LocalOutlierFactor`, :class:`svm.OneClassSVM` (tuned to perform like an outlier detection method) and a covariance-based outlier detection with @@ -339,8 +339,8 @@ This strategy is illustrated below. * See :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py` for an illustration of the use of :class:`neighbors.LocalOutlierFactor`. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison with other anomaly detection methods. + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison with other anomaly detection methods. .. topic:: References: diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst index eb8d6de984985..cd3c129cfad45 100644 --- a/doc/modules/random_projection.rst +++ b/doc/modules/random_projection.rst @@ -64,19 +64,19 @@ bounded distortion introduced by the random projection:: >>> johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1) array([ 7894, 9868, 11841]) -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center .. topic:: Example: - * See :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` for a theoretical explication on the Johnson-Lindenstrauss lemma and an empirical validation using sparse random matrices. 
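As a quick, self-contained illustration of the workflow that example documents, the following sketch (not part of the patch; the data shape, seed and ``eps`` are arbitrary choices) derives the minimal embedding dimension from the Johnson-Lindenstrauss bound and checks the resulting distortion after a sparse random projection::

    import numpy as np
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.random_projection import (SparseRandomProjection,
                                           johnson_lindenstrauss_min_dim)

    rng = np.random.RandomState(42)
    X = rng.rand(100, 10000)                  # toy high-dimensional data

    # Smallest embedding dimension guaranteeing ~10% pairwise distortion
    n_components = johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=0.1)

    X_new = SparseRandomProjection(n_components=n_components,
                                   random_state=42).fit_transform(X)

    # Pairwise distances should be roughly preserved: ratios close to 1
    d_orig = euclidean_distances(X)
    d_proj = euclidean_distances(X_new)
    mask = d_orig > 0                         # ignore the zero diagonal
    ratios = d_proj[mask] / d_orig[mask]
    print(ratios.min(), ratios.max())         # typically within 1 +/- eps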
diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index af6fc4e1edfe9..e12b63adb48c4 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -280,19 +280,19 @@ X is a single real value and the outputs Y are the sine and cosine of X. :align: center The use of multi-output trees for classification is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center .. topic:: Examples: * :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References: diff --git a/doc/modules/unsupervised_reduction.rst b/doc/modules/unsupervised_reduction.rst index 3a85b8e53b553..6e16886064cfc 100644 --- a/doc/modules/unsupervised_reduction.rst +++ b/doc/modules/unsupervised_reduction.rst @@ -37,7 +37,7 @@ documentation: :ref:`random_projection`. .. topic:: **Examples** - * :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` Feature agglomeration ------------------------ diff --git a/doc/visualizations.rst b/doc/visualizations.rst index e50a9a90a0b84..ebb98700d9e08 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -24,8 +24,8 @@ ROC curve for a fitted support vector machine: svc_disp = plot_roc_curve(svc, X_test, y_test) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_001.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_001.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -48,8 +48,8 @@ method of the `Display` object. rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8) svc_disp.plot(ax=ax, alpha=0.8) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_002.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_002.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -58,9 +58,9 @@ values of the curves. .. 
topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` - * :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` - * :ref:`sphx_glr_auto_examples_plot_display_object_visualization.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_display_object_visualization.py` Available Plotting Utilities ============================ diff --git a/examples/README.txt b/examples/README.txt index 4ee6efc46d1dd..958de667a5c69 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -2,8 +2,3 @@ Examples ======== - -Miscellaneous examples ----------------------- - -Miscellaneous and introductory examples for scikit-learn. diff --git a/examples/miscellaneous/README.txt b/examples/miscellaneous/README.txt new file mode 100644 index 0000000000000..4e44ceee95809 --- /dev/null +++ b/examples/miscellaneous/README.txt @@ -0,0 +1,7 @@ +.. _miscellaneous_examples: + +Miscellaneous +------------- + +Miscellaneous and introductory examples for scikit-learn. + diff --git a/examples/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py similarity index 100% rename from examples/plot_anomaly_comparison.py rename to examples/miscellaneous/plot_anomaly_comparison.py diff --git a/examples/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py similarity index 100% rename from examples/plot_changed_only_pprint_parameter.py rename to examples/miscellaneous/plot_changed_only_pprint_parameter.py diff --git a/examples/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py similarity index 100% rename from examples/plot_display_object_visualization.py rename to examples/miscellaneous/plot_display_object_visualization.py diff --git a/examples/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py similarity index 100% rename from examples/plot_isotonic_regression.py rename to examples/miscellaneous/plot_isotonic_regression.py diff --git a/examples/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py similarity index 99% rename from examples/plot_johnson_lindenstrauss_bound.py rename to examples/miscellaneous/plot_johnson_lindenstrauss_bound.py index b981c14fbf132..9d369c6c6d46d 100644 --- a/examples/plot_johnson_lindenstrauss_bound.py +++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py @@ -8,7 +8,8 @@ dataset can be randomly projected into a lower dimensional Euclidean space while controlling the distortion in the pairwise distances. -.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma +.. 
_`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\ + Johnson%E2%80%93Lindenstrauss_lemma """ print(__doc__) diff --git a/examples/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py similarity index 100% rename from examples/plot_kernel_approximation.py rename to examples/miscellaneous/plot_kernel_approximation.py diff --git a/examples/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py similarity index 100% rename from examples/plot_kernel_ridge_regression.py rename to examples/miscellaneous/plot_kernel_ridge_regression.py diff --git a/examples/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py similarity index 100% rename from examples/plot_multilabel.py rename to examples/miscellaneous/plot_multilabel.py diff --git a/examples/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py similarity index 100% rename from examples/plot_multioutput_face_completion.py rename to examples/miscellaneous/plot_multioutput_face_completion.py diff --git a/examples/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py similarity index 98% rename from examples/plot_partial_dependence_visualization_api.py rename to examples/miscellaneous/plot_partial_dependence_visualization_api.py index 8ccb225afc2d0..761dad8b1e1fa 100644 --- a/examples/plot_partial_dependence_visualization_api.py +++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py @@ -10,9 +10,9 @@ .. note:: - See also :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` + See also :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` -""" +""" # noqa print(__doc__) import pandas as pd diff --git a/examples/plot_roc_curve_visualization_api.py b/examples/miscellaneous/plot_roc_curve_visualization_api.py similarity index 100% rename from examples/plot_roc_curve_visualization_api.py rename to examples/miscellaneous/plot_roc_curve_visualization_api.py diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 812005f5ab2ae..e02717e76dce3 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -343,7 +343,7 @@ class PartialDependenceDisplay: stored as attributes. Read more in - :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` and the :ref:`User Guide `. .. versionadded:: 0.22 From a1261a7e18c19bb3dfc8d739a6512c6f671d9e79 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 13:58:33 -0400 Subject: [PATCH 076/125] DOC kwonly section in whatsnew (#17059) --- doc/whats_new/v0.23.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4357845885e3f..6d9fbfeeebc0c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,6 +14,20 @@ Version 0.23.0 Put the changes in their relevant module. +Enforcing keyword-only arguments +-------------------------------- + +In an effort to promote clear and non-ambiguous use of the library, most +constructor and function parameters are now expected to be passed as keyword +arguments (i.e. using the `param=value` syntax) instead of positional. 
To +ease the transition, a `FutureWarning` is raised if a keyword-only parameter +is used as positional. In version 0.25, these parameters will be strictly +keyword-only, and a `TypeError` will be raised. +:issue:`15005` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, and +`Nicolas Hug`_. See `SLEP009 +`_ +for more details. + Changed models -------------- @@ -536,12 +550,6 @@ Miscellaneous error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. -- |API| Most constructor and function parameters are now expected to be passed - as a keyword and not positional. :issue:`15005` by `Joel Nothman`_, - `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. See `SLEP009 - `_ - for more details. - Code and Documentation Contributors ----------------------------------- From 1ba06518c5c4c7c1865110a8f34c4da64d8e478f Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 20:41:35 +0200 Subject: [PATCH 077/125] API make load_* args in datasets kwarg only (#16719) * API male load_* args in datasets kwarg only * more loaders * pep8 * fix test_omp usage * fix some usages * Update sklearn/datasets/_samples_generator.py Co-Authored-By: Thomas J Fan Co-authored-by: Nicolas Hug Co-authored-by: Thomas J Fan --- sklearn/datasets/_base.py | 25 ++++++--- sklearn/datasets/_california_housing.py | 5 +- sklearn/datasets/_covtype.py | 5 +- sklearn/datasets/_kddcup99.py | 5 +- sklearn/datasets/_lfw.py | 8 ++- sklearn/datasets/_olivetti_faces.py | 4 +- sklearn/datasets/_openml.py | 4 +- sklearn/datasets/_rcv1.py | 4 +- sklearn/datasets/_samples_generator.py | 64 +++++++++++++++------- sklearn/datasets/_species_distributions.py | 4 +- sklearn/datasets/_svmlight_format_io.py | 20 +++++-- sklearn/datasets/_twenty_newsgroups.py | 7 ++- sklearn/datasets/tests/test_base.py | 2 +- sklearn/linear_model/tests/test_omp.py | 7 ++- 14 files changed, 117 insertions(+), 47 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 9737a5f67891a..2402fc3a069dc 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,6 +17,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_pandas_support +from ..utils.validation import _deprecate_positional_args import numpy as np @@ -80,7 +81,8 @@ def _convert_data_dataframe(caller_name, data, target, return combined_df, X, y -def load_files(container_path, description=None, categories=None, +@_deprecate_positional_args +def load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): """Load text files with categories as subfolder names. @@ -267,7 +269,8 @@ def load_data(module_path, data_file_name): return data, target, target_names -def load_wine(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_wine(*, return_X_y=False, as_frame=False): """Load and return the wine dataset (classification). .. versionadded:: 0.18 @@ -381,7 +384,8 @@ def load_wine(return_X_y=False, as_frame=False): feature_names=feature_names) -def load_iris(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_iris(*, return_X_y=False, as_frame=False): """Load and return the iris dataset (classification). 
The iris dataset is a classic and very easy multi-class classification @@ -495,7 +499,8 @@ def load_iris(return_X_y=False, as_frame=False): filename=iris_csv_filename) -def load_breast_cancer(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_breast_cancer(*, return_X_y=False, as_frame=False): """Load and return the breast cancer wisconsin dataset (classification). The breast cancer dataset is a classic and very easy binary classification @@ -619,7 +624,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename=csv_filename) -def load_digits(n_class=10, return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_digits(*, n_class=10, return_X_y=False, as_frame=False): """Load and return the digits dataset (classification). Each datapoint is a 8x8 image of a digit. @@ -742,7 +748,8 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): DESCR=descr) -def load_diabetes(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_diabetes(*, return_X_y=False, as_frame=False): """Load and return the diabetes dataset (regression). ============== ================== @@ -834,7 +841,8 @@ def load_diabetes(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_linnerud(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_linnerud(*, return_X_y=False, as_frame=False): """Load and return the physical excercise linnerud dataset. This dataset is suitable for multi-ouput regression tasks. @@ -937,7 +945,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_boston(return_X_y=False): +@_deprecate_positional_args +def load_boston(*, return_X_y=False): """Load and return the boston house-prices dataset (regression). ============== ============== diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index e3df2124aab2b..107458e2d515d 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -36,6 +36,8 @@ from ._base import _pkl_filepath from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz @@ -48,7 +50,8 @@ logger = logging.getLogger(__name__) -def fetch_california_housing(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False): """Load the California housing dataset (regression). diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 6b23f913e05a7..de93b22ac4f56 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -28,6 +28,8 @@ from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args + # The original data can be found in: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz @@ -40,7 +42,8 @@ logger = logging.getLogger(__name__) -def fetch_covtype(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the covertype dataset (classification). 
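For the dataset loaders above, the practical effect of ``_deprecate_positional_args`` is the calling convention described in the changelog earlier in this series. A minimal usage sketch (not part of the patch)::

    from sklearn.datasets import load_digits

    # Recommended: keyword form for everything declared after the bare '*'.
    digits = load_digits(n_class=9)

    # Positional form: still accepted in 0.23 but emits a FutureWarning,
    # and is slated to raise a TypeError once the deprecation period ends.
    digits = load_digits(9)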
diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index c0ba00fa46f04..4e2f6856d89b1 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -23,6 +23,8 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz @@ -43,7 +45,8 @@ logger = logging.getLogger(__name__) -def fetch_kddcup99(subset=None, data_home=None, shuffle=False, +@_deprecate_positional_args +def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False): """Load the kddcup99 dataset (classification). diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index 3dc3833db3417..b8db75010e8f2 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -20,6 +20,7 @@ from ._base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -215,7 +216,8 @@ def _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, return faces, target, target_names -def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, return_X_y=False): @@ -385,7 +387,9 @@ def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, return pairs, target, np.array(['Different persons', 'Same person']) -def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, + resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification). diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index d5f163d468214..dfa459880a5c4 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -25,6 +25,7 @@ from ._base import RemoteFileMetadata from ._base import _pkl_filepath from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat @@ -35,7 +36,8 @@ 'd5fca46a4b8906c18e454d41af987794')) -def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, +@_deprecate_positional_args +def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False): """Load the Olivetti faces data-set from AT&T (classification). 
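The decorator itself is only imported in the hunks above; its definition is not part of them. To make the mechanism concrete, here is a rough, illustrative sketch of how such a decorator can be written. The name, structure and warning text are assumptions for the example, not scikit-learn's actual ``_deprecate_positional_args`` implementation::

    import warnings
    from functools import wraps
    from inspect import Parameter, signature


    def warn_on_positional(f):
        """Illustrative sketch of a keyword-only deprecation decorator.

        Not scikit-learn's ``_deprecate_positional_args``; it only mimics
        the behaviour described in the changelog above.
        """
        sig = signature(f)
        # Parameters declared before the bare '*' may stay positional.
        n_allowed = sum(p.kind == Parameter.POSITIONAL_OR_KEYWORD
                        for p in sig.parameters.values())
        kwonly = [name for name, p in sig.parameters.items()
                  if p.kind == Parameter.KEYWORD_ONLY]

        @wraps(f)
        def wrapper(*args, **kwargs):
            extra = args[n_allowed:]
            if extra:
                names = kwonly[:len(extra)]
                warnings.warn("Pass %s as keyword arguments; passing them "
                              "positionally is deprecated."
                              % ", ".join(names), FutureWarning)
                kwargs.update(zip(names, extra))
                args = args[:n_allowed]
            return f(*args, **kwargs)

        return wrapper

Applied to a signature like ``def fetch_covtype(*, data_home=None, ...)``, such a decorator would re-map a positional call into the keyword form while emitting the ``FutureWarning``.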
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 4040641a17574..112cd9c0e525e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -23,6 +23,7 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa +from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -608,7 +609,8 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, +@_deprecate_positional_args +def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, as_frame=False): """Fetch dataset from openml by name or dataset id. diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 4f1c5cc4af199..abb9881700614 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -25,6 +25,7 @@ from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args # The original vectorized data can be found at: @@ -75,7 +76,8 @@ logger = logging.getLogger(__name__) -def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, +@_deprecate_positional_args +def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 7642b6fb7dc59..ee3ac6ab2827f 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,6 +18,7 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils.random import sample_without_replacement +from ..utils.validation import _deprecate_positional_args def _generate_hypercube(samples, dimensions, rng): @@ -33,7 +34,8 @@ def _generate_hypercube(samples, dimensions, rng): return out -def make_classification(n_samples=100, n_features=20, n_informative=2, +@_deprecate_positional_args +def make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, @@ -261,7 +263,9 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, return X, y -def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, +@_deprecate_positional_args +def make_multilabel_classification(n_samples=100, n_features=20, *, + n_classes=5, n_labels=2, length=50, allow_unlabeled=True, sparse=False, return_indicator='dense', return_distributions=False, @@ -422,7 +426,8 @@ def sample_example(): return X, Y -def make_hastie_10_2(n_samples=12000, random_state=None): +@_deprecate_positional_args +def make_hastie_10_2(n_samples=12000, *, random_state=None): """Generates data for binary classification used in Hastie et al. 2009, Example 10.2. 
@@ -470,7 +475,8 @@ def make_hastie_10_2(n_samples=12000, random_state=None): return X, y -def make_regression(n_samples=100, n_features=100, n_informative=10, +@_deprecate_positional_args +def make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None): @@ -592,7 +598,8 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, return X, y -def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, +@_deprecate_positional_args +def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=.8): """Make a large circle containing a smaller circle in 2d. @@ -668,7 +675,8 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, return X, y -def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): +@_deprecate_positional_args +def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): """Make two interleaving half circles A simple toy dataset to visualize clustering and classification @@ -731,7 +739,8 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): return X, y -def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, +@_deprecate_positional_args +def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False): """Generate isotropic Gaussian blobs for clustering. @@ -889,7 +898,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, return X, y -def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, + random_state=None): """Generate the "Friedman #1" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -951,7 +962,8 @@ def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): return X, y -def make_friedman2(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #2" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1016,7 +1028,8 @@ def make_friedman2(n_samples=100, noise=0.0, random_state=None): return X, y -def make_friedman3(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #3" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1080,7 +1093,8 @@ def make_friedman3(n_samples=100, noise=0.0, random_state=None): return X, y -def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, +@_deprecate_positional_args +def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None): """Generate a mostly low rank matrix with bell-shaped singular values @@ -1149,7 +1163,8 @@ def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, return np.dot(np.dot(u, s), v.T) -def make_sparse_coded_signal(n_samples, n_components, n_features, +@_deprecate_positional_args +def make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None): """Generate a signal as a sparse combination of dictionary elements. 
@@ -1211,7 +1226,9 @@ def make_sparse_coded_signal(n_samples, n_components, n_features, return map(np.squeeze, (Y, D, X)) -def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): +@_deprecate_positional_args +def make_sparse_uncorrelated(n_samples=100, n_features=10, *, + random_state=None): """Generate a random regression problem with sparse uncorrelated design This dataset is described in Celeux et al [1]. as:: @@ -1262,7 +1279,8 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): return X, y -def make_spd_matrix(n_dim, random_state=None): +@_deprecate_positional_args +def make_spd_matrix(n_dim, *, random_state=None): """Generate a random symmetric, positive-definite matrix. Read more in the :ref:`User Guide `. @@ -1295,7 +1313,8 @@ def make_spd_matrix(n_dim, random_state=None): return X -def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, +@_deprecate_positional_args +def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=.1, largest_coef=.9, random_state=None): """Generate a sparse symmetric definite positive matrix. @@ -1369,7 +1388,8 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, return prec -def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """Generate a swiss roll dataset. Read more in the :ref:`User Guide `. @@ -1421,7 +1441,8 @@ def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): return X, t -def make_s_curve(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): """Generate an S curve dataset. Read more in the :ref:`User Guide `. @@ -1463,7 +1484,8 @@ def make_s_curve(n_samples=100, noise=0.0, random_state=None): return X, t -def make_gaussian_quantiles(mean=None, cov=1., n_samples=100, +@_deprecate_positional_args +def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None): r"""Generate isotropic Gaussian and label samples by quantile @@ -1558,7 +1580,8 @@ def _shuffle(data, random_state=None): return result, row_idx, col_idx -def make_biclusters(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with constant block diagonal structure for biclustering. @@ -1649,7 +1672,8 @@ def make_biclusters(shape, n_clusters, noise=0.0, minval=10, return result, rows, cols -def make_checkerboard(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with block checkerboard structure for biclustering. 
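Note that for generators such as ``make_sparse_coded_signal``, parameters without a default (``n_components``, ``n_features``, ``n_nonzero_coefs``) become required keyword arguments after this change, while anything declared before the bare '*' can still be passed positionally. A short sketch of the resulting call style (values are arbitrary)::

    from sklearn.datasets import make_blobs, make_sparse_coded_signal

    # Required parameters after the '*' must now be spelled out by name.
    y, D, code = make_sparse_coded_signal(n_samples=5, n_components=30,
                                          n_features=20, n_nonzero_coefs=3,
                                          random_state=0)

    # Leading parameters (n_samples, n_features) may still be positional.
    X, labels = make_blobs(150, 2, centers=3, random_state=0)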
diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 7f621d1de74eb..e17ab419c5d7e 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -50,6 +50,7 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args from ._base import _pkl_filepath # The original data can be found at: @@ -137,7 +138,8 @@ def construct_grids(batch): return (xgrid, ygrid) -def fetch_species_distributions(data_home=None, +@_deprecate_positional_args +def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 91bb35ff2ec75..8360ee4402b40 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -25,6 +25,7 @@ from .. import __version__ from ..utils import check_array, IS_PYPY +from ..utils.validation import _deprecate_positional_args if not IS_PYPY: from ._svmlight_format_fast import _load_svmlight_file @@ -37,7 +38,8 @@ def _load_svmlight_file(*args, **kwargs): 'for the status updates).') -def load_svmlight_file(f, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load datasets in the svmlight / libsvm format into sparse CSR matrix @@ -151,8 +153,13 @@ def get_data(): X, y = get_data() """ - return tuple(load_svmlight_files([f], n_features, dtype, multilabel, - zero_based, query_id, offset, length)) + return tuple(load_svmlight_files([f], n_features=n_features, + dtype=dtype, + multilabel=multilabel, + zero_based=zero_based, + query_id=query_id, + offset=offset, + length=length)) def _gen_open(f): @@ -196,7 +203,8 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id, return data, indices, indptr, labels, query -def load_svmlight_files(files, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load dataset from multiple files in SVMlight format @@ -380,7 +388,9 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): f.write((line_pattern % feat).encode('ascii')) -def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, +@_deprecate_positional_args +def dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, + query_id=None, multilabel=False): """Dump the dataset in svmlight / libsvm file format. diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index ebbd191069c49..c5d322c88ef0c 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -45,6 +45,7 @@ from ..feature_extraction.text import CountVectorizer from .. 
import preprocessing from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -146,7 +147,8 @@ def strip_newsgroup_footer(text): return text -def fetch_20newsgroups(data_home=None, subset='train', categories=None, +@_deprecate_positional_args +def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False): @@ -322,7 +324,8 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, return data -def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None, +@_deprecate_positional_args +def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True): """Load the 20 newsgroups dataset and vectorize it into token counts \ diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 224538b181696..3ec60074a4015 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -152,7 +152,7 @@ def test_load_digits(): def test_load_digits_n_class_lt_10(): - digits = load_digits(9) + digits = load_digits(n_class=9) assert digits.data.shape == (1617, 64) assert numpy.unique(digits.target).size == 9 diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 791983ba62cc2..f3f3080aebe66 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -18,8 +18,11 @@ from sklearn.datasets import make_sparse_coded_signal n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3 -y, X, gamma = make_sparse_coded_signal(n_targets, n_features, n_samples, - n_nonzero_coefs, random_state=0) +y, X, gamma = make_sparse_coded_signal(n_samples=n_targets, + n_components=n_features, + n_features=n_samples, + n_nonzero_coefs=n_nonzero_coefs, + random_state=0) # Make X not of norm 1 for testing X *= 10 y *= 10 From 5b2c931a994c9f4e39d202efd3b8a3de44309728 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 28 Apr 2020 05:38:11 +1000 Subject: [PATCH 078/125] API pairwise_distances will require explicit V/VI param if Y is given (#16993) * API pairwise_distances will require explicit V/VI param if Y is given Deprecation until version 0.25. The current approach in `_precompute_metric_params` (https://github.com/scikit-learn/scikit-learn/blob/f82a2cb33871a67b36150647ece1c7e56d3132bb/sklearn/metrics/pairwise.py#L1429-L1444) means that we may be applying a different metric at training and test time. Ideally we'd have a framework for fitting a metric on some specific training data, but in the meantime, this deprecation stops users making mistakes. * DOC update what's new * Update sklearn/metrics/tests/test_pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/tests/test_pairwise.py Co-Authored-By: Thomas J Fan Co-authored-by: Thomas J Fan --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/metrics/pairwise.py | 6 ++++++ sklearn/metrics/tests/test_pairwise.py | 12 ++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6d9fbfeeebc0c..e0a3927f41405 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -397,6 +397,12 @@ Changelog or 'd'). 
:pr:`16159` by :user:`Rick Mackenbach ` and `Thomas Fan`_. +- |API| From version 0.25, :func:`metrics.pairwise.pairwise_distances` will no + longer automatically compute the ``VI`` parameter for Mahalanobis distance + and the ``V`` parameter for seuclidean distance if ``Y`` is passed. The user + will be expected to compute this parameter on the training data of their + choice and pass it to `pairwise_distances`. :pr:`16993` by `Joel Nothman`_. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9d4107ebd66d6..20350345f54da 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1441,12 +1441,18 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1) else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='seuclidean' will require V to be " + "specified if Y is passed.", FutureWarning) V = np.var(np.vstack([X, Y]), axis=0, ddof=1) return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='mahalanobis' will require VI to be " + "specified if Y is passed.", FutureWarning) VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index f2c7a307571bc..d7a96de12c9e3 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1281,8 +1281,16 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - dist = np.vstack(tuple(dist_function(X, Y, - metric=metric, n_jobs=n_jobs))) + # TODO: Remove warn_checker in 0.25 + if y_is_x: + warn_checker = pytest.warns(None) + else: + warn_checker = pytest.warns(FutureWarning, + match="to be specified if Y is passed") + with warn_checker: + dist = np.vstack(tuple(dist_function(X, Y, + metric=metric, + n_jobs=n_jobs))) assert_allclose(dist, expected_dist_explicit_params) assert_allclose(dist, expected_dist_default_params) From 41b18fea37e59a84bb9219c32c15c36432afc9ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 27 Apr 2020 23:32:19 +0200 Subject: [PATCH 079/125] ENH Added n_components_ to SparsePCA and MiniBatchSparsePCA (#16981) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add n_components_ attribute to SparcePCA and MiniBatchSparsePCA * Add n_components_ attribute to SparcePCA and MiniBatchSparsePCA * Add whatsnew entry * Fix documentation * Apply suggestions from code review Co-Authored-By: Thomas J Fan * Add suggested changes * Update test_sparse_pca.py Co-authored-by: Mateusz Górski Co-authored-by: Thomas J Fan --- doc/whats_new/v0.23.rst | 3 +++ sklearn/decomposition/_sparse_pca.py | 12 ++++++++++++ sklearn/decomposition/tests/test_sparse_pca.py | 15 +++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e0a3927f41405..b5b95e745f456 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,6 +215,9 @@ Changelog raise `invalid value encountered in multiply` during `fit`. :pr:`16718` by :user:`Gui Miotto `. 
+- |Feature| Added `n_components_` attribute to :class:'decomposition.SparsePCA' + and :class:'MiniBatchSparsePCA'. :pr:'16981' by :user:'Mateusz Górski ' + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index cf1f5a2608e1c..53f3ed3bf23ca 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -103,6 +103,11 @@ class SparsePCA(TransformerMixin, BaseEstimator): error_ : array Vector of errors at each iteration. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. @@ -197,6 +202,7 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) self.error_ = E return self @@ -312,6 +318,11 @@ class MiniBatchSparsePCA(SparsePCA): components_ : array, [n_components, n_features] Sparse components extracted from the data. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. @@ -403,5 +414,6 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) return self diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index f3d14e31f3e1b..9ee0339a192b4 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -207,3 +207,18 @@ def test_spca_error_unormalized_components(spca): err_msg = "normalize_components=False is not supported starting " with pytest.raises(NotImplementedError, match=err_msg): spca(normalize_components=False).fit(Y) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +@pytest.mark.parametrize("n_components", [None, 3]) +def test_spca_n_components_(SPCA, n_components): + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=n_components).fit(X) + + if n_components is not None: + assert model.n_components_ == n_components + else: + assert model.n_components_ == n_features From fc0041546a6e186bbacdf96e200fc863d620c44d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 18:15:52 -0400 Subject: [PATCH 080/125] Deprecate class support for check_estimator (#17032) and parametrize_with_checks --- doc/developers/develop.rst | 8 ++-- doc/whats_new/v0.23.rst | 5 +++ sklearn/tests/test_common.py | 22 +++++++++- sklearn/utils/estimator_checks.py | 45 +++++++++++++++++--- sklearn/utils/tests/test_estimator_checks.py | 4 ++ 5 files changed, 74 insertions(+), 10 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index f17c58cee0d7f..13d2010ca7319 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -246,13 +246,13 @@ whether it is just for you or for contributing it to scikit-learn, there are several internals of scikit-learn that you should be aware of in addition to the scikit-learn API outlined above. 
You can check whether your estimator adheres to the scikit-learn interface and standards by running -:func:`utils.estimator_checks.check_estimator` on the class or using -:func:`~sklearn.utils.parametrize_with_checks` pytest decorator (see its -docstring for details and possible interactions with `pytest`):: +:func:`~sklearn.utils.estimator_checks.check_estimator` on an instance. The +:func:`~sklearn.utils.parametrize_with_checks` pytest decorator can also be +used (see its docstring for details and possible interactions with `pytest`):: >>> from sklearn.utils.estimator_checks import check_estimator >>> from sklearn.svm import LinearSVC - >>> check_estimator(LinearSVC) # passes + >>> check_estimator(LinearSVC()) # passes The main motivation to make a class compatible to the scikit-learn estimator interface might be that you want to use it together with model evaluation and diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b5b95e745f456..b596e8a1797be 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -534,6 +534,11 @@ Changelog matrix from a pandas DataFrame that contains only `SparseArray`s. :pr:`16728` by `Thomas Fan`_. +- |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and + :func:`utils.estimator_checks.parametrize_with_checks` is now deprecated, + and support for classes will be removed in 0.24. Pass instances instead. + :pr:`17032` by `Nicolas Hug`_. + :mod:`sklearn.cluster` ...................... diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index af98c1bc50a74..73c99b0483de8 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -48,7 +48,9 @@ def test_all_estimator_no_base_class(): assert not name.lower().startswith('base'), msg +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_estimator_cls_parameterize_with_checks(): + # TODO: remove test in 0.24 # Non-regression test for #16707 to ensure that parametrize_with_checks # works with estimator classes param_checks = parametrize_with_checks([LogisticRegression]) @@ -105,7 +107,7 @@ def _tested_estimators(): yield estimator -@parametrize_with_checks(_tested_estimators()) +@parametrize_with_checks(list(_tested_estimators())) def test_estimators(estimator, check, request): # Common tests for estimator instances with ignore_warnings(category=(FutureWarning, @@ -115,7 +117,9 @@ def test_estimators(estimator, check, request): check(estimator) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator_generate_only(): + # TODO in 0.24: remove checks on passing a class estimator_cls_gen_checks = check_estimator(LogisticRegression, generate_only=True) all_instance_gen_checks = check_estimator(LogisticRegression(), @@ -238,3 +242,19 @@ def test_all_tests_are_importable(): '__init__.py or an add_subpackage directive ' 'in the parent ' 'setup.py'.format(missing_tests)) + + +# TODO: remove in 0.24 +def test_class_support_deprecated(): + # Make sure passing classes to check_estimator or parametrize_with_checks + # is deprecated + + msg = "Passing a class is deprecated" + with pytest.warns(FutureWarning, match=msg): + check_estimator(LogisticRegression) + + with pytest.warns(FutureWarning, match=msg): + parametrize_with_checks([LogisticRegression]) + + # Make sure check_parameters_default_constructible accepts instances now + check_parameters_default_constructible('name', LogisticRegression()) diff --git a/sklearn/utils/estimator_checks.py 
b/sklearn/utils/estimator_checks.py index efac2aca2a2df..ec28cb22919f0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from ..linear_model import Ridge from ..base import (clone, ClusterMixin, is_classifier, is_regressor, - RegressorMixin, is_outlier_detector) + RegressorMixin, is_outlier_detector, BaseEstimator) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection @@ -333,12 +333,15 @@ def _construct_instance(Estimator): return estimator +# TODO: probably not needed anymore in 0.24 since _generate_class_checks should +# be removed too. Just put this in check_estimator() def _generate_instance_checks(name, estimator): """Generate instance checks.""" yield from ((estimator, partial(check, name)) for check in _yield_all_checks(name, estimator)) +# TODO: remove this in 0.24 def _generate_class_checks(Estimator): """Generate class checks.""" name = Estimator.__name__ @@ -353,6 +356,8 @@ def _mark_xfail_checks(estimator, check, pytest): if isinstance(estimator, type): # try to construct estimator instance, if it is unable to then # return the estimator class, ignoring the tag + # TODO: remove this if block in 0.24 since passing instances isn't + # supported anymore try: estimator = _construct_instance(estimator) except Exception: @@ -385,6 +390,10 @@ def parametrize_with_checks(estimators): estimators : list of estimators objects or classes Estimators to generated checks for. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + Returns ------- decorator : `pytest.mark.parametrize` @@ -395,13 +404,21 @@ def parametrize_with_checks(estimators): >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.tree import DecisionTreeRegressor - >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) + >>> @parametrize_with_checks([LogisticRegression(), + ... DecisionTreeRegressor()]) ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) """ import pytest + if any(isinstance(est, type) for est in estimators): + # TODO: remove class support in 0.24 and update docstrings + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = chain.from_iterable( check_estimator(estimator, generate_only=True) for estimator in estimators) @@ -418,7 +435,7 @@ def check_estimator(Estimator, generate_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, - shapes, etc, making sure that the estimator complies with `scikit-leanrn` + shapes, etc, making sure that the estimator complies with `scikit-learn` conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin @@ -426,7 +443,9 @@ def check_estimator(Estimator, generate_only=False): This test can be applied to classes or instances. Classes currently have some additional tests that related to construction, - while passing instances allows the testing of multiple options. + while passing instances allows the testing of multiple options. 
However, + support for classes is deprecated since version 0.23 and will be removed + in version 0.24 (class checks will still be run on the instances). Setting `generate_only=True` returns a generator that yields (estimator, check) tuples where the check can be called independently from each @@ -439,9 +458,13 @@ def check_estimator(Estimator, generate_only=False): Parameters ---------- - estimator : estimator object or class + estimator : estimator object Estimator to check. Estimator is a class object or instance. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + generate_only : bool, optional (default=False) When `False`, checks are evaluated when `check_estimator` is called. When `True`, `check_estimator` returns a generator that yields @@ -456,8 +479,14 @@ def check_estimator(Estimator, generate_only=False): Generator that yields (estimator, check) tuples. Returned when `generate_only=True`. """ + # TODO: remove class support in 0.24 and update docstrings if isinstance(Estimator, type): # got a class + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = _generate_class_checks(Estimator) else: # got an instance @@ -2570,6 +2599,12 @@ def check_parameters_default_constructible(name, Estimator): # this check works on classes, not instances # test default-constructibility # get rid of deprecation warnings + if isinstance(Estimator, BaseEstimator): + # Convert estimator instance to its class + # TODO: Always convert to class in 0.24, because check_estimator() will + # only accept instances, not classes + Estimator = Estimator.__class__ + with ignore_warnings(category=FutureWarning): estimator = _construct_instance(Estimator) # test cloning diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a755daa842ef5..594ff65f9e889 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -356,6 +356,7 @@ def fit(self, X, y): check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator(): # tests that the estimator actually fails on "bad" estimators. # not a complete test of all checks, which are very extensive. 
@@ -579,7 +580,10 @@ def test_check_regressor_data_not_an_array(): EstimatorInconsistentForPandas()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator_required_parameters_skip(): + # TODO: remove whole test in 0.24 since passes classes to check_estimator() + # isn't supported anymore class MyEstimator(BaseEstimator): _required_parameters = ["special_parameter"] From 964c830328b2ede02814f276cbbf23d16f0e8914 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 18:22:21 -0400 Subject: [PATCH 081/125] MNT Change print_changed_only default to True (#17061) --- conftest.py | 10 ---------- doc/conf.py | 3 --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/_config.py | 11 +++++++---- sklearn/tests/test_base.py | 4 ++-- sklearn/tests/test_config.py | 6 +++--- sklearn/utils/tests/conftest.py | 10 ++++++++++ sklearn/utils/tests/test_pprint.py | 19 ++++++++----------- 8 files changed, 38 insertions(+), 33 deletions(-) create mode 100644 sklearn/utils/tests/conftest.py diff --git a/conftest.py b/conftest.py index 2b9e87bf9f292..874931341e195 100644 --- a/conftest.py +++ b/conftest.py @@ -99,16 +99,6 @@ def pytest_unconfigure(config): del sys._is_pytest_session -def pytest_runtest_setup(item): - if isinstance(item, DoctestItem): - set_config(print_changed_only=True) - - -def pytest_runtest_teardown(item, nextitem): - if isinstance(item, DoctestItem): - set_config(print_changed_only=False) - - # TODO: Remove when modules are deprecated in 0.24 # Configures pytest to ignore deprecated modules. collect_ignore_glob = [ diff --git a/doc/conf.py b/doc/conf.py index 1783a676b6d01..d459cdfd3f1af 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -392,6 +392,3 @@ def setup(app): warnings.filterwarnings("ignore", category=UserWarning, message='Matplotlib is currently using agg, which is a' ' non-GUI backend, so cannot show the figure.') - -# Reduces the output of estimators -sklearn.set_config(print_changed_only=True) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b596e8a1797be..5fc3922f1d457 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -564,6 +564,14 @@ Miscellaneous error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. +- |API| The default setting `print_changed_only` has been changed from False + to True. This means that the `repr` of estimators is now more concise and + only shows the parameters whose default value has been changed when + printing an estimator. You can restore the previous behaviour by using + `sklearn.set_config(print_changed_only=False)`. Also, note that it is + always possible to quickly inspect the parameters of any estimator using + `est.get_params(deep=False)`. :pr:`17061` by `Nicolas Hug`_. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/_config.py b/sklearn/_config.py index c7f3934ee1cb3..44eaae1d59012 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -6,7 +6,7 @@ _global_config = { 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)), - 'print_changed_only': False, + 'print_changed_only': True, } @@ -93,9 +93,12 @@ def config_context(**new_config): print_changed_only : bool, optional If True, only the parameters that were set to non-default values will be printed when printing an estimator. 
For example, - ``print(SVC())`` while True will only print 'SVC()' while the default - behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with - all the non-changed parameters. + ``print(SVC())`` while True will only print 'SVC()', but would print + 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters + when False. Default is True. + + .. versionchanged:: 0.23 + Default changed from False to True. Notes ----- diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 95f7b01f27058..52f2e60b4af70 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -211,10 +211,10 @@ def test_repr(): test = T(K(), K()) assert ( repr(test) == - "T(a=K(c=None, d=None), b=K(c=None, d=None))") + "T(a=K(), b=K())") some_est = T(a=["long_params"] * 1000) - assert len(repr(some_est)) == 495 + assert len(repr(some_est)) == 485 def test_str(): diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index dfa944110ad7a..ae13c61838694 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -4,7 +4,7 @@ def test_config_context(): assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} # Not using as a context manager affects nothing config_context(assume_finite=True) @@ -12,7 +12,7 @@ def test_config_context(): with config_context(assume_finite=True): assert get_config() == {'assume_finite': True, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} assert get_config()['assume_finite'] is False with config_context(assume_finite=True): @@ -37,7 +37,7 @@ def test_config_context(): assert get_config()['assume_finite'] is True assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} # No positional arguments assert_raises(TypeError, config_context, True) diff --git a/sklearn/utils/tests/conftest.py b/sklearn/utils/tests/conftest.py new file mode 100644 index 0000000000000..148225a481f69 --- /dev/null +++ b/sklearn/utils/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest + +import sklearn + + +@pytest.fixture +def print_changed_only_false(): + sklearn.set_config(print_changed_only=False) + yield + sklearn.set_config(print_changed_only=True) # reset to default diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index 146ccf781ae8a..866d872a9b65c 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -174,7 +174,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.copy = copy -def test_basic(): +def test_basic(print_changed_only_false): # Basic pprint test lr = LogisticRegression() expected = """ @@ -189,8 +189,7 @@ def test_basic(): def test_changed_only(): - # Make sure the changed_only param is correctly used - set_config(print_changed_only=True) + # Make sure the changed_only param is correctly used when True (default) lr = LogisticRegression(C=99) expected = """LogisticRegression(C=99)""" assert lr.__repr__() == expected @@ -216,10 +215,8 @@ def test_changed_only(): # make sure array parameters don't throw error (see #13583) repr(LogisticRegressionCV(Cs=np.array([0.1, 1]))) - set_config(print_changed_only=False) - -def test_pipeline(): +def test_pipeline(print_changed_only_false): # Render a pipeline object pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999)) expected = """ @@ -240,7 +237,7 @@ def test_pipeline(): assert 
pipeline.__repr__() == expected -def test_deeply_nested(): +def test_deeply_nested(print_changed_only_false): # Render a deeply nested estimator rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression()))))))) expected = """ @@ -277,7 +274,7 @@ def test_deeply_nested(): assert rfe.__repr__() == expected -def test_gridsearch(): +def test_gridsearch(print_changed_only_false): # render a gridsearch param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, @@ -302,7 +299,7 @@ def test_gridsearch(): assert gs.__repr__() == expected -def test_gridsearch_pipeline(): +def test_gridsearch_pipeline(print_changed_only_false): # render a pipeline inside a gridsearch pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True) @@ -372,7 +369,7 @@ def test_gridsearch_pipeline(): assert repr_ == expected -def test_n_max_elements_to_show(): +def test_n_max_elements_to_show(print_changed_only_false): n_max_elements_to_show = 30 pp = _EstimatorPrettyPrinter( @@ -461,7 +458,7 @@ def test_n_max_elements_to_show(): assert pp.pformat(gs) == expected -def test_bruteforce_ellipsis(): +def test_bruteforce_ellipsis(print_changed_only_false): # Check that the bruteforce ellipsis (used when the number of non-blank # characters exceeds N_CHAR_MAX) renders correctly. From dbc35934a6ebf7d0bbfdfc80e8bf8a9fabc1ba4b Mon Sep 17 00:00:00 2001 From: brigi <58770308+brigitteunger@users.noreply.github.com> Date: Tue, 28 Apr 2020 00:43:47 +0200 Subject: [PATCH 082/125] DOC Version added and changed labels added for v0.18 (#wimlds) (#16222) * Versionlabels added to v0.18 (#wimlds) * documentation issues: label changed and added for version 0.18 * fix intends and shorten description Co-authored-by: Hannah <32333241+hhnnhh@users.noreply.github.com> Co-authored-by: Brigitte@home --- sklearn/cluster/_kmeans.py | 3 +++ sklearn/ensemble/_voting.py | 4 ++++ sklearn/feature_selection/_rfe.py | 2 ++ sklearn/feature_selection/_univariate_selection.py | 4 ++++ sklearn/linear_model/_ransac.py | 4 ++++ sklearn/metrics/_classification.py | 5 +++++ sklearn/metrics/cluster/_supervised.py | 2 ++ sklearn/model_selection/_split.py | 2 ++ sklearn/multioutput.py | 2 ++ sklearn/preprocessing/_function_transformer.py | 4 ++++ sklearn/svm/_classes.py | 4 ++++ 11 files changed, 36 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 21a604bed3eb5..8d24ed497aef3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -834,6 +834,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): For now "auto" (kept for backward compatibiliy) chooses "elkan" but it might change in the future for a better heuristic. + .. versionchanged:: 0.18 + Added Elkan algorithm + Attributes ---------- cluster_centers_ : ndarray of shape (n_clusters, n_features) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index b044cb68e5151..0ac42407f5998 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -141,6 +141,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionadded:: 0.18 + flatten_transform : bool, default=True Affects shape of transform output only when voting='soft' If voting='soft' and flatten_transform=True, transform method returns @@ -232,6 +234,8 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. 
versionadded:: 0.18 + Returns ------- self : object diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index aedcd94943bc4..8dc7aecb7dc3e 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -410,6 +410,8 @@ class RFECV(RFE): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionadded:: 0.18 + Attributes ---------- n_features_ : int diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7e873b3a2b65c..6911830099844 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -384,6 +384,8 @@ class SelectPercentile(_BaseFilter): Default is f_classif (see below "See also"). The default function only works with classification tasks. + .. versionadded:: 0.18 + percentile : int, optional, default=10 Percent of features to keep. @@ -467,6 +469,8 @@ class SelectKBest(_BaseFilter): Default is f_classif (see below "See also"). The default function only works with classification tasks. + .. versionadded:: 0.18 + k : int or "all", optional, default=10 Number of top features to select. The "all" option bypasses selection, for use in a parameter search. diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 5eac651c76383..fffa29d47d91c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -150,6 +150,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, If the loss on a sample is greater than the ``residual_threshold``, then this sample is classified as an outlier. + .. versionadded:: 0.18 + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. @@ -239,6 +241,8 @@ def fit(self, X, y, sample_weight=None): raises error if sample_weight is passed and base_estimator fit method does not support it. + .. versionadded:: 0.18 + Raises ------ ValueError diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index b8a1a8e5e22b4..2ceccca65203e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -227,6 +227,8 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + normalize : {'true', 'pred', 'all'}, default=None Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. If None, confusion matrix will not be @@ -789,6 +791,8 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + Returns ------- mcc : float @@ -2156,6 +2160,7 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are assumed to be binary and are inferred from ``y_true``. + .. 
versionadded:: 0.18 Returns diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 8a0fdcacb67f1..d652737bd23c0 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -881,6 +881,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. + .. versionadded:: 0.18 + The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of the precision and recall:: diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index edcb9b375ae79..9b2087e039f40 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -739,6 +739,8 @@ def split(self, X, y, groups=None): class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator + .. versionadded:: 0.18 + Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets. In each split, test indices must be higher than before, and thus shuffling diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 815c1cbd67757..a5ede43f0fe8c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -215,6 +215,8 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): simple strategy for extending regressors that do not natively support multi-target regression. + .. versionadded:: 0.18 + Parameters ---------- estimator : estimator object diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 21dd40365f5a0..c4e6782b7cb19 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -66,9 +66,13 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): kw_args : dict, optional Dictionary of additional keyword arguments to pass to func. + .. versionadded:: 0.18 + inv_kw_args : dict, optional Dictionary of additional keyword arguments to pass to inverse_func. + .. versionadded:: 0.18 + Examples -------- >>> import numpy as np diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 5ff6e74825e50..d082c22d0a3bc 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -213,6 +213,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. versionadded:: 0.18 + Returns ------- self : object @@ -398,6 +400,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. 
versionadded:: 0.18 + Returns ------- self : object From fb76de72e7560aaa739872e94bee761777e54c0a Mon Sep 17 00:00:00 2001 From: maikia Date: Tue, 28 Apr 2020 02:25:24 +0200 Subject: [PATCH 083/125] DOC Exchanging Boston for california dataset in plot missing values (#16513) * first few comments * added new california dataset * removed boston dataset from the file * updating the DOCs * adding a DOC for calculating the error * exchanged the order started writing functions on scoring the imputers * finished writing functions for imputers * finished writing functions and started on DOcs * working on the DOCs for imputers * cleaning up * flake8 * cleaning up * cleaning up * restructuring the document * further text restructuring * text restructuring * flake8 * reformatting * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Olivier Grisel * updated the intro * improve bullet point rendering * spelling * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * changed the naming * restructuring text * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * changing missing values from 0 to nan * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * REGRESSOR to regressor * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * flake8 * reducting number of samples used from california dataset * CLN Removes the need for MissingIndicator * FIX Unrelated bug but is stopping the CI from passing Co-authored-by: Olivier Grisel Co-authored-by: Alexandre Gramfort 
Co-authored-by: Lucy Liu Co-authored-by: Thomas J Fan --- doc/developers/contributing.rst | 5 +- examples/impute/plot_missing_values.py | 285 ++++++++++++++++++------- 2 files changed, 214 insertions(+), 76 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index c886119e908c1..e13b6850d50eb 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -434,8 +434,9 @@ You can check for common programming errors with the following tools: must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, - - when importing C or Cython modules - - on properties with decorators + + - when importing C or Cython modules + - on properties with decorators Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5186cf0ba3bac..2ba7dc05d16b6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -5,122 +5,255 @@ Missing values can be replaced by the mean, the median or the most frequent value using the basic :class:`sklearn.impute.SimpleImputer`. -The median is a more robust estimator for data with high magnitude variables -which could dominate results (otherwise known as a 'long tail'). -With ``KNNImputer``, missing values can be imputed using the weighted -or unweighted mean of the desired number of nearest neighbors. +In this example we will investigate different imputation techniques: -Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -round-robin linear regression, treating every variable as an output in -turn. The version implemented assumes Gaussian (output) variables. If your -features are obviously non-Normal, consider transforming them to look more -Normal so as to potentially improve performance. +- imputation by the constant value 0 +- imputation by the mean value of each feature combined with a missing-ness + indicator auxiliary variable +- k nearest neighbor imputation +- iterative imputation + +We will use two datasets: Diabetes dataset which consists of 10 feature +variables collected from diabetes patients with an aim to predict disease +progression and California Housing dataset for which the target is the median +house value for California districts. + +As neither of these datasets have missing values, we will remove some +values to create new versions with artificially missing data. The performance +of +:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset +is then compared the performance on the altered datasets with the artificially +missing values imputed using different techniques. -In addition of using an imputing method, we can also keep an indication of the -missing information using :func:`sklearn.impute.MissingIndicator` which might -carry some information. """ print(__doc__) +# Authors: Maria Telenczuk +# License: BSD 3 clause + +############################################################################### +# Download the data and make missing values sets +################################################ +# +# First we download the two datasets. Diabetes dataset is shipped with +# scikit-learn. It has 442 entries, each with 10 features. California Housing +# dataset is much larger with 20640 entries and 8 features. 
It needs to be +# downloaded. We will only use the first 400 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. +# + import numpy as np -import matplotlib.pyplot as plt -# To use the experimental IterativeImputer, we need to explicitly ask for it: -from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.datasets import fetch_california_housing from sklearn.datasets import load_diabetes -from sklearn.datasets import load_boston + + +rng = np.random.RandomState(42) + +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_california, y_california = fetch_california_housing(return_X_y=True) +X_california = X_california[:400] +y_california = y_california[:400] + + +def add_missing_values(X_full, y_full): + n_samples, n_features = X_full.shape + + # Add missing values in 75% of the lines + missing_rate = 0.75 + n_missing_samples = int(n_samples * missing_rate) + + missing_samples = np.zeros(n_samples, dtype=np.bool) + missing_samples[: n_missing_samples] = True + + rng.shuffle(missing_samples) + missing_features = rng.randint(0, n_features, n_missing_samples) + X_missing = X_full.copy() + X_missing[missing_samples, missing_features] = np.nan + y_missing = y_full.copy() + + return X_missing, y_missing + + +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california) + +X_miss_diabetes, y_miss_diabetes = add_missing_values( + X_diabetes, y_diabetes) + + +############################################################################### +# Impute the missing data and score +# ################################# +# Now we will write a function which will score the results on the differently +# imputed data. Let's look at each imputer separately: +# + +rng = np.random.RandomState(0) + from sklearn.ensemble import RandomForestRegressor -from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import ( - SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator) + +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer from sklearn.model_selection import cross_val_score +from sklearn.pipeline import make_pipeline -rng = np.random.RandomState(0) N_SPLITS = 5 -REGRESSOR = RandomForestRegressor(random_state=0) +regressor = RandomForestRegressor(random_state=0) + +############################################################################### +# Missing information +# ------------------- +# In addition to imputing the missing values, the imputers have an +# `add_indicator` parameter that marks the values that were missing, which +# might carry some information. 
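As a small hedged illustration of the `add_indicator` behaviour mentioned above (not part of the example script itself): the transformed output gains one boolean indicator column per feature that contained missing values during fit.

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [np.nan, 3.0],
              [4.0, 5.0]])
imputer = SimpleImputer(strategy="mean", add_indicator=True)
# The first two columns hold the imputed values; the last two columns are the
# missing-ness indicators for the two features that had NaNs.
print(imputer.fit_transform(X))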
+# def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline( - make_union(imputer, MissingIndicator(missing_values=0)), - REGRESSOR) + estimator = make_pipeline(imputer, regressor) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) return impute_scores -def get_results(dataset): - X_full, y_full = dataset.data, dataset.target - n_samples = X_full.shape[0] - n_features = X_full.shape[1] +x_labels = ['Full data', + 'Zero imputation', + 'Mean Imputation', + 'KNN Imputation', + 'Iterative Imputation'] + +mses_california = np.zeros(5) +stds_california = np.zeros(5) +mses_diabetes = np.zeros(5) +stds_diabetes = np.zeros(5) + +############################################################################### +# Estimate the score +# ------------------ +# First, we want to estimate the score on the original data: +# - # Estimate the score on the entire dataset, with no missing values - full_scores = cross_val_score(REGRESSOR, X_full, y_full, + +def get_full_score(X_full, y_full): + full_scores = cross_val_score(regressor, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) + return full_scores.mean(), full_scores.std() - # Add missing values in 75% of the lines - missing_rate = 0.75 - n_missing_samples = int(np.floor(n_samples * missing_rate)) - missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) - rng.shuffle(missing_samples) - missing_features = rng.randint(0, n_features, n_missing_samples) - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - # Estimate the score after replacing missing values by 0 - imputer = SimpleImputer(missing_values=0, - strategy='constant', - fill_value=0) +mses_california[0], stds_california[0] = get_full_score(X_california, + y_california) +mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) + + +############################################################################### +# Replace missing values by 0 +# --------------------------- +# +# Now we will estimate the score on the data where the missing values are +# replaced by 0: +# + + +def get_impute_zero_score(X_missing, y_missing): + + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, + strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return zero_impute_scores.mean(), zero_impute_scores.std() - # Estimate the score after imputation (mean strategy) of the missing values - imputer = SimpleImputer(missing_values=0, strategy="mean") - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after kNN-imputation of the missing values - imputer = KNNImputer(missing_values=0) +mses_california[1], stds_california[1] = get_impute_zero_score( + X_miss_california, y_miss_california) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# kNN-imputation of the missing values +# ------------------------------------ +# +# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted +# or unweighted mean of the desired number of nearest neighbors. 
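A minimal sketch of the kNN strategy described in the comment above (illustration only, not part of the patch): each missing entry is filled with the mean of that feature over the `n_neighbors` nearest samples.

import numpy as np
from sklearn.impute import KNNImputer

X = [[1, 2, np.nan],
     [3, 4, 3],
     [np.nan, 6, 5],
     [8, 8, 7]]
# With n_neighbors=2, each NaN is replaced by the mean of that feature over the
# two nearest rows, where distances are computed on the non-missing coordinates.
print(KNNImputer(n_neighbors=2).fit_transform(X))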
+ +def get_impute_knn_score(X_missing, y_missing): + imputer = KNNImputer(missing_values=np.nan, add_indicator=True) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return knn_impute_scores.mean(), knn_impute_scores.std() - # Estimate the score after iterative imputation of the missing values - imputer = IterativeImputer(missing_values=0, - random_state=0, - n_nearest_features=5, + +mses_california[2], stds_california[2] = get_impute_knn_score( + X_miss_california, y_miss_california) +mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Impute missing values with mean +# ------------------------------- +# + +def get_impute_mean(X_missing, y_missing): + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", + add_indicator=True) + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return mean_impute_scores.mean(), mean_impute_scores.std() + + +mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, + y_miss_california) +mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Iterative imputation of the missing values +# ------------------------------------------ +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, modeling each feature with missing values as a +# function of other features, in turn. +# The version implemented assumes Gaussian (output) variables. If your features +# are obviously non-normal, consider transforming them to look more normal +# to potentially improve performance. 
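A short sketch of the round-robin idea (illustration only, not part of the patch): each feature with missing values is regressed on the remaining features and the predictions fill the gaps, repeated for up to `max_iter` rounds.

import numpy as np
# IterativeImputer is still experimental and must be enabled explicitly:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = [[1, 2, np.nan],
     [3, np.nan, 3],
     [np.nan, 6, 5],
     [8, 8, 7]]
print(IterativeImputer(max_iter=10, random_state=0).fit_transform(X))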
+# + +def get_impute_iterative(X_missing, y_missing): + imputer = IterativeImputer(missing_values=np.nan, add_indicator=True, + random_state=0, n_nearest_features=5, sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return iterative_impute_scores.mean(), iterative_impute_scores.std() - return ((full_scores.mean(), full_scores.std()), - (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), - (knn_impute_scores.mean(), knn_impute_scores.std()), - (iterative_impute_scores.mean(), iterative_impute_scores.std())) +mses_california[4], stds_california[4] = get_impute_iterative( + X_miss_california, y_miss_california) +mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, + y_miss_diabetes) -results_diabetes = np.array(get_results(load_diabetes())) -mses_diabetes = results_diabetes[:, 0] * -1 -stds_diabetes = results_diabetes[:, 1] +mses_diabetes = mses_diabetes * -1 +mses_california = mses_california * -1 + +############################################################################### +# Plot the results +# ################ +# +# Finally we are going to visualize the score: +# + +import matplotlib.pyplot as plt -results_boston = np.array(get_results(load_boston())) -mses_boston = results_boston[:, 0] * -1 -stds_boston = results_boston[:, 1] n_bars = len(mses_diabetes) xval = np.arange(n_bars) -x_labels = ['Full data', - 'Zero imputation', - 'Mean Imputation', - 'KNN Imputation', - 'Iterative Imputation'] colors = ['r', 'g', 'b', 'orange', 'black'] # plot diabetes results @@ -138,16 +271,20 @@ def get_results(dataset): ax1.invert_yaxis() ax1.set_yticklabels(x_labels) -# plot boston results +# plot california dataset results ax2 = plt.subplot(122) for j in xval: - ax2.barh(j, mses_boston[j], xerr=stds_boston[j], + ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') -ax2.set_title('Imputation Techniques with Boston Data') +ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') ax2.invert_yaxis() ax2.set_yticklabels([''] * n_bars) plt.show() + +# You can also try different techniques. For instance, the median is a more +# robust estimator for data with high magnitude variables which could dominate +# results (otherwise known as a 'long tail'). From 8b4d4f4aaf10fb0bdcc5829df9f265b363ce7a8e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 01:47:21 -0400 Subject: [PATCH 084/125] DOC Add whats new for missing PRs (#17066) --- doc/whats_new/v0.23.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 5fc3922f1d457..02ef6a9e94408 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -275,6 +275,11 @@ Changelog with log-link useful for modeling count data. :pr:`16692` by :user:`Christian Lorentzen ` +- |Fix| Fixed a bug where :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` would fail with multiple + calls to fit when `warm_start=True`, `early_stopping=True`, and there is no + validation set. :pr:`16663` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction` ................................. @@ -283,11 +288,16 @@ Changelog for datasets with large vocabularies combined with ``min_df`` or ``max_df``. :pr:`15834` by :user:`Santiago M. Mola `. +:mod:`sklearn.feature_selection` +................................ 
- |Enhancement| Added support for multioutput data in :class:`feature_selection.RFE` and :class:`feature_selection.RFECV`. :pr:`16103` by :user:`Divyaprabha M `. +- |API| Adds :class:`feature_selection.SelectorMixin` back to public API. + :pr:`16132` by :user:`trimeta`. + :mod:`sklearn.gaussian_process` ............................... @@ -441,6 +451,11 @@ Changelog :mod:`sklearn.neural_network` ............................. +- |Efficiency| :class:`neural_network.MLPClassifier` and + :class:`neural_network.MLPRegressor` has reduced memory footprint when using + stochastic solvers, `'sgd'` or `'adam'`, and `shuffle=True`. :pr:`14075` by + :user:`meyer89`. + - |Fix| Increases the numerical stability of the logistic loss function in :class:`neural_network.MLPClassifier` by clipping the probabilities. :pr:`16117` by `Thomas Fan`_. @@ -460,6 +475,10 @@ Changelog each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`preprocessing.OneHotEncoder`'s `drop_idx_` ndarray + can now contain `None`, where `drop_idx_[i] = None` means that no category + is dropped for index `i`. :pr:`16585` by :user:`Chiara Marmo `. + - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. @@ -472,6 +491,13 @@ Changelog normalizing the vectors. :pr:`16632` by :user:`Maura Pintor ` and :user:`Battista Biggio `. +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.LabelSpreading` and + :class:`semi_supervised.LabelPropagation` avoids divide by zero warnings + when normalizing `label_distributions_`. :pr:`15946` by :user:`ngshya`. + :mod:`sklearn.svm` .................. @@ -539,6 +565,9 @@ Changelog and support for classes will be removed in 0.24. Pass instances instead. :pr:`17032` by `Nicolas Hug`_. +- |FIX| :func:`utils.all_estimators` now only returns public estimators. + :pr:`15380` by `Thomas Fan`_. + :mod:`sklearn.cluster` ...................... From 3deacb98c755d0b8e46e298ff3ea918b0e0e4204 Mon Sep 17 00:00:00 2001 From: "Gregory R. 
Lee" Date: Tue, 28 Apr 2020 06:19:52 -0400 Subject: [PATCH 085/125] MNT consistently call import_array() after cimport of numpy (#17054) --- sklearn/_isotonic.pyx | 2 ++ sklearn/cluster/_dbscan_inner.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 3 +++ .../ensemble/_hist_gradient_boosting/_gradient_boosting.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_loss.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/common.pxd | 2 ++ sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 4 +++- sklearn/linear_model/_sag_fast.pyx.tp | 1 + sklearn/manifold/_barnes_hut_tsne.pyx | 2 ++ sklearn/manifold/_utils.pyx | 4 ++++ sklearn/neighbors/_typedefs.pyx | 3 +++ sklearn/preprocessing/_csr_polynomial_expansion.pyx | 1 + sklearn/svm/_libsvm_sparse.pyx | 4 +++- sklearn/utils/_logistic_sigmoid.pyx | 1 + 16 files changed, 35 insertions(+), 2 deletions(-) diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index c7486097df854..75c4bbef11379 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -11,6 +11,8 @@ cimport numpy as np cimport cython from cython cimport floating +np.import_array() + def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w): cdef: diff --git a/sklearn/cluster/_dbscan_inner.pyx b/sklearn/cluster/_dbscan_inner.pyx index a348bf59d6717..b9a80686a76f8 100644 --- a/sklearn/cluster/_dbscan_inner.pyx +++ b/sklearn/cluster/_dbscan_inner.pyx @@ -9,6 +9,8 @@ from libcpp.vector cimport vector cimport numpy as np import numpy as np +np.import_array() + # Work around Cython bug: C++ exceptions are not caught unless thrown within # a cdef function with an "except +" declaration. 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 1ecee3c9ee27e..4e11abfcabdf8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -16,6 +16,9 @@ from libc.math cimport isnan from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C +np.import_array() + + def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, const unsigned char missing_values_bin_idx, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 8d307c3806532..18f1b6a365421 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -13,6 +13,8 @@ cimport numpy as np from .common import Y_DTYPE from .common cimport Y_DTYPE_C +np.import_array() + def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 64480911439e5..4114cd24aa8df 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -15,6 +15,8 @@ from libc.math cimport exp, log from .common cimport Y_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + def _update_gradients_least_squares( G_H_DTYPE_C [::1] gradients, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index b3234cb5ba945..d346aabdac070 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -18,6 +18,8 @@ from .common import Y_DTYPE from .common cimport X_BINNED_DTYPE_C from .common cimport node_struct +np.import_array() + def _predict_from_numeric_data( node_struct [:] nodes, diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 60399c2fbdd70..161ad114829fe 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -2,6 +2,8 @@ import numpy as np cimport numpy as np +np.import_array() + ctypedef np.npy_float64 X_DTYPE_C ctypedef np.npy_uint8 X_BINNED_DTYPE_C diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 740e5e002cf4e..8bd7c4ee8b350 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -17,6 +17,8 @@ from .common cimport hist_struct from .common cimport X_BINNED_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + # Notes: # - IN views are read-only, OUT views are write-only # - In a lot of functions here, we pass feature_idx and the whole 2d diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 43405551ef357..984cc6767facf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -27,6 +27,8 @@ from .common cimport hist_struct from .common import HISTOGRAM_DTYPE from .common cimport MonotonicConstraint +np.import_array() + cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the @@ -809,7 +811,7 @@ cpdef 
inline Y_DTYPE_C compute_node_value( """ cdef: - Y_DTYPE_C value + Y_DTYPE_C value value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 5758a8e5ee34c..141890497fcd2 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -58,6 +58,7 @@ from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 from libc.stdio cimport printf +np.import_array() {{for name, c_type, np_type in get_dispatch(dtypes)}} diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index ec80890fd8a58..b15462e597684 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -18,6 +18,8 @@ from cython.parallel cimport prange, parallel from ..neighbors._quad_tree cimport _QuadTree +np.import_array() + cdef char* EMPTY_STRING = "" diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 676d3676fb8c1..0cc2b0af137cc 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -5,6 +5,10 @@ cimport cython import numpy as np cimport numpy as np from libc.stdio cimport printf + +np.import_array() + + cdef extern from "numpy/npy_math.h": float NPY_INFINITY diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/neighbors/_typedefs.pyx index bbdfd00505b43..789afb4997dd1 100644 --- a/sklearn/neighbors/_typedefs.pyx +++ b/sklearn/neighbors/_typedefs.pyx @@ -4,6 +4,9 @@ import numpy as np cimport numpy as np from libc.math cimport sqrt +np.import_array() + + # use a hack to determine the associated numpy data types # NOTE: the following requires the buffer interface, only available in # numpy 1.5+. We'll choose the DTYPE by hand instead. diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index dd36f8321410f..84fef3f042dc7 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -8,6 +8,7 @@ from scipy.sparse import csr_matrix from numpy cimport ndarray cimport numpy as np +np.import_array() ctypedef np.int32_t INDEX_T ctypedef fused DATA_T: diff --git a/sklearn/svm/_libsvm_sparse.pyx b/sklearn/svm/_libsvm_sparse.pyx index f180560f1d1e7..4b5070a64aad8 100644 --- a/sklearn/svm/_libsvm_sparse.pyx +++ b/sklearn/svm/_libsvm_sparse.pyx @@ -4,6 +4,8 @@ cimport numpy as np from scipy import sparse from ..exceptions import ConvergenceWarning +np.import_array() + cdef extern from *: ctypedef char* const_char_p "const char*" @@ -186,7 +188,7 @@ def libsvm_sparse_train ( int n_features, # copy model.nSV # TODO: do only in classification - cdef np.ndarray n_class_SV + cdef np.ndarray n_class_SV n_class_SV = np.empty(n_class, dtype=np.int32) copy_nSV(n_class_SV.data, model) diff --git a/sklearn/utils/_logistic_sigmoid.pyx b/sklearn/utils/_logistic_sigmoid.pyx index 4ca32193c5ce6..3531d99bc4f44 100644 --- a/sklearn/utils/_logistic_sigmoid.pyx +++ b/sklearn/utils/_logistic_sigmoid.pyx @@ -7,6 +7,7 @@ from libc.math cimport log, exp import numpy as np cimport numpy as np +np.import_array() ctypedef np.float64_t DTYPE_t From 1d3a553b2dfbe5cc8d32b306fe62855671fe9ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 28 Apr 2020 12:57:01 +0200 Subject: [PATCH 086/125] [MRG] Fix LinearModelsCV for loky backend. 
(#14264) --- doc/whats_new/v0.23.rst | 6 +++++ sklearn/linear_model/_coordinate_descent.py | 9 ++++++++ .../tests/test_coordinate_descent.py | 23 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 02ef6a9e94408..d3a3de4d7153b 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -384,6 +384,12 @@ Changelog of non-zero coefficients and in the predicted output. :pr:`16849` by `Nicolas Hug`_. +- |Fix| Fixed a bug in :class:`linear_model.ElasticNetCV`, + :class:`linear_model.MultitaskElasticNetCV`, :class:`linear_model.LassoCV` + and :class:`linear_model.MultitaskLassoCV` where fitting would fail when + using joblib loky backend. :pr:`14264` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index cd57b9612b362..2d8567b04db56 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1068,6 +1068,15 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, y_train = y[train] X_test = X[test] y_test = y[test] + + if not sparse.issparse(X): + for array, array_input in ((X_train, X), (y_train, y), + (X_test, X), (y_test, y)): + if array.base is not array_input and not array.flags['WRITEABLE']: + # fancy indexing should create a writable copy but it doesn't + # for read-only memmaps (cf. numpy#14132). + array.setflags(write=True) + fit_intercept = path_params['fit_intercept'] normalize = path_params['normalize'] diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index fdc49599788fe..142c1e9ac2a47 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -7,8 +7,11 @@ import pytest from scipy import interpolate, sparse from copy import deepcopy +import joblib +from distutils.version import LooseVersion from sklearn.datasets import load_boston +from sklearn.datasets import make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -1020,3 +1023,23 @@ def test_enet_sample_weight_sparse(): with pytest.raises(ValueError, match="Sample weights do not.*support " "sparse matrices"): reg.fit(X, y, sample_weight=sw, check_input=True) + + +@pytest.mark.parametrize("backend", ["loky", "threading"]) +@pytest.mark.parametrize("estimator", + [ElasticNetCV, MultiTaskElasticNetCV, + LassoCV, MultiTaskLassoCV]) +def test_linear_models_cv_fit_for_all_backends(backend, estimator): + # LinearModelsCV.fit performs inplace operations on input data which is + # memmapped when using loky backend, causing an error due to unexpected + # behavior of fancy indexing of read-only memmaps (cf. numpy#14132). + + if joblib.__version__ < LooseVersion('0.12') and backend == 'loky': + pytest.skip('loky backend does not exist in joblib <0.12') + + # Create a problem sufficiently large to cause memmapping (1MB). 
+ n_targets = 1 + (estimator in (MultiTaskElasticNetCV, MultiTaskLassoCV)) + X, y = make_regression(20000, 10, n_targets=n_targets) + + with joblib.parallel_backend(backend=backend): + estimator(n_jobs=2, cv=3).fit(X, y) From 54354083eb0d749391d6b51480216a7f87747049 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 28 Apr 2020 20:59:32 +1000 Subject: [PATCH 087/125] DOC markup fixes for change log --- doc/whats_new/v0.23.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index d3a3de4d7153b..788f9fe6837b4 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -176,7 +176,7 @@ Changelog - |Enhancement| Added ``return_centers`` parameter in :func:`datasets.make_blobs`, which can be used to return centers for each cluster. - :pr:`15709` by :user:`` and + :pr:`15709` by :user:`shivamgargsya` and :user:`Venkatachalam N `. - |Enhancement| Functions :func:`datasets.make_circles` and @@ -198,8 +198,8 @@ Changelog ``csc`` matrices. :pr:`16837` by :user:`wornbb`. - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will - exclusively choose the components that explain the variance greater than - `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` + exclusively choose the components that explain the variance greater than + `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` - |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly handles small eigenvalues, and does not infer 0 as the correct number of @@ -302,7 +302,7 @@ Changelog ............................... - |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. - :pr:`15503` by :user:`Sam Dixon` . + :pr:`15503` by :user:`Sam Dixon `. - |Fix| Fixed bug in :class:`gaussian_process.GaussianProcessRegressor` that caused predicted standard deviations to only be between 0 and 1 when @@ -336,7 +336,7 @@ Changelog and `Olivier Grisel`_. - |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and - :class:`linear_model:Lasso` for dense feature matrix `X`. + :class:`linear_model.Lasso` for dense feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen `. - |Efficiency| :class:`linear_model.RidgeCV` and @@ -558,12 +558,12 @@ Changelog - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. :pr:`15926` by :user:`Loïc Estève `. -- |Enhancement| add warning in :func:`utils.validation.check_array` for +- |Enhancement| add warning in :func:`utils.check_array` for pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. -- |Enhancement| :func:`utils.validation.check_array` now constructs a sparse - matrix from a pandas DataFrame that contains only `SparseArray`s. +- |Enhancement| :func:`utils.check_array` now constructs a sparse + matrix from a pandas DataFrame that contains only `SparseArray` columns. :pr:`16728` by `Thomas Fan`_. 
- |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and From 9f015c8a14a67d248599dc376d33ec612dd9dbb9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 11:23:44 -0400 Subject: [PATCH 088/125] FIX Mixed bool dtype in pandas (#17008) --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 8ee18371a3009..1fde1f0d69fb1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -507,7 +507,7 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # pandas boolean dtype __array__ interface coerces bools to objects for i, dtype_iter in enumerate(dtypes_orig): if dtype_iter.kind == 'b': - dtypes_orig[i] = np.object + dtypes_orig[i] = np.dtype(np.object) if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) From acbe13c07d204e6ba463f41d2fd39b2a1c776f20 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 13:00:12 -0400 Subject: [PATCH 089/125] ENH Adds pandas IntegerArray support to check_array (#16508) --- doc/whats_new/v0.23.rst | 20 +++++++ sklearn/impute/_base.py | 9 ++- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 4 +- sklearn/impute/tests/test_common.py | 29 +++++++++ sklearn/metrics/pairwise.py | 23 ++++--- sklearn/preprocessing/tests/test_common.py | 30 ++++++++++ sklearn/utils/tests/test_validation.py | 31 ++++++++++ sklearn/utils/validation.py | 70 +++++++++++++++------- 9 files changed, 186 insertions(+), 34 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 788f9fe6837b4..aedd5fe804722 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -316,6 +316,10 @@ Changelog ``max_value`` and ``min_value``. Array-like inputs allow a different max and min to be specified for each feature. :pr:`16403` by :user:`Narendra Mukherjee `. +- |Enhancement| :class:`impute.SimpleImputer`, :class:`impute.KNNImputer`, and + :class:`impute.SimpleImputer` accepts pandas' nullable integer dtype with + missing values. :pr:`16508` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -485,6 +489,13 @@ Changelog can now contain `None`, where `drop_idx_[i] = None` means that no category is dropped for index `i`. :pr:`16585` by :user:`Chiara Marmo `. +- |Enhancement| :class:`preprocessing.MaxAbsScaler`, + :class:`preprocessing.MinMaxScaler`, :class:`preprocessing.StandardScaler`, + :class:`preprocessing.PowerTransformer`, + :class:`preprocessing.QuantileTransformer`, + :class:`preprocessing.RobustScaler` now supports pandas' nullable integer + dtype with missing values. :pr:`16508` by `Thomas Fan`_. + - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. @@ -566,6 +577,15 @@ Changelog matrix from a pandas DataFrame that contains only `SparseArray` columns. :pr:`16728` by `Thomas Fan`_. +- |Enhancement| :func:`utils.validation.check_array` supports pandas' + nullable integer dtype with missing values when `force_all_finite` is set to + `False` or `'allow-nan'` in which case the data is converted to floating + point values where `pd.NA` values are replaced by `np.nan`. 
As a consequence, + all :mod:`sklearn.preprocessing` transformers that accept numeric inputs with + missing values represented as `np.nan` now also accepts being directly fed + pandas dataframes with `pd.Int* or `pd.Uint*` typed columns that use `pd.NA` + as a missing value marker. :pr:`16508` by `Thomas Fan`_. + - |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and :func:`utils.estimator_checks.parametrize_with_checks` is now deprecated, and support for classes will be removed in 0.24. Pass instances instead. diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 5f1069708a20e..517de982d8478 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -128,7 +128,9 @@ class SimpleImputer(_BaseImputer): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. strategy : string, default='mean' The imputation strategy. @@ -476,8 +478,9 @@ class MissingIndicator(TransformerMixin, BaseEstimator): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. features : str, default=None Whether the imputer mask should represent all or a subset of diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 17a3d05507205..8f80c9723eac3 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -54,7 +54,9 @@ class IterativeImputer(_BaseImputer): missing_values : int, np.nan, default=np.nan The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 44fccf024247e..80a6423bdef79 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -32,7 +32,9 @@ class KNNImputer(_BaseImputer): ---------- missing_values : number, string, np.nan or None, default=`np.nan` The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. n_neighbors : int, default=5 Number of neighboring samples to use for imputation. 
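A hedged example of the behaviour these docstrings describe (not part of the patch, requires pandas >= 1.0): a DataFrame with pandas' nullable integer dtype is accepted directly, and its `pd.NA` entries are treated as `np.nan` by the imputer.

import pandas as pd
from sklearn.impute import SimpleImputer

X = pd.DataFrame({"a": pd.array([1, pd.NA, 3], dtype="Int64"),
                  "b": pd.array([4, 5, pd.NA], dtype="Int64")})
# missing_values stays at its default of np.nan; pd.NA is converted to np.nan
# when the DataFrame is validated.
print(SimpleImputer(strategy="mean").fit_transform(X))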
diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index a8d2fd9d6b2f7..220a335c15285 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -84,3 +84,32 @@ def test_imputers_add_indicator_sparse(imputer, marker): imputer.set_params(add_indicator=False) X_trans_no_indicator = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", IMPUTERS) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, + missing_values=marker) + + X = np.array([ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4] + ]) + # fit on numpy array + X_trans_expected = imputer.fit_transform(X) + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"]) + + # fit on pandas dataframe with IntegerArrays + X_trans = imputer.fit_transform(X_df) + + assert_allclose(X_trans_expected, X_trans) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 20350345f54da..2e1332d18a20c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -100,17 +100,20 @@ def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, raise an error. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + copy : bool Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. @@ -1691,15 +1694,19 @@ def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, for more details. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. 
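For context, a hedged sketch (not part of the patch) of the kind of pairwise computation that the `'allow-nan'` option above makes possible: distances are computed on the coordinates present in both samples.

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[0.0, np.nan, 2.0],
              [1.0, 1.0, np.nan]])
# Missing coordinates are ignored and the distance is rescaled to account for
# the number of coordinates actually compared.
print(nan_euclidean_distances(X))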
diff --git a/sklearn/preprocessing/tests/test_common.py b/sklearn/preprocessing/tests/test_common.py index 7938256d482b7..802329fc5ce32 100644 --- a/sklearn/preprocessing/tests/test_common.py +++ b/sklearn/preprocessing/tests/test_common.py @@ -126,3 +126,33 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) assert len(records) == 0 assert_allclose(Xt_inv_sp.A, Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [(MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer('yeo-johnson'), power_transform), + (PowerTransformer('box-cox'), power_transform,), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), robust_scale), + (RobustScaler(with_centering=False), robust_scale)] +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + + X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8]]).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c']) + X_df['c'] = X_df['c'].astype('int') + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 418f037936c64..bcfd8fcd8d50e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -349,6 +349,37 @@ def test_check_array(): check_array(X, dtype="numeric") +@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"]) +@pytest.mark.parametrize("dtype, expected_dtype", [ + ([np.float32, np.float64], np.float32), + (np.float64, np.float64), + ("numeric", np.float64), +]) +def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + + X_np = np.array([[1, 2, 3, np.nan, np.nan], + [np.nan, np.nan, 8, 4, 6], + [1, 2, 3, 4, 5]]).T + + # Creates dataframe with IntegerArrays with pd.NA + X = pd.DataFrame(X_np, dtype=pd_dtype, columns=['a', 'b', 'c']) + # column c has no nans + X['c'] = X['c'].astype('float') + X_checked = check_array(X, force_all_finite='allow-nan', dtype=dtype) + assert_allclose(X_checked, X_np) + assert X_checked.dtype == expected_dtype + + X_checked = check_array(X, force_all_finite=False, dtype=dtype) + assert_allclose(X_checked, X_np) + assert X_checked.dtype == expected_dtype + + msg = "Input contains NaN, infinity" + with pytest.raises(ValueError, match=msg): + check_array(X, force_all_finite=True) + + def test_check_array_pandas_dtype_object_conversion(): # test that data-frame like objects with dtype object # get converted diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1fde1f0d69fb1..7a6ef1e05fdde 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -135,17 +135,20 @@ def as_float_array(X, *, copy=True, force_all_finite=True): returned if X's dtype is not a floating point type. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. The possibilities - are: + Whether to raise an error on np.inf, np.nan, pd.NA in X. 
The + possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + Returns ------- XT : {array, sparse matrix} @@ -317,17 +320,20 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. The possibilities - are: + Whether to raise an error on np.inf, np.nan, pd.NA in X. The + possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + Returns ------- spmatrix_converted : scipy sparse matrix. @@ -438,19 +444,20 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. - - For object dtyped data, only np.nan is checked and not np.inf. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + ensure_2d : boolean (default=True) Whether to raise a value error if array is not 2D. @@ -491,6 +498,7 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # check if the object contains several dtypes (typically a pandas # DataFrame), and store them. If not, store None. dtypes_orig = None + has_pd_integer_array = False if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be perserved (later). 
@@ -508,6 +516,19 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, for i, dtype_iter in enumerate(dtypes_orig): if dtype_iter.kind == 'b': dtypes_orig[i] = np.dtype(np.object) + elif dtype_iter.name.startswith(("Int", "UInt")): + # name looks like an Integer Extension Array, now check for + # the dtype + with suppress(ImportError): + from pandas import (Int8Dtype, Int16Dtype, + Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype) + if isinstance(dtype_iter, (Int8Dtype, Int16Dtype, + Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype)): + has_pd_integer_array = True if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) @@ -528,6 +549,10 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # list of accepted types. dtype = dtype[0] + if has_pd_integer_array: + # If there are any pandas integer extension arrays, + array = array.astype(dtype) + if force_all_finite not in (True, False, 'allow-nan'): raise ValueError('force_all_finite should be a bool or "allow-nan"' '. Got {!r} instead'.format(force_all_finite)) @@ -712,18 +737,21 @@ def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. This parameter - does not influence whether y can have np.inf or np.nan values. + Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + ensure_2d : boolean (default=True) Whether to raise a value error if X is not 2D. From 1bd740436ddeadd00567cd2ab7ec15e20c2f5a57 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 17:16:20 -0400 Subject: [PATCH 090/125] DOC Fixes formating in whats new (#17076) --- doc/whats_new/v0.23.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index aedd5fe804722..0e149ed03a9fa 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,8 +215,9 @@ Changelog raise `invalid value encountered in multiply` during `fit`. :pr:`16718` by :user:`Gui Miotto `. -- |Feature| Added `n_components_` attribute to :class:'decomposition.SparsePCA' - and :class:'MiniBatchSparsePCA'. :pr:'16981' by :user:'Mateusz Górski ' +- |Feature| Added `n_components_` attribute to :class:`decomposition.SparsePCA` + and :class:`decomposition.MiniBatchSparsePCA`. :pr:`16981` by + :user:`Mateusz Górski `. :mod:`sklearn.ensemble` ....................... 
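A hedged illustration of the conversion implemented in ``check_array`` in the
patch above (toy data, assuming pandas >= 1.0): a nullable integer column is
cast to a floating point array in which ``pd.NA`` becomes ``np.nan``::

    import numpy as np
    import pandas as pd
    from sklearn.utils import check_array

    X = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int16"),
                      "b": pd.array([4, None, 6], dtype="Int16")})

    X_checked = check_array(X, dtype="numeric", force_all_finite="allow-nan")
    print(X_checked.dtype)      # float64
    print(np.isnan(X_checked))  # True exactly where pd.NA was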
From 2dd12af9d687f4349a37ee0df90e61c9f992b092 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 28 Apr 2020 23:26:26 +0200 Subject: [PATCH 091/125] DOC Improve claim prediction example (#16648) Co-Authored-By: Christian Lorentzen Co-Authored-By: Nicolas Hug --- ...plot_poisson_regression_non_normal_loss.py | 537 +++++++++++------- 1 file changed, 325 insertions(+), 212 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4b0386edfcdf6..4fc3bea7bda51 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,87 +3,87 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of log-linear Poisson regression -on the `French Motor Third-Party Liability Claims dataset -`_ from [1]_ and compares -it with models learned with least squared error. In this dataset, each sample -corresponds to an insurance policy, i.e. a contract within an insurance -company and an individual (policiholder). Available features include driver -age, vehicle age, vehicle power, etc. - -A few definitions: a *claim* is the request made by a policyholder to the -insurer to compensate for a loss covered by the insurance. The *exposure* is -the duration of the insurance coverage of a given policy, in years. - -Our goal is to predict the expected number of insurance claims (or frequency) -following car accidents for a policyholder given the historical data over a -population of policyholders. +This example illustrates the use of log-linear Poisson regression on the +`French Motor Third-Party Liability Claims dataset +`_ from [1]_ and compares it with a linear +model fitted with the usual least squared error and a non-linear GBRT model +fitted with the Poisson loss (and a log-link). + +A few definitions: + +- A **policy** is a contract between an insurance company and an individual: + the **policyholder**, that is, the vehicle driver in this case. + +- A **claim** is the request made by a policyholder to the insurer to + compensate for a loss covered by the insurance. + +- The **exposure** is the duration of the insurance coverage of a given policy, + in years. + +- The claim **frequency** is the number of claims divided by the exposure, + typically measured in number of claims per year. + +In this dataset, each sample corresponds to an insurance policy. Available +features include driver age, vehicle age, vehicle power, etc. + +Our goal is to predict the expected frequency of claims following car accidents +for a new policyholder given the historical data over a population of +policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor - Third-Party Liability Claims (November 8, 2018). - `doi:10.2139/ssrn.3164764 `_ + Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 + `_ """ print(__doc__) - # Authors: Christian Lorentzen # Roman Yurchak +# Olivier Grisel # License: BSD 3 clause -import warnings - import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.datasets import fetch_openml -from sklearn.dummy import DummyRegressor -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Ridge, PoissonRegressor -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_even_slices -from sklearn.metrics import auc - -from sklearn.metrics import mean_squared_error, mean_absolute_error -from sklearn.metrics import mean_poisson_deviance - -def load_mtpl2(n_samples=100000): - """Fetch the French Motor Third-Party Liability Claims dataset. - - Parameters - ---------- - n_samples: int or None, default=100000 - Number of samples to select (for faster run time). If None, the full - dataset with 678013 samples is returned. - """ +############################################################################## +# The French Motor Third-Party Liability Claims dataset +# ----------------------------------------------------- +# +# Let's load the motor claim dataset from OpenML: +# https://www.openml.org/d/41214 - # freMTPL2freq dataset from https://www.openml.org/d/41214 - df = fetch_openml(data_id=41214, as_frame=True)['data'] +from sklearn.datasets import fetch_openml - # unquote string fields - for column_name in df.columns[df.dtypes.values == np.object]: - df[column_name] = df[column_name].str.strip("'") - if n_samples is not None: - return df.iloc[:n_samples] - return df +df = fetch_openml(data_id=41214, as_frame=True).frame +df ############################################################################## -# Let's load the motor claim dataset. We ignore the severity data for this -# study for the sake of simplicitly. +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval (``Exposure``, +# in units of years). # -# We also subsample the data for the sake of computational cost and running -# time. Using the full dataset would lead to similar conclusions. +# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally +# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as +# ``sample_weight``. 
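The reason ``Exposure`` can be used as ``sample_weight`` here is that, for a
log-link Poisson model, the exposure-weighted Poisson deviance of the
frequencies differs from the Poisson negative log-likelihood of the raw claim
counts (with ``log(Exposure)`` as an offset) only by a constant, so both
objectives are minimized by the same model. A small numerical sanity check of
this claim on made-up numbers (an editorial sketch, not part of the example)::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.1, 1.0, size=5)
    counts = rng.poisson(0.3 * exposure)
    freq = counts / exposure

    def count_nll(pred_freq):
        # Poisson negative log-likelihood of the counts (dropping log(y!)),
        # with expected count = exposure * predicted frequency.
        mu = exposure * pred_freq
        return np.sum(mu - counts * np.log(mu))

    def weighted_half_deviance(pred_freq):
        # Exposure-weighted half Poisson deviance of the frequencies,
        # dropping the terms that do not depend on the prediction.
        return np.sum(exposure * (pred_freq - freq * np.log(pred_freq)))

    p1, p2 = np.full(5, 0.2), np.full(5, 0.4)
    # The difference between the two objectives is the same constant for any
    # prediction, hence the same minimizer:
    assert np.isclose(count_nll(p1) - weighted_half_deviance(p1),
                      count_nll(p2) - weighted_half_deviance(p2))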
+ +df["Frequency"] = df["ClaimNb"] / df["Exposure"] + +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) -df = load_mtpl2(n_samples=300000) +print("Fraction of exposure with zero claims = {0:.1%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) -# Correct for unreasonable observations (that might be data error) -df["Exposure"] = df["Exposure"].clip(upper=1) +fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4)) +ax0.set_title("Number of claims") +_ = df["ClaimNb"].hist(bins=30, log=True, ax=ax0) +ax1.set_title("Exposure in years") +_ = df["Exposure"].hist(bins=30, log=True, ax=ax1) +ax2.set_title("Frequency (number of claims per year)") +_ = df["Frequency"].hist(bins=30, log=True, ax=ax2) ############################################################################## # The remaining columns can be used to predict the frequency of claim events. @@ -93,6 +93,12 @@ def load_mtpl2(n_samples=100000): # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.compose import ColumnTransformer + + log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() @@ -112,123 +118,144 @@ def load_mtpl2(n_samples=100000): remainder="drop", ) -############################################################################## -# The number of claims (``ClaimNb``) is a positive integer that can be modeled -# as a Poisson distribution. It is then assumed to be the number of discrete -# events occurring with a constant rate in a given time interval -# (``Exposure``, in units of years). Here we model the frequency -# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as ``sample_weight``. - -df["Frequency"] = df["ClaimNb"] / df["Exposure"] - -print( - pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() -) - -print("Average Frequency = {}" - .format(np.average(df["Frequency"], weights=df["Exposure"]))) - -print("Percentage of zero claims = {0:%}" - .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / - df["Exposure"].sum())) ############################################################################## -# It is worth noting that 92 % of policyholders have zero claims, and if we -# were to convert this problem into a binary classification task, it would be -# significantly imbalanced. +# A constant prediction baseline +# ------------------------------ +# +# It is worth noting that more than 93% of policyholders have zero claims. If +# we were to convert this problem into a binary classification task, it would +# be significantly imbalanced, and even a simplistic model that would only +# predict mean can achieve an accuracy of 93%. # # To evaluate the pertinence of the used metrics, we will consider as a # baseline a "dummy" estimator that constantly predicts the mean frequency of # the training sample. 
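As a toy reminder (made-up numbers, not from the dataset) of what this baseline
does, ``DummyRegressor(strategy='mean')`` memorizes the, possibly weighted,
mean of the training target and predicts that constant for every sample::

    import numpy as np
    from sklearn.dummy import DummyRegressor

    y = np.array([0.0, 0.0, 1.0, 3.0])
    X = np.zeros((4, 1))                      # features are ignored
    dummy = DummyRegressor(strategy="mean").fit(X, y)
    print(dummy.predict(np.zeros((1, 1))))    # [1.]

    w = np.array([3.0, 1.0, 1.0, 1.0])
    dummy_w = DummyRegressor(strategy="mean").fit(X, y, sample_weight=w)
    print(dummy_w.predict(np.zeros((1, 1))))  # [0.666...], the weighted mean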
-df_train, df_test = train_test_split(df, random_state=0) +from sklearn.dummy import DummyRegressor +from sklearn.pipeline import Pipeline +from sklearn.model_selection import train_test_split -dummy = make_pipeline( - linear_model_preprocessor, - DummyRegressor(strategy='mean') -) -dummy.fit(df_train, df_train["Frequency"], - dummyregressor__sample_weight=df_train["Exposure"]) +df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) + +dummy = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", DummyRegressor(strategy='mean')), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) + + +############################################################################## +# Let's compute the performance of this constant prediction baseline with 3 +# different regression metrics: + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def score_estimator(estimator, df_test): """Score an estimator on the test set.""" - y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) - # ignore non-positive predictions, as they are invalid for - # the Poisson deviance + # Ignore non-positive predictions, as they are invalid for + # the Poisson deviance. mask = y_pred > 0 if (~mask).any(): - warnings.warn("Estimator yields non-positive predictions for {} " - "samples out of {}. These will be ignored while " - "computing the Poisson deviance" - .format((~mask).sum(), mask.shape[0])) + n_masked, n_samples = (~mask).sum(), mask.shape[0] + print(f"WARNING: Estimator yields invalid, non-positive predictions " + f" for {n_masked} samples out of {n_samples}. These predictions " + f"are ignored when computing the Poisson deviance.") print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], y_pred[mask], - df_test["Exposure"][mask])) + sample_weight=df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## -# We start by modeling the target variable with the least squares linear -# regression model, +# (Generalized) Linear models +# --------------------------- +# +# We start by modeling the target variable with the (l2 penalized) least +# squares linear regression model, more comonly known as Ridge regression. We +# use a low penalization `alpha`, as we expect such a linear model to under-fit +# on such a large dataset. -ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) -ridge.fit(df_train, df_train["Frequency"], - ridge__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import Ridge + + +ridge_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", Ridge(alpha=1e-6)), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) ############################################################################## # The Poisson deviance cannot be computed on non-positive values predicted by -# the model. For models that do return a few non-positive predictions -# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# the model. 
For models that do return a few non-positive predictions (e.g. +# :class:`~sklearn.linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# approach could be to use :class:`compose.TransformedTargetRegressor` +# approach could be to use :class:`~sklearn.compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") -score_estimator(ridge, df_test) +score_estimator(ridge_glm, df_test) ############################################################################## # Next we fit the Poisson regressor on the target variable. We set the -# regularization strength ``alpha`` to 1 over number of samples in oder to -# mimic the Ridge regressor whose L2 penalty term scales differently with the -# number of samples. +# regularization strength ``alpha`` to approximately 1e-6 over number of +# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty +# term scales differently with the number of samples. -poisson = make_pipeline( - linear_model_preprocessor, - PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) -) -poisson.fit(df_train, df_train["Frequency"], - poissonregressor__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import PoissonRegressor + +n_samples = df_train.shape[0] + +poisson_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)) +]) +poisson_glm.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") -score_estimator(poisson, df_test) +score_estimator(poisson_glm, df_test) ############################################################################## -# Finally, we will consider a non-linear model, namely a random forest. Random -# forests do not require the categorical data to be one-hot encoded: instead, -# we can encode each category label with an arbitrary integer using -# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will -# treat the categorical features as ordered features, which might not be always -# a desired behavior. However this effect is limited for deep enough trees -# which are able to recover the categorical nature of the features. The main -# advantage of the :class:`preprocessing.OrdinalEncoder` over the -# :class:`preprocessing.OneHotEncoder` is that it will make training faster. - -rf_preprocessor = ColumnTransformer( +# Finally, we will consider a non-linear model, namely Gradient Boosting +# Regression Trees. Tree-based models do not require the categorical data to be +# one-hot encoded: instead, we can encode each category label with an arbitrary +# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this +# encoding, the trees will treat the categorical features as ordered features, +# which might not be always a desired behavior. However this effect is limited +# for deep enough trees which are able to recover the categorical nature of the +# features. The main advantage of the +# :class:`~sklearn.preprocessing.OrdinalEncoder` over the +# :class:`~sklearn.preprocessing.OneHotEncoder` is that it will make training +# faster. +# +# Gradient Boosting also gives the possibility to fit the trees with a Poisson +# loss (with an implicit log-link function) instead of the default +# least-squares loss. Here we only fit trees with the Poisson loss to keep this +# example concise. 
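A tiny sketch of the ordinal encoding mentioned above, on made-up labels::

    from sklearn.preprocessing import OrdinalEncoder

    enc = OrdinalEncoder()
    print(enc.fit_transform([["Diesel"], ["Regular"], ["Diesel"]]))
    # [[0.]
    #  [1.]
    #  [0.]]  -- the integer codes carry no meaningful order for the trees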
+ +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.preprocessing import OrdinalEncoder + + +tree_preprocessor = ColumnTransformer( [ ("categorical", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), @@ -237,22 +264,22 @@ def score_estimator(estimator, df_test): ], remainder="drop", ) -rf = make_pipeline( - rf_preprocessor, - RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) -) -rf.fit(df_train, df_train["Frequency"].values, - randomforestregressor__sample_weight=df_train["Exposure"].values) - +poisson_gbrt = Pipeline([ + ("preprocessor", tree_preprocessor), + ("regressor", HistGradientBoostingRegressor(loss="poisson", + max_leaf_nodes=128)), +]) +poisson_gbrt.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) -print("RandomForestRegressor evaluation:") -score_estimator(rf, df_test) +print("Poisson Gradient Boosted Trees evaluation:") +score_estimator(poisson_gbrt, df_test) ############################################################################## -# Like the Ridge regression above, the random forest model minimizes the -# conditional squared error, too. However, because of a higher predictive -# power, it also results in a smaller Poisson deviance than the Poisson +# Like the Ridge regression above, the gradient boosted trees model minimizes +# the conditional squared error. However, because of a higher predictive power, +# it also results in a smaller Poisson deviance than the linear Poisson # regression model. # # Evaluating models with a single train / test split is prone to random @@ -263,7 +290,7 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: -fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 for row_idx, label, df in zip(range(2), @@ -278,7 +305,7 @@ def score_estimator(estimator, df_test): axes[row_idx, 0].set_ylim([1e1, 5e5]) axes[row_idx, 0].set_ylabel(label + " samples") - for idx, model in enumerate([ridge, poisson, rf]): + for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]): y_pred = model.predict(df) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), @@ -292,21 +319,42 @@ def score_estimator(estimator, df_test): ############################################################################## # The experimental data presents a long tail distribution for ``y``. In all -# models we predict a mean expected value, so we will have necessarily fewer -# extreme values. Additionally, the normal distribution used in ``Ridge`` and -# ``RandomForestRegressor`` has a constant variance, while for the Poisson -# distribution used in ``PoissonRegressor``, the variance is proportional to -# the mean predicted value. +# models, we predict the expected frequency of a random variable, so we will +# have necessarily fewer extreme values than for the observed realizations of +# that random variable. This explains that the mode of the histograms of model +# predictions doesn't necessarily correspond to the smallest value. +# Additionally, the normal distribution used in ``Ridge`` has a constant +# variance, while for the Poisson distribution used in ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor``, the variance is proportional to the +# predicted expected value. 
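To make the variance claim above concrete, a quick editorial check (simulated
draws, not part of the example) that the variance of a Poisson distributed
variable equals, and therefore grows with, its mean, unlike the
constant-variance normal assumption::

    import numpy as np

    rng = np.random.RandomState(0)
    for lam in [0.1, 1.0, 10.0]:
        draws = rng.poisson(lam, size=100000)
        print(lam, draws.mean().round(3), draws.var().round(3))
    # mean and variance are both close to lam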
+# +# Thus, among the considered estimators, ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor`` are a-priori better suited for modeling the +# long tail distribution of the non-negative data as compared to the ``Ridge`` +# model which makes a wrong assumption on the distribution of the target +# variable. +# +# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and +# is able to predict higher expected values. # -# Thus, among the considered estimators, ``PoissonRegressor`` is better suited -# for modeling the long tail distribution of the data as compared to the -# ``Ridge`` and ``RandomForestRegressor`` estimators. +# Note that we could have used the least squares loss for the +# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal +# distribution the response variable as for the `Ridge` model, and possibly +# also lead to slightly negative predictions. However the gradient boosted +# trees would still perform relatively well and in particular better than +# ``PoissonRegressor`` thanks to the flexibility of the trees combined with the +# large number of training samples. +# +# Evaluation of the calibration of predictions +# -------------------------------------------- # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to ``y_pred`` returned # by each model. Then for each bin, we compare the mean predicted ``y_pred``, # with the mean observed target: +from sklearn.utils import gen_even_slices + def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): @@ -352,104 +400,169 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +print(f"Actual number of claims: {df_test['ClaimNb'].sum()}") +fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, + dummy]): y_pred = model.predict(df_test) - + y_true = df_test["Frequency"].values + exposure = df_test["Exposure"].values q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( - df_test["Frequency"].values, - y_pred, - sample_weight=df_test["Exposure"].values, - n_bins=10) + y_true, y_pred, sample_weight=exposure, n_bins=10) + + # Name of the model after the estimator used in the last step of the + # pipeline. + print(f"Predicted number of claims by {model[-1]}: " + f"{np.sum(y_pred * exposure):.1f}") - axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") - axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") + axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions") + axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.6) + axi.set_ylim(0, 0.5) axi.set( - title=model[-1].__class__.__name__, + title=model[-1], xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) axi.legend() plt.tight_layout() -############################################################################## -# The ``Ridge`` regression model can predict very low expected frequencies -# that do not match the data. It can therefore severly under-estimate the risk -# for some policyholders. 
-# -# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency -# between predicted and observed targets, especially for low predicted target -# values. -# -# However, for some business applications, we are not necessarily interested -# in the ability of the model to predict the expected frequency value, but -# instead to predict which policyholder groups are the riskiest and which are -# the safest. In this case, the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. -# -# To compare the 3 models within this perspective, one can plot the fraction of -# the number of claims vs the fraction of exposure for test samples ordered by -# the model predictions, from safest to riskiest according to each model: - - -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred) # from safest to riskiest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) +############################################################################### +# The dummy regression model predicts a constant frequency. This model does not +# attribute the same tied rank to all samples but is none-the-less globally +# well calibrated (to estimate the mean frequency of the entire population). +# +# The ``Ridge`` regression model can predict very low expected frequencies that +# do not match the data. It can therefore severly under-estimate the risk for +# some policyholders. +# +# ``PoissonRegressor`` and ``HistGradientBoostingRegressor`` show better +# consistency between predicted and observed targets, especially for low +# predicted target values. +# +# The sum of all predictions also confirms the calibration issue of the +# ``Ridge`` model: it under-estimates by more than 3% the total number of +# claims in the test set while the other three models can approximately recover +# the total number of claims of the test portfolio. +# +# Evaluation of the ranking power +# ------------------------------- +# +# For some business applications, we are interested in the ability of the model +# to rank the riskiest from the safest policyholders, irrespective of the +# absolute value of the prediction. In this case, the model evaluation would +# cast the problem as a ranking problem rather than a regression problem. +# +# To compare the 3 models from this perspective, one can plot the cumulative +# proportion of claims vs the cumulative proportion of exposure for the test +# samples order by the model predictions, from safest to riskiest according to +# each model. 
+# +# This plot is called a Lorenz curve and can be summarized by the Gini index: + +from sklearn.metrics import auc + + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_frequencies = y_true[ranking] + ranked_exposure = exposure[ranking] + cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure) cumulated_claims /= cumulated_claims[-1] + cumulated_exposure = np.cumsum(ranked_exposure) + cumulated_exposure /= cumulated_exposure[-1] return cumulated_exposure, cumulated_claims fig, ax = plt.subplots(figsize=(8, 8)) -for model in [ridge, poisson, rf]: +for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]: y_pred = model.predict(df_test) - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label = "{} (area under curve: {:.3f})".format( - model[-1].__class__.__name__, area) + cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, + df_test["Exposure"]) + gini = 1 - 2 * auc(cum_exposure, cum_claims) + label = "{} (Gini: {:.2f})".format(model[-1], gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], + df_test["Frequency"], + df_test["Exposure"]) +gini = 1 - 2 * auc(cum_exposure, cum_claims) +label = "Oracle (Gini: {:.2f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( - title="Cumulated number of claims by model", - xlabel='Fraction of exposure (from safest to riskiest)', - ylabel='Fraction of number of claims' + title="Lorenz curves by model", + xlabel='Cumulative proportion of exposure (from safest to riskiest)', + ylabel='Cumulative proportion of claims' ) ax.legend(loc="upper left") ############################################################################## -# This plot reveals that the random forest model is slightly better at ranking -# policyholders by risk profiles even if the absolute value of the predicted -# expected frequencies are less well calibrated than for the linear Poisson -# model. +# As expected, the dummy regressor is unable to correctly rank the samples and +# therefore performs the worst on this plot. +# +# The tree-based model is significantly better at ranking policyholders by risk +# while the two linear models perform similarly. # # All three models are significantly better than chance but also very far from # making perfect predictions. # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset or that are indeed random. +# captured in the columns of the dataset and can indeed be considered as purely +# random. +# +# The linear models assume no interactions between the input variables which +# likely causes under-fitting. 
Inserting a polynomial feature extractor +# (:func:`~sklearn.preprocessing.PolynomialFeatures`) indeed increases their +# discrimative power by 2 points of Gini index. In particular it improves the +# ability of the models to identify the top 5% riskiest profiles. +# +# Main takeaways +# -------------- +# +# - The performance of the models can be evaluted by their ability to yield +# well-calibrated predictions and a good ranking. +# +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# ranking power. +# +# - The calibration of the model can be assessed by plotting the mean observed +# value vs the mean predicted value on groups of test samples binned by +# predicted risk. +# +# - The least squares loss (along with the implicit use of the identity link +# function) of the Ridge regression model seems to cause this model to be +# badly calibrated. In particular, it tends to underestimate the risk and can +# even predict invalid negative frequencies. +# +# - Using the Poisson loss with a log-link can correct these problems and lead +# to a well-calibrated linear model. +# +# - Despite the improvement in calibration, the ranking power of both linear +# models are comparable and well below the ranking power of the Gradient +# Boosting Regression Trees. +# +# - The Poisson deviance computed as an evaluation metric reflects both the +# calibration and the ranking power of the model. It also makes a linear +# assumption on the ideal relationship between the expected value and the +# variance of the response variable. For the sake of conciseness we did not +# check whether this assumption holds. +# +# - Traditional regression metrics such as Mean Squared Error and Mean Absolute +# Error are hard to meaningfully interpret on count values with many zeros. plt.show() From 1c69a8a55cc18b461b1befbd68c99a5020140363 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 29 Apr 2020 02:26:51 +0200 Subject: [PATCH 092/125] DOC small typos and fixes for poisson example (#17078) --- ...plot_poisson_regression_non_normal_loss.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4fc3bea7bda51..3a24b55848013 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -277,10 +277,9 @@ def score_estimator(estimator, df_test): ############################################################################## -# Like the Ridge regression above, the gradient boosted trees model minimizes -# the conditional squared error. However, because of a higher predictive power, -# it also results in a smaller Poisson deviance than the linear Poisson -# regression model. +# Like the Poisson GLM above, the gradient boosted trees model minimizes +# the Poisson deviance. However, because of a higher predictive power, +# it reaches lower values of Poisson deviance. # # Evaluating models with a single train / test split is prone to random # fluctuations. If computing resources allow, it should be verified that @@ -339,7 +338,7 @@ def score_estimator(estimator, df_test): # # Note that we could have used the least squares loss for the # ``HistGradientBoostingRegressor`` model. 
This would wrongly assume a normal -# distribution the response variable as for the `Ridge` model, and possibly +# distributed response variable as does the `Ridge` model, and possibly # also lead to slightly negative predictions. However the gradient boosted # trees would still perform relatively well and in particular better than # ``PoissonRegressor`` thanks to the flexibility of the trees combined with the @@ -533,13 +532,9 @@ def lorenz_curve(y_true, y_pred, exposure): # Main takeaways # -------------- # -# - The performance of the models can be evaluted by their ability to yield +# - The performance of the models can be evaluated by their ability to yield # well-calibrated predictions and a good ranking. # -# - The Gini index reflects the ability of a model to rank predictions -# irrespective of their absolute values, and therefore only assess their -# ranking power. -# # - The calibration of the model can be assessed by plotting the mean observed # value vs the mean predicted value on groups of test samples binned by # predicted risk. @@ -552,6 +547,10 @@ def lorenz_curve(y_true, y_pred, exposure): # - Using the Poisson loss with a log-link can correct these problems and lead # to a well-calibrated linear model. # +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# ranking power. +# # - Despite the improvement in calibration, the ranking power of both linear # models are comparable and well below the ranking power of the Gradient # Boosting Regression Trees. From ad6a9f977e374842578d3da70c873a451776891f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Apr 2020 04:05:37 -0400 Subject: [PATCH 093/125] TST Skips derivative check on 32bit platforms (#17073) * TST Checks type for derivative check * TST Skips test for 32bit linux * REV Less diffs --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 7fc6ab9097873..c3f6ded7be39a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -9,6 +9,7 @@ from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.utils._testing import skip_if_32bit def get_derivatives_helper(loss): @@ -58,8 +59,7 @@ def get_hessians(y_true, raw_predictions): ]) @pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') -@pytest.mark.skipif(Y_DTYPE != np.float64, - reason='Newton internally uses float64 != Y_DTYPE') +@skip_if_32bit def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array # using Halley's method with the first and second order derivatives From 9f04837ea991b1a063c48e48175fd179be1885ad Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 29 Apr 2020 14:30:44 +0200 Subject: [PATCH 094/125] DOC Update funders. 
(#17079) --- doc/about.rst | 42 +++++++++++++++++----------------- doc/images/anaconda-small.png | Bin 11313 -> 0 bytes doc/templates/index.html | 1 - 3 files changed, 21 insertions(+), 22 deletions(-) delete mode 100644 doc/images/anaconda-small.png diff --git a/doc/about.rst b/doc/about.rst index a6cdd54eb9201..814a4724d9579 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -271,14 +271,18 @@ July 2017.
-............ +Past Sponsors +............. .. raw:: html
-`Anaconda, Inc `_ funds Adrin Jalali since 2019. +`INRIA `_ actively supports this project. It has +provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler +(2012-2013) and Olivier Grisel (2013-2017) to work on this project +full-time. It also hosts coding sprints and other events. .. raw:: html @@ -286,67 +290,63 @@ July 2017.
-.. image:: images/anaconda.png +.. image:: images/inria-logo.jpg :width: 100pt :align: center - :target: https://sydney.edu.au/ + :target: https://www.inria.fr .. raw:: html
-Past Sponsors -............. +..................... .. raw:: html
-`INRIA `_ actively supports this project. It has -provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler -(2012-2013) and Olivier Grisel (2013-2017) to work on this project -full-time. It also hosts coding sprints and other events. +`Paris-Saclay Center for Data Science +`_ +funded one year for a developer to work on the project full-time +(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the +time of Joris van den Bossche (2017-2018). .. raw:: html
-
-.. image:: images/inria-logo.jpg +.. image:: images/cds-logo.png :width: 100pt :align: center - :target: https://www.inria.fr + :target: https://www.datascience-paris-saclay.fr/ .. raw:: html
-..................... +............ .. raw:: html
-`Paris-Saclay Center for Data Science -`_ -funded one year for a developer to work on the project full-time -(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the -time of Joris van den Bossche (2017-2018). +`Anaconda, Inc `_ funded Adrin Jalali in 2019. .. raw:: html
+
-.. image:: images/cds-logo.png +.. image:: images/anaconda.png :width: 100pt :align: center - :target: https://www.datascience-paris-saclay.fr/ + :target: https://www.anaconda.com/ .. raw:: html diff --git a/doc/images/anaconda-small.png b/doc/images/anaconda-small.png deleted file mode 100644 index ccb8bb8b707deca78f49e2423dbd380b48ba4052..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11313 zcmV-1EY8!3P) zaB^>EX>4U6ba`-PAZ2)IW&i+q+O3;eawNHOMgOr1Edlee9E@h#ftIgx5y9fntS5Vt zs=AA0W@H2acfW%JH2c5)^O*nOPl>^rm`cqpXUm`1V)LDEs(t=EU!9Hj-}@(iyyw2Z zZhU;-@VpfG8lE5Xey{Jmo<2WN-s|1?{<vjI+6|Yx3--r83 z2*0eK`LlGtEa1nw{#-vh-z&oBay&Qsm{$I=PVeh3{l0#8mA;P1`S%k4`G)@N<%a$G z=gaz?v+6nfnO%?Buw40_)W;O>cU;I#Vz?j6{4M;S_`ck~jlb1)JBiw2s>2Y6>6-Jb z>~Ya8*WGdZew}VI#OSv#eD!|%a9^vT_~z#gO4MI>d@;mlg-T2&kqf-+|BS`m=Qcf$ zhgF`u99MeB%@oCNf6dPq{-RK;pxpP{~`m7b>^2#rUp~&f9t|B1ryla~7`a1J` zu|NMPuz?7syUdjdb~~P1jO0FZE8I8-n+%sm`_zjo>%9R>#IqBFae)Zf>_T$c+2Vb1 zj&-cWr!#UNq8~)SC6oMOls<-#2~PETHlHc(Uh9*8UItzWL<%_+LPG%&tQb?w)LSKm zx=A6$lu}M5)znhYA;+9@&Ls<{dI=?#RB|b$mR5QVHP%#fEw$EGd-E*-W68AKN~^84 z-g#(`oqKhz@4PVl2qTU(@+hN@Hu@xeW}Io}S!SJW_T^VtK!90!l~q?;eY;I6?YPs< zyX?B#?uT4E;lz_pKIPQYPX80NH>%&G_A_$-I%@8Xn!kb4b>%Z^yj;r1Eu7#aDP}~> zM@PhqB0xfW#mrV0qgUhNl`k0Bed+)*6)5oXGy^rpq7e4QykKKD%+ro3pA9wjN6Z7wm zq|_D04yCL?Hn#GtX&ELL2@qnnMwqrzwF4vE@H z)fGP=vEre}x=YxD^;$Dif`M_=QsQpj4F<_8!Bbgr_K12haLt~U^y)kIRjK3Di75r{ zLZy@vpb!f4>|yYfKH@5=fe<}2#d>-vXZjA*mXs(;8!9oVH7}*`JfSDcy0}$bW9ehr z1Ivi6DF ztW>(!oQOlq=gy{rnhPl9hCnH)%>BWxQ_`ws_jO!4iTN#%iwK12PBGx3^IEsyBiWq> zppKw)?W9>ij8p+2x7QXzgRsCF@dW%pJU~ih3ucCCMrTZs8nGp$U4W%}Y)X?K>}U&d z)BxF02~_jh9!ZKGn~gbyN_2??8RM(jel(ylmyQFN#Chxl#FBR_D@~{WC~8vRmWh9g zWh4^uh@%8TMD60ry#Jm}j%=0(s3#yyD15o}=ao?j^b{4(Fx=ih6~iM+h5e#^?_lkC zK6LM)J6wGHxWo5xn$Lm$xeL)7$92%e<=TQiM#&Wjsh$B#k!Fsh?S#4PunzggV(UJ2 z!C}LZs$>Y|&cNF$BxPe1-gd>j9M}pq$sp?;-3A8>6(G0>7e+pD_x}RB=C2HL$WT1W zv($))jlcy+GD!;r693>gHA%*+rh}<-Je8QnHx_3-80`ySzX-bLDWU-|DJUxCdJw0^+a{2i@FJoCZk_u9Y1IukG@ zhYbvezWS3*5khg~z-l7r)4*K`kzPd2+s#$F3KdNQO5}!0V0C;pBuFHE8W2V4wkcdh zMwERD@AL^!LigyIHd_U}JJ+(Bz!B_a(L;9S&C(g@F1EHw6zYQX%X`ZXp-3vB1)c_B z1~OL_k)di9a)Nud9mRyXSc(^{G!Bkw+hd8qPO(V!W9K!98hj*=9UQU?{T&c^_}X5w zKog~oYO5>(h%m5O3_3v)r+Z`_*y_+w2aR~*1U9N%q$mgGoI%j5g+-gJ`c&ta_oDH#)89>TtxnY z@2en|?wj{5^Snm_Kkup`^p51mSJJ26E5UmluD&a_9G^Q=izqf=;ljs2A1(k`SASgl z^~&eM=Q7~E;EO2C2ukCr?KF%KL?HmkvBM?E98X+4w_pgAG!=;&1QbYROf;$xHs_|w z$JJc|ngo~zl@-g?rbQV6qf1QjC&5=*Xb2GrMCDL?h%yqJ-0|db^@T#i(dHMkQ5bRL z#fNf*1`!Q;PEwko;6BPhq)$1QnRg$zJw{q;v~LzcirqU#bVPLI&rVOaunIP1{yRz> z2vYhUI@G&SRLs=@5TD3M>cWHCOaNd@D5p&BZO8KYN+zF9KG#ID3?@qn&Wf#V__FFD zKSU88fjlV^$Se}M=!~P#`vUNKugy}Tc4qRdIkJSHu>dPk!wY4t>Of7|!8&K3>XLX= zTLx%rD7FT!$uq*5_)F|ZXoPPfBD{o~)W~J9w8zyTV#|SM6&mveZB4wB!Iz&zAj*?s z(_tSh`F~e60EMs>RrM?@^Ui-MPV-rv%Gq0=->dU`ZK^b%!qgbQ6sGw{WxjREZQshY z_DgMkFU_|wRhQ;lZGNfDKh^iAHh(MJ&r&phsm=eX+TNo4EMfD#hQF6Et3bMt05+$D zHiH1*HWXq-^F*gYY-u&HdnAKaSUwCmllYU}D7YCjlW%;nB5;ywmMq$9!*L^DHw%or zD9Q6*TU29M2F}D>_%s6K#o zLZ|w{U?)Tb57p)*KRn5ayurj`^BH#a=rh23b_6vM7e-8_Tt@<~i7Jf8OmZONYN;y{ z$}*DyHp^(r+>Hoc1>1w%d*}^|x~UEd#peUaP>UN?F>){z8YPUl!T3(@SLC2E9gJv$ zi&L5p2?@xx*bu+*3goAtP$}^N?ntd+65I7)|8y^8`FZ@=hl3#<)#w6v)h4+us-U(5 zYJ_E!Qz6X(duYem^*jO&{D~Uh2))c&zJQPU6@BE|QsEgaMot$(1~= zGBz@6WU`uEOvV5%%aJ4AwIN zLfLGm?jjDP;3`&t0I4$^9c%ASg2{F)AUcdTQzrNj>2L$tO2V^P9%@K!_kHZu{WkOjE{7sTnQvL`jD5kP!Xv^!#X(cgtYUKt0H_tsZ97yZuX zwfEAFOh-bfSGLqKm0xuj3gdQ|QXdwt12CeaCm&Q4k=`xwFBdGoDzpk=L;I4b;2NPV zPiQxZlF1|Pop_QEOMDY3=aR7R*Ha+lJ;iJYNK zXUWh<(~`;oyHJ&kh>xv~6pJm1#~lWf0&qheBjil-rsS)-PVgf}X7?Lw!<<+=D6A&! 
zs<2o{nx}ZLRr+!w!zLto!HJ-9l^B=&{m?XaI?NSgwv@e78_*pnJ>TXOq zM}ATxy$Dwkf5BsGyP2gDxbSp~D`UoYB3*c(8g_U2@xuY0Kssr6sRuBmG&mAXt;c)n zIt&9?2K;0ZN?6I$A~m*f<=jtNCMR`B1xihEv8IcW*3+xiGVp41-Z1S)4<0*!Sy;SbwTUcdJQF1kw`o zZSKONdMUw!<%R9F)kIYcirLFjW?};-Ua9YR95YU#K%TIkfMJjq>6rhar89xyjwjC8 zXSrbkTw4t41-NDLC%9)Qd&orBSQ!Gi!Hh}&!C(>IYC{o!C>;d>lVU)o;-&sIMNR3< zrqonl!u-B{s&+l=SZ|x73QuKkp7~z+DswPocb=VqMfERac?S;~iVcm$w`v*U=bcw? zxULFvs;lhf!UGp*Yl+XpIz%Nwri}_TINmTRkf!bewQsROLCui>h&LvxF6Ri1$)N*0 z4nTco6c=&da-9*uAu9@w5fukQBJDf_iB>I+jm(C7a0OFvTeQs|Ae4^$3Od8FuBRn4 zL~AR0!@6ckqtR~%wGTAe#0e)vQq5Ep)HP>N|FMmVEOL2GY@^dKAq4JN1kX$@1=11M zGSqvR>Qf+6Q4XxKOy0RLu7*9N8L;E%Sdxf20-R`z8im@^?fx1HCxzeyF5M(V~1WEI>H`78JO`9R3<$92w3L;|4YKhK9i^ zF9yy4paBvWgw$RfdEiaRR!2Ip=^DShw#W$tLlSRo5AE0*auIbnbbNx506Q=(D;9C! z>iyxFqtWpR>L2^9D$INuwRv}AeIbMmZYoo9V(`Gb5pgPku1!(c6fG``MDZrO8aDJ( z08$(}p3Zr^sKSr?1W6Vol{ebAwvVfv)`U+802&ZdjYy*cQ^VhIYc1Wqjh{U7pxEj=vo6$Lnx+Db> z`KL2kAT?na*gK4$4}8&VVzQ|jGEN8f5GT}QN9RpB(@x5PvXDpw6cPXg4}gafvI}0t z*u%9IUx@*0+6^(GW$X=g9B(^#X8Y{2T|OcvR4Wl<_DefUTIvqG;C7E1*5EN7TkvfV z`p|>6UP-I$wD+A@&j@-pwNj~gaD|*iHZvKW_Yk3LX|$KFeYqkK)>%Jka*33_FQo;EtO$VvMFROK~2E|e8QhgvJth`iw zQ!JKsPC(MzkUH9uvXCU+7+ZrlW7WY?uWbNI2!7%* zI>%_9MnID89KIx{HW!XHNd0R-Uz6_5IlV(X$!VQU!eStt$ZYUkjT78LN8GWPj;SIU z1X_f4D(VOo!jFh-l5{)jm=ASvr zDEBK|k(K`kGC68$0~5gVmcnL;6YjZ(Jf&`6$|~(vSpsj>R}vDMBP%!X;YF?2derbt zNb=#U=$G(RuwR`x!J;}$1a~SOq7De{iI1QrvFqwAvxq`J@S;jTjqI@V3uIXdp@trD zIueEOOu1zilI&Ozf7Icr8KI#O%msX&iw%$PGj&v4ReL}U{{UHCZPW)S5_D1kbq9VX zTs#zqAP&^#wksIXm(Dg|vl#B{e%J;Q2_RMR?A_t zU>^vIwGh3s!~_fmNID$_Xj(E+CzldWyOA6^PRazKcvVD~lSSH?OYurV;ID7im`cHT z%i-;u_X*Ve+~^UQ+Lom=EgjKJ5Mi~YLvFX`rx{o!NElB%)YvVQw~P9Tlt79i^7707SaLuel#lZiS|hrp2zsWmK`f;5opvlc(~NVf%p#$r6>jI zgM&4bYmGuRklUa|4;5K^AQZy5&3)U}9^}TtKnu}JB!u@?YM_uu?~zadqDPGQ88S7P z#f-LQVh+aFxf^hs)h}AngCh!VAt4*C3?Z2$iFFPG=ZAw3UFrm%kywSL2Qq%@Sb!u$ zvFbv)4dJhnI)Q>>di9ptRBNA0GJw-4}J{sfz-l8fJCJ5RLr#J5lJ&< z##-y}5uP^63iri;@p8f$Ayn{2D_ssHpr%t&gRMJKWO|*C=uaa7OmGPI^5h7@c>Ou> zuQK&SrZD)-ik~XfJSs@U=T-zC&02fDZR}bh&rU5erd$={j6EZuiVrQFiaA3mWx4bXP}qN+H7c{C1*W&uG8=U z$U)X^9Ra3F8yW{&6j-O^n1GH*E3PgHUf0C;7qKFU4XjP4-ETm_GIPkLZgQX1#`{^u zg2p6`u`BW`yJ-Uv9g6eArLJgjFjCMXx{xPiMS_a|2!u`p0ZX9*GY!A3_79T!3@c|D z(Dc%`y`l#|TN09S@{rhduv4oCv{enVf@!kIq%ZtEGOW4CRQ)0id`DeG*jVM#onVC2tbnr|(opl;Y$cHRP>sq2YDV(NFT7*ArR^SLMtbiQ7wpet) zpCTUdK+l+X@4x}D`1Zg$cp#e9nrhdFHNdT$GIXrjGOE=B1%r79HcXNN_uNC>3t+3N zucNVAv7gTAI!tTKXsCsU(#CH9|0e~?P~}Xd*ip5k@nI4IoDDi#);@%?@}q^Pt4#@# zuZ^t7IQi~%e1av@;X5=SqI+#~4KHGN7NsZ^cY0{xI@B{*Iq6GmNl?M)tc5LTVznW8 zrZf_Apr~6H4Il`!cG*n*`T@#Pd;ior42(7IY69CkuCYO9aMKYvU<6t%mt9G5wV}EH z!JtiRV9P)21FuxH==&+Y?7>4Hwr_>%CYh~Qt&LfdT!6U zt5rSJ`7GEaP$_pD@!Wl48waw}8@2BMDiqnZn7k_!=6Ca~DEj2ta0SxAb7-dHj-hId zkC$Sn7OrC7-AI8Gl!<3?acJxQR<@efBctDCnz`X5?P3USBaXTb5>!M6u24H879V8+ z|8Bwc4b#E2gijr+IZ@G|Ow#Ff+{#cRov%T^GJFkrpSnwzlsuP-GPz`cwmmcIo26rle7o$1_ z_%VkR$DGLgH>?F|3Yox&Bv|2D@&JYs>Qp9>g*V6g9|D0EiIwyr1fU8k7-tC_P3cjc zLM>{4Etym}*)gYB8XB^)cHkr!!Nplu2UkH5`~h)sby9SZ67Ne2En<9dc^~J! 
zbGYw5K&Y3QW_64Mnr@q^L|n{dSH-SZgdk82qeo_zF(*k$_^z*e1o(az<5~Xq{#?Ck z&SF47B%Wo4X%lY{PjA`==Y8S`E6OVIIq{fD4H7?cUGeyhbHQbSXGY9)YMwYkEEd{W zX=7G2HR37asH*9dFJwGcId5^+%2n3vlfN*W(^r0TpPFT|f9A{GP3qpPcZL!f~MU#c@7HfUaGjQFEN{W5;Qn z0KsSAN^kiqbztU`^jb@c9szyZz{Pb-Q}%$%9bn+ekWJZ@f>c6254@kzH)VnTTOhRN z&8>Nk(+40;vr66o2Zz9DfwI?q-re0kw}0<6=l25uA#!;(q=Ie$000JJOGiWi{{a60 z|De66lK=n!32;bRa{vG>`~Uzg`~j}R?3e%m00(qQO+^Re3IYlqAt?g6@c;k|+et)0 zRA}DqT6uI;)w%zD-#+);n>i#RQo!>X^OO=(CIC}Ibz7J7KL2C^Wxu)k=tXP4v(lR6_CF1jLJK7qNW|D1N{xc`f z9L7%nzOO#`(kd!j1Ix0$m-8_*MC6*z*Q!-3#KgpacI?=JzJ2>%^GjV#GBY!=Z{I!~ zJ9g}<`pChEn~@w8+%Du1?-_5QGuoBwtEijjigrzeqF)mciD(k`2RG zO%q!_5fOk83C37vugdp@So#Iq zYfl)k{I1`ygNHRgBPkm1Y~?CTo}UrZVgYL9ZjF>hS&u~Yjvhz`XOZIPYfn_e8RdGn z?wt3nzrVd}@9|Yf-aNPMbd2`vVPeZ10KUt1iJnHVqbPCi;D{Ruka43nx@CN?wosHr zTJp4CKP9&I8=;P>;~yW%J(R!C27ogqCo2F{060!eAG_yePUj$107Bn-kYXL7=34;9 zwKxVTmkI;0M}SU5lnKDGeNw5f0i197GA$?{w|<|lwgy3Ddn1t)J>V|LOl)xg8KI6m zHh6|I)z9oYtDtm@=QGh)cjj^Z<8Wd6H|MPyYb1cW28`HLPq!F=hzTi$+2iaQYdU~a zV$1qX53k#Gt?T`MKep$6ez4vrHR!yn_p1r676FiG0Vx{6PSxF4;Y|{ej^e+*w4!!< z6-3LD0EZnK(kZN0i!m|nvp0&ZkCKCS^$udXd~k!hwumSe+BXtpa5+2p zQzbZZXyN`o|B9advv$_B9o!@ha<&KtUHrdFSQmXKMN7G8DUBK0?an|ZT&u+Y@1XWg zGIXMf7QrB5f+NhFsQr45@2uD3Y8$*vv?KwzE=OcWI#UBz_|Naq>Ce}^!6q*cj9A*QpV&NcL&c3c_YEsnkAHO7?ws3WmJ#gOAhzDYrh2V@YHI58 zoP)5jUPO)o2qLCFq-Y?3peFg0rC%!^xw@c5R`UQ*I>vK2b!qk>?bm&j696@MVCn$_ zJqZ6a{M*{nQ;Y9kJ-x*@%r*{R{bkMZ>Q6+=O8}~_Aqtk-b+)@?RMGFRH#PXtF{nXy zJh#cmA^e*=XQa$@MFkB9AstLVh2W<8CfOl~Y+{=~+%Rg@Cj5Y6{7@U~YK_g0Z#WNN zHZjeCvwK5w zFp@t=k1rX$rkq@ies8$DzFR%Fc09(PY|bDiMf0MQ+vYYIbns1V@MlSHq-V7m8WTAF z2pRmD3NczYI$y0391}W{nHF??_%;=&C-%9%oVqk;T2Q-?qim`rZ18+XxVblVSx!O! zcLq2y8c29^;N7hg+a6L*wVD*YuR@Hnc7r=^aWBfcJwVljpbjBR`n;ClXc*{_(SYpc1vUrFHl5)01-fd_Sr94)&5lN*JA*jC4(2FEY2Pt*p6|v z33>=X6q{-i0I-wiA3c41r6SeT#K_>u5L6MCc~MiqT1T22sA3gi+;L`$Xi>&c|(vD9TbuUl2$ z{ST14PNr3gt(UXnZg=D*cLIR0t`Q?Z)Y+=>kA>i(5$t&SicyRm{44|~POLmKU)%ak zVjAdPklC-<9%n7Doee>qa;hQ`bP1f?-fLffQk*7QX4M?O_yGi`l&PMnrx&BC5j1sv zCO6F4|EwNsww5+OykVAD`gg=Msr!HR{N;DI-2i|Zjj_wPTZs&sB3d@@e|zUgqUAkO zG$v_odWVMK!bF7Rc^N%P(GU@-?J%^X{ijrXsfDq7|k!Nzj{99P5VlU8fPNzuKc^^xt*lpg7Dcb6<; z>H~op`}7@iA<_$gX@nY2&iv!dOJV-r-b2x0Y0t?3>WJttDQZ{WnA$^%ewOrpde?K? zPWuBlDjBo(Z|;Q|LzPo`0E7TQ(!7jYNYQ-|>}G=_0YqM^sSsnhjHk;U@cd`$(wteO z=-JTj;U4#*tXIU+7nDnl0#UmQ2P(&iUr!^a@|fY~UsDRRU(mJofuxW@sbtVJ2u@qo z{puGsSDjl)+>Y^;Ok4vsOyonNS6V-f_fi3CI|(J0Mbv&p?M zb0nK;#ig*So?>FFx}rLJ^F!-TTuN#PsSC5;C8qHSv(tX{&4%Ms+YIR7Bt=gsr&WNR!`SBCB7(JR*8%`0W=#0B z$MW9UmiAQ|wkm)n?{9u+{fR(5zdiNFi%|4&P)Lyfm0>gOF&QJDik%lf({!0>hB6#A zwY8t`9`pIGLeFBrGZlY$=MROa)2dbmwGGY+HA7mT_f{P|weH0F_a1!5s`X4kUS1x& zHcf}7eCJ`M1NN$Y>DcspA|oSW0YF)55Bq3XK!m+evE@wpxOOAGzxF(XX(#hev5)RS n9o98_@A5%~Lo%7D0s{O4_yF;NymWiy00000NkvXXu0mjfw|`wS diff --git a/doc/templates/index.html b/doc/templates/index.html index e17111fb48eef..8d3bdfaec2b28 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -252,7 +252,6 @@

Who uses scikit-learn?

-
From b9403f62ac65e7e6575168ef74b43fb012010599 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 29 Apr 2020 17:42:25 +0200 Subject: [PATCH 095/125] DOC Better headers in Poisson regression example (#17080) --- .../plot_poisson_regression_non_normal_loss.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3a24b55848013..59c07580b81ba 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -184,7 +184,7 @@ def score_estimator(estimator, df_test): score_estimator(dummy, df_test) ############################################################################## -# (Generalized) Linear models +# (Generalized) linear models # --------------------------- # # We start by modeling the target variable with the (l2 penalized) least @@ -217,6 +217,12 @@ def score_estimator(estimator, df_test): # regularization strength ``alpha`` to approximately 1e-6 over number of # samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty # term scales differently with the number of samples. +# +# Since the Poisson regressor internally models the log of the expected target +# value instead of the expected value directly (log vs identity link function), +# the relationship between X and y is not exactly linear anymore. Therefore the +# Poisson regressor is called a Generalized Linear Model (GLM) rather than a +# vanilla linear model as is the case for Ridge regression. from sklearn.linear_model import PoissonRegressor @@ -233,6 +239,9 @@ def score_estimator(estimator, df_test): score_estimator(poisson_glm, df_test) ############################################################################## +# Gradient Boosting Regression Trees for Poisson regression +# --------------------------------------------------------- +# # Finally, we will consider a non-linear model, namely Gradient Boosting # Regression Trees. 
Tree-based models do not require the categorical data to be # one-hot encoded: instead, we can encode each category label with an arbitrary From ee2508ce45fd7d491b25e354509cda26c14b16ec Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Apr 2020 18:31:58 -0400 Subject: [PATCH 096/125] ENH Adds HTML visualizations for estimators (#14180) --- doc/modules/classes.rst | 1 + doc/modules/compose.rst | 25 ++ doc/whats_new/v0.23.rst | 8 + .../plot_column_transformer_mixed_types.py | 9 + sklearn/_config.py | 19 +- sklearn/base.py | 13 + sklearn/compose/_column_transformer.py | 6 + sklearn/ensemble/_stacking.py | 27 ++ sklearn/ensemble/_voting.py | 5 + sklearn/pipeline.py | 20 ++ sklearn/tests/test_base.py | 14 + sklearn/tests/test_config.py | 9 +- sklearn/utils/__init__.py | 3 +- sklearn/utils/_estimator_html_repr.py | 311 ++++++++++++++++++ .../utils/tests/test_estimator_html_repr.py | 267 +++++++++++++++ 15 files changed, 732 insertions(+), 5 deletions(-) create mode 100644 sklearn/utils/_estimator_html_repr.py create mode 100644 sklearn/utils/tests/test_estimator_html_repr.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3d9924638b69b..2489eaf55bac7 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1569,6 +1569,7 @@ Plotting utils.deprecated utils.estimator_checks.check_estimator utils.estimator_checks.parametrize_with_checks + utils.estimator_html_repr utils.extmath.safe_sparse_dot utils.extmath.randomized_range_finder utils.extmath.randomized_svd diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index cd29b14b1f081..e7dac0dadc630 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -528,6 +528,31 @@ above example would be:: ('countvectorizer', CountVectorizer(), 'title')]) +.. _visualizing_composite_estimators: + +Visualizing Composite Estimators +================================ + +Estimators can be displayed with a HTML representation when shown in a +jupyter notebook. This can be useful to diagnose or visualize a Pipeline with +many estimators. This visualization is activated by setting the +`display` option in :func:`sklearn.set_config`:: + + >>> from sklearn import set_config + >>> set_config(display='diagram') # doctest: +SKIP + >>> # diplays HTML representation in a jupyter context + >>> column_trans # doctest: +SKIP + +An example of the HTML output can be seen in the +**HTML representation of Pipeline** section of +:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. +As an alternative, the HTML can be written to a file using +:func:`~sklearn.utils.estimator_html_repr`:: + + >>> from sklearn.utils import estimator_html_repr + >>> with open('my_estimator.html', 'w') as f: # doctest: +SKIP + ... f.write(estimator_html_repr(clf)) + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 0e149ed03a9fa..1ac63ca473faf 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -567,6 +567,9 @@ Changelog :mod:`sklearn.utils` .................... +- |Feature| Adds :func:`utils.estimator_html_repr` for returning a + HTML representation of an estimator. :pr:`14180` by `Thomas Fan`_. + - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. :pr:`15926` by :user:`Loïc Estève `. @@ -605,6 +608,11 @@ Changelog Miscellaneous ............. +- |MajorFeature| Adds a HTML representation of estimators to be shown in + a jupyter notebook or lab. 
This visualization is acitivated by setting the + `display` option in :func:`sklearn.set_config`. :pr:`14180` by + `Thomas Fan`_. + - |Enhancement| ``scikit-learn`` now works with ``mypy`` without errors. :pr:`16726` by `Roman Yurchak`_. diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1c79c4bb1d607..24fc4d69e35d0 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -87,6 +87,15 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) +############################################################################## +# HTML representation of ``Pipeline`` +############################################################################### +# When the ``Pipeline`` is printed out in a jupyter notebook an HTML +# representation of the estimator is displayed as follows: +from sklearn import set_config +set_config(display='diagram') +clf + ############################################################################### # Use ``ColumnTransformer`` by selecting column by data types ############################################################################### diff --git a/sklearn/_config.py b/sklearn/_config.py index 44eaae1d59012..f183203e13228 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -7,6 +7,7 @@ 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)), 'print_changed_only': True, + 'display': 'text', } @@ -27,7 +28,7 @@ def get_config(): def set_config(assume_finite=None, working_memory=None, - print_changed_only=None): + print_changed_only=None, display=None): """Set global scikit-learn configuration .. versionadded:: 0.19 @@ -59,6 +60,13 @@ def set_config(assume_finite=None, working_memory=None, .. versionadded:: 0.21 + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as text in a jupyter lab + of notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 + See Also -------- config_context: Context manager for global scikit-learn configuration @@ -70,6 +78,8 @@ def set_config(assume_finite=None, working_memory=None, _global_config['working_memory'] = working_memory if print_changed_only is not None: _global_config['print_changed_only'] = print_changed_only + if display is not None: + _global_config['display'] = display @contextmanager @@ -100,6 +110,13 @@ def config_context(**new_config): .. versionchanged:: 0.23 Default changed from False to True. + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as text in a jupyter lab + of notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 + Notes ----- All settings, not just those presently modified, will be returned to diff --git a/sklearn/base.py b/sklearn/base.py index bf5ee370aa8f1..666574b491594 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -17,9 +17,11 @@ import numpy as np from . 
import __version__ +from ._config import get_config from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args _DEFAULT_TAGS = { @@ -435,6 +437,17 @@ def _validate_data(self, X, y=None, reset=True, return out + def _repr_html_(self): + """HTML representation of estimator""" + return estimator_html_repr(self) + + def _repr_mimebundle_(self, **kwargs): + """Mime bundle used by jupyter kernels to display estimator""" + output = {"text/plain": repr(self)} + if get_config()["display"] == 'diagram': + output["text/html"] = estimator_html_repr(self) + return output + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 2ef8876b0c4e7..f148633021a97 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -15,6 +15,7 @@ from joblib import Parallel, delayed from ..base import clone, TransformerMixin +from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer from ..utils import Bunch @@ -637,6 +638,11 @@ def _hstack(self, Xs): Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] return np.hstack(Xs) + def _sk_visual_block_(self): + names, transformers, name_details = zip(*self.transformers) + return _VisualBlock('parallel', transformers, + names=names, name_details=name_details) + def _check_X(X): """Use check_array only on lists and other non-array-likes / sparse""" diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index a75e9236f1612..73aa55c0575a7 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..utils._estimator_html_repr import _VisualBlock from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble @@ -233,6 +234,14 @@ def predict(self, X, **predict_params): self.transform(X), **predict_params ) + def _sk_visual_block_(self, final_estimator): + names, estimators = zip(*self.estimators) + parallel = _VisualBlock('parallel', estimators, names=names, + dash_wrapped=False) + serial = _VisualBlock('serial', (parallel, final_estimator), + dash_wrapped=False) + return _VisualBlock('serial', [serial]) + class StackingClassifier(ClassifierMixin, _BaseStacking): """Stack of estimators with a final classifier. @@ -496,6 +505,15 @@ def transform(self, X): """ return self._transform(X) + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = LogisticRegression() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) + class StackingRegressor(RegressorMixin, _BaseStacking): """Stack of estimators with a final regressor. @@ -665,3 +683,12 @@ def transform(self, X): Prediction outputs for each estimator. """ return self._transform(X) + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. 
+ if self.final_estimator is None: + final_estimator = RidgeCV() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 0ac42407f5998..6a2b5736d8b4e 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -32,6 +32,7 @@ from ..utils.validation import column_or_1d from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError +from ..utils._estimator_html_repr import _VisualBlock class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @@ -104,6 +105,10 @@ def n_features_in_(self): return self.estimators_[0].n_features_in_ + def _sk_visual_block_(self): + names, estimators = zip(*self.estimators) + return _VisualBlock('parallel', estimators, names=names) + class VotingClassifier(ClassifierMixin, _BaseVoting): """Soft Voting/Majority Rule classifier for unfitted estimators. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8e2a539786557..6f02cb565e15c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -18,6 +18,7 @@ from joblib import Parallel, delayed from .base import clone, TransformerMixin +from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import if_delegate_has_method from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory @@ -623,6 +624,21 @@ def n_features_in_(self): # delegate to first step (which will call _check_is_fitted) return self.steps[0][1].n_features_in_ + def _sk_visual_block_(self): + _, estimators = zip(*self.steps) + + def _get_name(name, est): + if est is None or est == 'passthrough': + return f'{name}: passthrough' + # Is an estimator + return f'{name}: {est.__class__.__name__}' + names = [_get_name(name, est) for name, est in self.steps] + name_details = [str(est) for est in estimators] + return _VisualBlock('serial', estimators, + names=names, + name_details=name_details, + dash_wrapped=False) + def _name_estimators(estimators): """Generate names for estimators.""" @@ -1004,6 +1020,10 @@ def n_features_in_(self): # X is passed to all transformers so we just delegate to the first one return self.transformer_list[0][1].n_features_in_ + def _sk_visual_block_(self): + names, transformers = zip(*self.transformer_list) + return _VisualBlock('parallel', transformers, names=names) + def make_union(*transformers, **kwargs): """ diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 52f2e60b4af70..e20fa440d1933 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -23,6 +23,7 @@ from sklearn.base import TransformerMixin from sklearn.utils._mocking import MockDataFrame +from sklearn import config_context import pickle @@ -511,3 +512,16 @@ def fit(self, X, y=None): params = est.get_params() assert params['param'] is None + + +def test_repr_mimebundle_(): + # Checks the display configuration flag controls the json output + tree = DecisionTreeClassifier() + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" not in output + + with config_context(display='diagram'): + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" in output diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index ae13c61838694..eec349861258c 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -4,7 +4,8 @@ def test_config_context(): assert get_config() == {'assume_finite': False, 
'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} # Not using as a context manager affects nothing config_context(assume_finite=True) @@ -12,7 +13,8 @@ def test_config_context(): with config_context(assume_finite=True): assert get_config() == {'assume_finite': True, 'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} assert get_config()['assume_finite'] is False with config_context(assume_finite=True): @@ -37,7 +39,8 @@ def test_config_context(): assert get_config()['assume_finite'] is True assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} # No positional arguments assert_raises(TypeError, config_context, True) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index afde7614070fd..f814ea11c12c1 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -25,6 +25,7 @@ from ..exceptions import DataConversionWarning from .deprecation import deprecated from .fixes import np_version +from ._estimator_html_repr import estimator_html_repr from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, @@ -52,7 +53,7 @@ "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", - "DataConversionWarning" + "DataConversionWarning", "estimator_html_repr" ] IS_PYPY = platform.python_implementation() == 'PyPy' diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py new file mode 100644 index 0000000000000..9b2e45790fd2b --- /dev/null +++ b/sklearn/utils/_estimator_html_repr.py @@ -0,0 +1,311 @@ +from contextlib import closing +from contextlib import suppress +from io import StringIO +import uuid +import html + +from sklearn import config_context + + +class _VisualBlock: + """HTML Representation of Estimator + + Parameters + ---------- + kind : {'serial', 'parallel', 'single'} + kind of HTML block + + estimators : list of estimators or `_VisualBlock`s or a single estimator + If kind != 'single', then `estimators` is a list of + estimators. + If kind == 'single', then `estimators` is a single estimator. + + names : list of str + If kind != 'single', then `names` corresponds to estimators. + If kind == 'single', then `names` is a single string corresponding to + the single estimator. + + name_details : list of str, str, or None, default=None + If kind != 'single', then `name_details` corresponds to `names`. + If kind == 'single', then `name_details` is a single string + corresponding to the single estimator. + + dash_wrapped : bool, default=True + If true, wrapped HTML element will be wrapped with a dashed border. + Only active when kind != 'single'. 
+ """ + def __init__(self, kind, estimators, *, names=None, name_details=None, + dash_wrapped=True): + self.kind = kind + self.estimators = estimators + self.dash_wrapped = dash_wrapped + + if self.kind in ('parallel', 'serial'): + if names is None: + names = (None, ) * len(estimators) + if name_details is None: + name_details = (None, ) * len(estimators) + + self.names = names + self.name_details = name_details + + def _sk_visual_block_(self): + return self + + +def _write_label_html(out, name, name_details, + outer_class="sk-label-container", + inner_class="sk-label", + checked=False): + """Write labeled html with or without a dropdown with named details""" + out.write(f'
' + f'
') + name = html.escape(name) + + if name_details is not None: + checked_str = 'checked' if checked else '' + est_id = uuid.uuid4() + out.write(f'' + f'' + f'
{name_details}'
+                  f'
') + else: + out.write(f'') + out.write('
') # outer_class inner_class + + +def _get_visual_block(estimator): + """Generate information about how to display an estimator. + """ + with suppress(AttributeError): + return estimator._sk_visual_block_() + + if isinstance(estimator, str): + return _VisualBlock('single', estimator, + names=estimator, name_details=estimator) + elif estimator is None: + return _VisualBlock('single', estimator, + names='None', name_details='None') + + # check if estimator looks like a meta estimator wraps estimators + if hasattr(estimator, 'get_params'): + estimators = [] + for key, value in estimator.get_params().items(): + # Only look at the estimators in the first layer + if '__' not in key and hasattr(value, 'get_params'): + estimators.append(value) + if len(estimators): + return _VisualBlock('parallel', estimators, names=None) + + return _VisualBlock('single', estimator, + names=estimator.__class__.__name__, + name_details=str(estimator)) + + +def _write_estimator_html(out, estimator, estimator_label, + estimator_label_details, first_call=False): + """Write estimator to html in serial, parallel, or by itself (single). + """ + if first_call: + est_block = _get_visual_block(estimator) + else: + with config_context(print_changed_only=True): + est_block = _get_visual_block(estimator) + + if est_block.kind in ('serial', 'parallel'): + dashed_wrapped = first_call or est_block.dash_wrapped + dash_cls = " sk-dashed-wrapped" if dashed_wrapped else "" + out.write(f'
') + + if estimator_label: + _write_label_html(out, estimator_label, estimator_label_details) + + kind = est_block.kind + out.write(f'
') + est_infos = zip(est_block.estimators, est_block.names, + est_block.name_details) + + for est, name, name_details in est_infos: + if kind == 'serial': + _write_estimator_html(out, est, name, name_details) + else: # parallel + out.write('
') + # wrap element in a serial visualblock + serial_block = _VisualBlock('serial', [est], + dash_wrapped=False) + _write_estimator_html(out, serial_block, name, name_details) + out.write('
') # sk-parallel-item + + out.write('
') + elif est_block.kind == 'single': + _write_label_html(out, est_block.names, est_block.name_details, + outer_class="sk-item", inner_class="sk-estimator", + checked=first_call) + + +_STYLE = """ +div.sk-top-container { + color: black; + background-color: white; +} +div.sk-toggleable { + background-color: white; +} +label.sk-toggleable__label { + cursor: pointer; + display: block; + width: 100%; + margin-bottom: 0; + padding: 0.2em 0.3em; + box-sizing: border-box; + text-align: center; +} +div.sk-toggleable__content { + max-height: 0; + max-width: 0; + overflow: hidden; + text-align: left; + background-color: #f0f8ff; +} +div.sk-toggleable__content pre { + margin: 0.2em; + color: black; + border-radius: 0.25em; + background-color: #f0f8ff; +} +input.sk-toggleable__control:checked~div.sk-toggleable__content { + max-height: 200px; + max-width: 100%; + overflow: auto; +} +div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { + background-color: #d4ebff; +} +div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { + background-color: #d4ebff; +} +input.sk-hidden--visually { + border: 0; + clip: rect(1px 1px 1px 1px); + clip: rect(1px, 1px, 1px, 1px); + height: 1px; + margin: -1px; + overflow: hidden; + padding: 0; + position: absolute; + width: 1px; +} +div.sk-estimator { + font-family: monospace; + background-color: #f0f8ff; + margin: 0.25em 0.25em; + border: 1px dotted black; + border-radius: 0.25em; + box-sizing: border-box; +} +div.sk-estimator:hover { + background-color: #d4ebff; +} +div.sk-parallel-item::after { + content: ""; + width: 100%; + border-bottom: 1px solid gray; + flex-grow: 1; +} +div.sk-label:hover label.sk-toggleable__label { + background-color: #d4ebff; +} +div.sk-serial::before { + content: ""; + position: absolute; + border-left: 1px solid gray; + box-sizing: border-box; + top: 2em; + bottom: 0; + left: 50%; +} +div.sk-serial { + display: flex; + flex-direction: column; + align-items: center; + background-color: white; +} +div.sk-item { + z-index: 1; +} +div.sk-parallel { + display: flex; + align-items: stretch; + justify-content: center; + background-color: white; +} +div.sk-parallel-item { + display: flex; + flex-direction: column; + position: relative; + background-color: white; +} +div.sk-parallel-item:first-child::after { + align-self: flex-end; + width: 50%; +} +div.sk-parallel-item:last-child::after { + align-self: flex-start; + width: 50%; +} +div.sk-parallel-item:only-child::after { + width: 0; +} +div.sk-dashed-wrapped { + border: 1px dashed gray; + margin: 0.2em; + box-sizing: border-box; + padding-bottom: 0.1em; + background-color: white; + position: relative; +} +div.sk-label label { + font-family: monospace; + font-weight: bold; + background-color: white; + display: inline-block; + line-height: 1.2em; +} +div.sk-label-container { + position: relative; + z-index: 2; + text-align: center; +} +div.sk-container { + display: inline-block; + position: relative; +} +""".replace(' ', '').replace('\n', '') # noqa + + +def estimator_html_repr(estimator): + """Build a HTML representation of an estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + The estimator to visualize. + + Returns + ------- + html: str + HTML representation of estimator. + """ + with closing(StringIO()) as out: + out.write(f'' + f'
') + _write_estimator_html(out, estimator, estimator.__class__.__name__, + str(estimator), first_call=True) + out.write('
') + + html_output = out.getvalue() + return html_output diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py new file mode 100644 index 0000000000000..47d33051bd9a7 --- /dev/null +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -0,0 +1,267 @@ +from contextlib import closing +from io import StringIO + +import pytest + +from sklearn import config_context +from sklearn.linear_model import LogisticRegression +from sklearn.neural_network import MLPClassifier +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.decomposition import TruncatedSVD +from sklearn.pipeline import Pipeline +from sklearn.pipeline import FeatureUnion +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import VotingClassifier +from sklearn.feature_selection import SelectPercentile +from sklearn.cluster import Birch +from sklearn.cluster import AgglomerativeClustering +from sklearn.preprocessing import OneHotEncoder +from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVR +from sklearn.tree import DecisionTreeClassifier +from sklearn.multiclass import OneVsOneClassifier +from sklearn.ensemble import StackingClassifier +from sklearn.ensemble import StackingRegressor +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RationalQuadratic +from sklearn.utils._estimator_html_repr import _write_label_html +from sklearn.utils._estimator_html_repr import _get_visual_block +from sklearn.utils._estimator_html_repr import estimator_html_repr + + +@pytest.mark.parametrize("checked", [True, False]) +def test_write_label_html(checked): + # Test checking logic and labeling + name = "LogisticRegression" + tool_tip = "hello-world" + + with closing(StringIO()) as out: + _write_label_html(out, name, tool_tip, checked=checked) + html_label = out.getvalue() + assert 'LogisticRegression' in html_label + assert html_label.startswith('
') + assert '
hello-world
' in html_label + if checked: + assert 'checked>' in html_label + + +@pytest.mark.parametrize('est', ['passthrough', 'drop', None]) +def test_get_visual_block_single_str_none(est): + # Test estimators that are represnted by strings + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == str(est) + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_single_estimator(): + est = LogisticRegression(C=10.0) + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == est.__class__.__name__ + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_pipeline(): + pipe = Pipeline([ + ('imputer', SimpleImputer()), + ('do_nothing', 'passthrough'), + ('do_nothing_more', None), + ('classifier', LogisticRegression()) + ]) + est_html_info = _get_visual_block(pipe) + assert est_html_info.kind == 'serial' + assert est_html_info.estimators == tuple(step[1] for step in pipe.steps) + assert est_html_info.names == ['imputer: SimpleImputer', + 'do_nothing: passthrough', + 'do_nothing_more: passthrough', + 'classifier: LogisticRegression'] + assert est_html_info.name_details == [str(est) for _, est in pipe.steps] + + +def test_get_visual_block_feature_union(): + f_union = FeatureUnion([ + ('pca', PCA()), ('svd', TruncatedSVD()) + ]) + est_html_info = _get_visual_block(f_union) + assert est_html_info.kind == 'parallel' + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.estimators == tuple( + trans[1] for trans in f_union.transformer_list) + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_voting(): + clf = VotingClassifier([ + ('log_reg', LogisticRegression()), + ('mlp', MLPClassifier()) + ]) + est_html_info = _get_visual_block(clf) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple(trans[1] + for trans in clf.estimators) + assert est_html_info.names == ('log_reg', 'mlp') + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_column_transformer(): + ct = ColumnTransformer([ + ('pca', PCA(), ['num1', 'num2']), + ('svd', TruncatedSVD, [0, 3]) + ]) + est_html_info = _get_visual_block(ct) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple( + trans[1] for trans in ct.transformers) + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.name_details == (['num1', 'num2'], [0, 3]) + + +def test_estimator_html_repr_pipeline(): + num_trans = Pipeline(steps=[ + ('pass', 'passthrough'), + ('imputer', SimpleImputer(strategy='median')) + ]) + + cat_trans = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', + missing_values='empty')), + ('one-hot', OneHotEncoder(drop='first')) + ]) + + preprocess = ColumnTransformer([ + ('num', num_trans, ['a', 'b', 'c', 'd', 'e']), + ('cat', cat_trans, [0, 1, 2, 3]) + ]) + + feat_u = FeatureUnion([ + ('pca', PCA(n_components=1)), + ('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)), + ('select', SelectPercentile())])) + ]) + + clf = VotingClassifier([ + ('lr', LogisticRegression(solver='lbfgs', random_state=1)), + ('mlp', MLPClassifier(alpha=0.001)) + ]) + + pipe = Pipeline([ + ('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf) + ]) + html_output = estimator_html_repr(pipe) + + # top level estimators show estimator with changes + assert str(pipe) in html_output 
+ for _, est in pipe.steps: + assert (f"
" + f"
{str(est)}") in html_output
+
+    # low level estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert str(num_trans['pass']) in html_output
+        assert 'passthrough' in html_output
+        assert str(num_trans['imputer']) in html_output
+
+        for _, _, cols in preprocess.transformers:
+            assert f"
{cols}
" in html_output + + # feature union + for name, _ in feat_u.transformer_list: + assert f"" in html_output + + pca = feat_u.transformer_list[0][1] + assert f"
{str(pca)}
" in html_output + + tsvd = feat_u.transformer_list[1][1] + first = tsvd['first'] + select = tsvd['select'] + assert f"
{str(first)}
" in html_output + assert f"
{str(select)}
" in html_output + + # voting classifer + for name, est in clf.estimators: + assert f"" in html_output + assert f"
{str(est)}
" in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) +def test_stacking_classsifer(final_estimator): + estimators = [('mlp', MLPClassifier(alpha=0.001)), + ('tree', DecisionTreeClassifier())] + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator) + + html_output = estimator_html_repr(clf) + + assert str(clf) in html_output + # If final_estimator's default changes from LogisticRegression + # this should be updated + if final_estimator is None: + assert "LogisticRegression(" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVR()]) +def test_stacking_regressor(final_estimator): + reg = StackingRegressor( + estimators=[('svr', LinearSVR())], final_estimator=final_estimator) + html_output = estimator_html_repr(reg) + + assert str(reg.estimators[0][0]) in html_output + assert "LinearSVR" in html_output + if final_estimator is None: + assert "RidgeCV" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +def test_birch_duck_typing_meta(): + # Test duck typing meta estimators with Birch + birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3)) + html_output = estimator_html_repr(birch) + + # inner estimators do not show changes + with config_context(print_changed_only=True): + assert f"
{str(birch.n_clusters)}" in html_output
+        assert "AgglomerativeClustering" in html_output
+
+    # outer estimator contains all changes
+    assert f"<pre>{str(birch)}" in html_output
+
+
+def test_ovo_classifier_duck_typing_meta():
+    # Test duck typing metaestimators with OVO
+    ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
+    html_output = estimator_html_repr(ovo)
+
+    # inner estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert f"<pre>{str(ovo.estimator)}" in html_output
+        assert "LinearSVC" in html_output
+
+    # outer estimator
+    assert f"<pre>{str(ovo)}" in html_output
+
+
+def test_duck_typing_nested_estimator():
+    # Test duck typing metaestimators with GP
+    kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
+    gp = GaussianProcessRegressor(kernel=kernel)
+    html_output = estimator_html_repr(gp)
+
+    assert f"<pre>{str(kernel)}" in html_output
+    assert f"<pre>{str(gp)}" in html_output
+
+
+@pytest.mark.parametrize('print_changed_only', [True, False])
+def test_one_estimator_print_change_only(print_changed_only):
+    pca = PCA(n_components=10)
+
+    with config_context(print_changed_only=print_changed_only):
+        pca_repr = str(pca)
+        html_output = estimator_html_repr(pca)
+        assert pca_repr in html_output
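
To make the duck-typing hook exercised above concrete, here is a small
illustrative sketch (not part of the patch) of a third-party meta-estimator
opting into the new HTML display by defining ``_sk_visual_block_``. The
``TwoStep`` wrapper and its parameter names are invented for this example, and
``_VisualBlock`` is private API that may change::

    from sklearn.base import BaseEstimator
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import estimator_html_repr
    from sklearn.utils._estimator_html_repr import _VisualBlock


    class TwoStep(BaseEstimator):
        """Toy wrapper around a scaler followed by a classifier."""

        def __init__(self, scaler=None, clf=None):
            self.scaler = scaler
            self.clf = clf

        def _sk_visual_block_(self):
            # Render the two wrapped estimators one after the other in the
            # diagram, in the same way Pipeline does above.
            return _VisualBlock('serial', [self.scaler, self.clf],
                                names=['scaler', 'clf'])


    est = TwoStep(scaler=StandardScaler(), clf=LogisticRegression())
    html = estimator_html_repr(est)  # raw HTML string, e.g. to embed in a report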

From 3a6c8c4b6ce17b84b01d47b66ead1797c04931bc Mon Sep 17 00:00:00 2001
From: Lucy Liu 
Date: Thu, 30 Apr 2020 16:59:44 +0200
Subject: [PATCH 097/125] DOC Remove unnecessary comment (#17091)

---
 .../miscellaneous/plot_partial_dependence_visualization_api.py   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
index 761dad8b1e1fa..cbfa2c5e8ab64 100644
--- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py
+++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
@@ -98,7 +98,6 @@
 # which will plot the partial dependence curves of each model on the same axes.
 # The length of the axes list must be equal to the number of plots drawn.
 
-# Sets this image as the thumbnail for sphinx gallery
 # sphinx_gallery_thumbnail_number = 4
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
 tree_disp.plot(ax=[ax1, ax2], line_kw={"label": "Decision Tree"})

From 0c0b834d1146257556ce934b38df1b7fe2ce9ef6 Mon Sep 17 00:00:00 2001
From: Bharat Raghunathan 
Date: Fri, 1 May 2020 18:02:01 +0530
Subject: [PATCH 098/125] DOC detail fit_intercept docstring (#17096)

---
 sklearn/linear_model/_ridge.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 309137bed2b5d..ca3fba196d6d3 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -628,9 +628,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge):
         number.
 
     fit_intercept : bool, default=True
-        Whether to calculate the intercept for this model. If set
+        Whether to fit the intercept for this model. If set
         to false, no intercept will be used in calculations
-        (i.e. data is expected to be centered).
+        (i.e. ``X`` and ``y`` are expected to be centered).
 
     normalize : bool, default=False
         This parameter is ignored when ``fit_intercept`` is set to False.
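
As a brief illustration (not from the patch) of what "expected to be centered"
means in practice: with pre-centered ``X`` and ``y``, ``fit_intercept=False``
should recover the same coefficients as the default, up to numerical
round-off::

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([1., -2., .5]) + 3. + .01 * rng.randn(50)

    ridge = Ridge(alpha=1.).fit(X, y)
    X_c, y_c = X - X.mean(axis=0), y - y.mean()
    ridge_centered = Ridge(alpha=1., fit_intercept=False).fit(X_c, y_c)

    print(np.allclose(ridge.coef_, ridge_centered.coef_))  # True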

From 863c1d683f888931f96d968faaf5b60c9c821723 Mon Sep 17 00:00:00 2001
From: Nicolas Hug 
Date: Fri, 1 May 2020 11:06:00 -0400
Subject: [PATCH 099/125] DOC Feature highlights for 0.23 (#17062)

---
 doc/modules/ensemble.rst                      |   2 +
 doc/whats_new/v0.23.rst                       |  37 ++--
 .../plot_release_highlights_0_23_0.py         | 165 ++++++++++++++++++
 3 files changed, 188 insertions(+), 16 deletions(-)
 create mode 100644 examples/release_highlights/plot_release_highlights_0_23_0.py

diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 3cf8987fcfd5a..434cf146c2d4e 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -1018,6 +1018,8 @@ If no missing values were encountered for a given feature during training,
 then samples with missing values are mapped to whichever child has the most
 samples.
 
+.. _sw_hgbdt:
+
 Sample weight support
 ---------------------
 
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 1ac63ca473faf..fba75d62cc380 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -9,6 +9,10 @@ Version 0.23.0
 
 **In Development**
 
+For a short description of the main highlights of the release, please
+refer to
+:ref:`sphx_glr_auto_examples_release_highlights_plot_release_highlights_0_23_0.py`.
+
 
 .. include:: changelog_legend.inc
 
@@ -103,9 +107,9 @@ Changelog
   :pr:`16149` by :user:`Jeremie du Boisberranger ` and
   :user:`Alex Shacked `.
 
-- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more
-  optimized implementation. Parallelism is now over the data instead of over
-  initializations allowing better scalability. :pr:`11950` by
+- |Efficiency| |MajorFeature| The critical parts of :class:`cluster.KMeans`
+  have a more optimized implementation. Parallelism is now over the data
+  instead of over initializations allowing better scalability. :pr:`11950` by
   :user:`Jeremie du Boisberranger `.
 
 - |Enhancement| :class:`cluster.KMeans` now supports sparse data when
@@ -124,6 +128,10 @@ Changelog
   could not have a `np.int64` type. :pr:`16484`
   by :user:`Jeremie du Boisberranger `.
 
+- |Fix| :class:`cluster.AgglomerativeClustering` now raises a specific error
+  when the distance matrix is not square and `affinity=precomputed`.
+  :pr:`16257` by :user:`Simona Maggio `.
+
 - |API| The ``n_jobs`` parameter of :class:`cluster.KMeans`,
   :class:`cluster.SpectralCoclustering` and
   :class:`cluster.SpectralBiclustering` is deprecated. They now use OpenMP
@@ -234,7 +242,7 @@ Changelog
   samples in the training set. :pr:`14516` by :user:`Johann Faouzi
   `.
 
-- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
   :class:`ensemble.HistGradientBoostingRegressor` now support monotonic
   constraints, useful when features are supposed to have a positive/negative
   effect on the target. :pr:`15582` by `Nicolas Hug`_.
@@ -340,9 +348,10 @@ Changelog
   :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_,
   and `Olivier Grisel`_.
 
-- |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and
-  :class:`linear_model.Lasso` for dense feature matrix `X`.
-  :pr:`15436` by :user:`Christian Lorentzen `.
+- |MajorFeature| Support of `sample_weight` in
+  :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso` for dense
+  feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen
+  `.
 
 - |Efficiency| :class:`linear_model.RidgeCV` and
   :class:`linear_model.RidgeClassifierCV` now does not allocate a
@@ -567,8 +576,11 @@ Changelog
 :mod:`sklearn.utils`
 ....................
 
-- |Feature| Adds :func:`utils.estimator_html_repr` for returning a
-  HTML representation of an estimator. :pr:`14180` by `Thomas Fan`_.
+- |MajorFeature| Estimators can now be displayed with a rich html
+  representation. This can be enabled in Jupyter notebooks by setting
+  `display='diagram'` in :func:`~sklearn.set_config`. The raw html can be
+  returned by using :func:`utils.estimator_html_repr`.
+  :pr:`14180` by `Thomas Fan`_.
 
 - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`.
   :pr:`15926` by :user:`Loïc Estève `.
@@ -598,13 +610,6 @@ Changelog
 - |FIX| :func:`utils.all_estimators` now only returns public estimators.
   :pr:`15380` by `Thomas Fan`_.
 
-:mod:`sklearn.cluster`
-......................
-
-- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when
-  distance matrix is not square and `affinity=precomputed`.
-  :pr:`16257` by :user:`Simona Maggio `.
-
 Miscellaneous
 .............
 
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
new file mode 100644
index 0000000000000..644e0f7747d39
--- /dev/null
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -0,0 +1,165 @@
+# flake8: noqa
+"""
+========================================
+Release Highlights for scikit-learn 0.23
+========================================
+
+.. currentmodule:: sklearn
+
+We are pleased to announce the release of scikit-learn 0.23! Many bug fixes
+and improvements were added, as well as some new key features. We detail
+below a few of the major features of this release. **For an exhaustive list of
+all the changes**, please refer to the :ref:`release notes `.
+
+To install the latest version (with pip)::
+
+    pip install --upgrade scikit-learn
+
+or with conda::
+
+    conda install scikit-learn
+"""
+
+##############################################################################
+# Generalized Linear Models, and Poisson loss for gradient boosting
+# -----------------------------------------------------------------
+# Long-awaited Generalized Linear Models with non-normal loss functions are now
+# available. In particular, three new regressors were implemented:
+# :class:`~sklearn.linear_model.PoissonRegressor`,
+# :class:`~sklearn.linear_model.GammaRegressor`, and
+# :class:`~sklearn.linear_model.TweedieRegressor`. The Poisson regressor can be
+# used to model positive integer counts, or relative frequencies. Read more in
+# the :ref:`User Guide `. Additionally,
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` supports a new
+# 'poisson' loss as well.
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import PoissonRegressor
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+n_samples, n_features = 1000, 20
+rng = np.random.RandomState(0)
+X = rng.randn(n_samples, n_features)
+# positive integer target correlated with X[:, 5] with many zeros:
+y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+glm = PoissonRegressor()
+gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
+glm.fit(X_train, y_train)
+gbdt.fit(X_train, y_train)
+print(glm.score(X_test, y_test))
+print(gbdt.score(X_test, y_test))
+
+##############################################################################
+# Rich HTML representation for estimators
+# ---------------------------------------
+# Estimators can now be rendered in html in notebooks by enabling the
+# `display='diagram'` option. This is particularly useful to visualize
+# pipelines and composite estimators. Click on the entries to expand and see
+# details.
+from sklearn import set_config
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.compose import make_column_transformer
+from sklearn.linear_model import LogisticRegression
+set_config(display='diagram')
+
+num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
+
+cat_proc = make_pipeline(
+    SimpleImputer(strategy='constant', fill_value='missing'),
+    OneHotEncoder(handle_unknown='ignore'))
+
+preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
+                                       (cat_proc, ('feat0', 'feat2')))
+
+clf = make_pipeline(preprocessor, LogisticRegression())
+clf
+
+##############################################################################
+# Scalability and stability improvements to KMeans
+# ------------------------------------------------
+# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
+# is now significantly faster and more stable. In addition, the Elkan algorithm
+# is now compatible with sparse matrices. The estimator uses OpenMP based
+# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
+# effect anymore. For more details on how to control the number of threads,
+# please refer to our :ref:`parallelism` notes.
+import scipy
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+from sklearn.metrics import completeness_score
+
+rng = np.random.RandomState(0)
+X, y = make_blobs(random_state=rng)
+X = scipy.sparse.csr_matrix(X)
+X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
+kmeans = KMeans(algorithm='elkan').fit(X_train)
+print(completeness_score(kmeans.predict(X_test), y_test))
+
+##############################################################################
+# Improvements to the histogram-based Gradient Boosting estimators
+# ----------------------------------------------------------------
+# Various improvements were made to
+# :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the
+# Poisson loss mentioned above, these estimators now support :ref:`sample
+# weights `. Also, an automatic early-stopping criterion was added:
+# early-stopping is enabled by default when the number of samples exceeds 10k.
+# Finally, users can now define :ref:`monotonic constraints
+# ` to constrain the predictions based on the variations of
+# specific features. In the following example, we construct a target that is
+# generally positively correlated with the first feature, with some noise.
+# Applying monotonic constraints allows the prediction to capture the global
+# effect of the first feature, instead of fitting the noise.
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.inspection import plot_partial_dependence
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+n_samples = 500
+rng = np.random.RandomState(0)
+X = rng.randn(n_samples, 2)
+noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
+y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise)
+
+gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)
+
+disp = plot_partial_dependence(
+    gbdt_no_cst, X, features=[0], feature_names=['feature 0'],
+    line_kw={'linewidth': 4, 'label': 'unconstrained'})
+plot_partial_dependence(gbdt_cst, X, features=[0],
+    line_kw={'linewidth': 4, 'label': 'constrained'}, ax=disp.axes_)
+disp.axes_[0, 0].plot(X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples')
+disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1)
+plt.legend()
+plt.show()
+
+##############################################################################
+# Sample-weight support for Lasso and ElasticNet
+# ----------------------------------------------
+# The two linear regressors :class:`~sklearn.linear_model.Lasso` and
+# :class:`~sklearn.linear_model.ElasticNet` now support sample weights.
+
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import make_regression
+from sklearn.linear_model import Lasso
+import numpy as np
+
+n_samples, n_features = 1000, 20
+rng = np.random.RandomState(0)
+X, y = make_regression(n_samples, n_features, random_state=rng)
+sample_weight = rng.rand(n_samples)
+X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
+    X, y, sample_weight, random_state=rng)
+reg = Lasso()
+reg.fit(X_train, y_train, sample_weight=sw_train)
+print(reg.score(X_test, y_test, sw_test))
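
A side note on the Poisson example earlier in this file: the two ``score``
values it prints are not directly comparable, since ``PoissonRegressor.score``
reports D^2 (fraction of deviance explained) while
``HistGradientBoostingRegressor.score`` reports R^2. A self-contained sketch
(not part of the patch) comparing both models with a single metric, the mean
Poisson deviance::

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import PoissonRegressor
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.metrics import mean_poisson_deviance

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 20)
    y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    glm = PoissonRegressor().fit(X_train, y_train)
    gbdt = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X_train, y_train)
    print(mean_poisson_deviance(y_test, glm.predict(X_test)))
    print(mean_poisson_deviance(y_test, gbdt.predict(X_test)))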

From c71a1c21d14fc7a98493acb1f3d315db720ca4ac Mon Sep 17 00:00:00 2001
From: Gael Varoquaux 
Date: Fri, 1 May 2020 18:27:05 +0200
Subject: [PATCH 100/125] MISC fix rst syntax (#17098)

* MISC: fix rst syntax

* DOC Adds full link

Co-authored-by: Thomas J Fan 
---
 doc/whats_new/v0.23.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index fba75d62cc380..e8a1566349f06 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -445,9 +445,9 @@ Changelog
   type and details.
   :pr:`15622` by :user:`Gregory Morse `.
 
-- |Fix| :func: `cross_val_predict` supports `method="predict_proba"`
-  when `y=None`.
-  :pr:`15918` by :user:`Luca Kubin `.
+- |Fix| :func:`model_selection.cross_val_predict` supports
+  `method="predict_proba"` when `y=None`. :pr:`15918` by
+  :user:`Luca Kubin `.
 
 - |Fix| :func:`model_selection.fit_grid_point` is deprecated in 0.23 and will
   be removed in 0.25. :pr:`16401` by
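
For reference, a minimal sketch (not from the patch) of the
``method="predict_proba"`` usage that the entry above refers to; the fix
itself concerns the ``y=None`` code path::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    X, y = load_iris(return_X_y=True)
    proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                              method="predict_proba")
    print(proba.shape)  # (150, 3): one row of class probabilities per sample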

From 04d2e3290f47ee7ee86767ae534629cd4112ffbf Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort 
Date: Sat, 2 May 2020 10:11:07 +0200
Subject: [PATCH 101/125] Speedup MultiTaskLasso (#17021)

---
 doc/whats_new/v0.23.rst                       |   7 ++
 sklearn/linear_model/_cd_fast.pyx             | 116 ++++++++++--------
 sklearn/linear_model/_coordinate_descent.py   |  31 +++--
 .../tests/test_coordinate_descent.py          |   4 +-
 4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index e8a1566349f06..bd31752973f6a 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -404,6 +404,13 @@ Changelog
   using joblib loky backend. :pr:`14264` by
   :user:`Jérémie du Boisberranger `.
 
+- |Efficiency| Speed up :class:`linear_model.MultiTaskLasso`,
+  :class:`linear_model.MultiTaskLassoCV`, :class:`linear_model.MultiTaskElasticNet`,
+  :class:`linear_model.MultiTaskElasticNetCV` by avoiding slower
+  BLAS Level 2 calls on small arrays.
+  :pr:`17021` by :user:`Alex Gramfort ` and
+  :user:`Mathurin Massias `.
+
 :mod:`sklearn.metrics`
 ......................
 
diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx
index fcbe46ce77711..5b47f45c2e248 100644
--- a/sklearn/linear_model/_cd_fast.pyx
+++ b/sklearn/linear_model/_cd_fast.pyx
@@ -19,7 +19,7 @@ from cython cimport floating
 import warnings
 from ..exceptions import ConvergenceWarning
 
-from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, 
+from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2,
                                    _copy, _scal)
 from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans
 
@@ -154,7 +154,7 @@ def enet_coordinate_descent(floating[::1] w,
     with nogil:
         # R = y - np.dot(X, w)
         _copy(n_samples, &y[0], 1, &R[0], 1)
-        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], 
+        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0],
               n_samples, &w[0], 1, 1.0, &R[0], 1)
 
         # tol *= np.dot(y, y)
@@ -620,18 +620,17 @@ def enet_coordinate_descent_gram(floating[::1] w,
     return np.asarray(w), gap, tol, n_iter + 1
 
 
-def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
-                                       floating l2_reg,
-                                       np.ndarray[floating, ndim=2, mode='fortran'] X,
-                                       np.ndarray[floating, ndim=2] Y,
-                                       int max_iter, floating tol, object rng,
-                                       bint random=0):
+def enet_coordinate_descent_multi_task(
+        floating[::1, :] W, floating l1_reg, floating l2_reg,
+        np.ndarray[floating, ndim=2, mode='fortran'] X,  # TODO: use views in 0.24
+        np.ndarray[floating, ndim=2, mode='fortran'] Y,
+        int max_iter, floating tol, object rng, bint random=0):
     """Cython version of the coordinate descent algorithm
         for Elastic-Net mult-task regression
 
         We minimize
 
-        (1/2) * norm(y - X w, 2)^2 + l1_reg ||w||_21 + (1/2) * l2_reg norm(w, 2)^2
+        0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2
 
     """
 
@@ -651,11 +650,11 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     cdef floating dual_norm_XtA
 
     # initial value of the residuals
-    cdef floating[:, ::1] R = np.zeros((n_samples, n_tasks), dtype=dtype)
+    cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F')
 
-    cdef floating[:] norm_cols_X = np.zeros(n_features, dtype=dtype)
+    cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype)
     cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype)
-    cdef floating[:] w_ii = np.zeros(n_tasks, dtype=dtype)
+    cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype)
     cdef floating d_w_max
     cdef floating w_max
     cdef floating d_w_ii
@@ -675,9 +674,7 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     cdef UINT32_t* rand_r_state = &rand_r_state_seed
 
     cdef floating* X_ptr = &X[0, 0]
-    cdef floating* W_ptr = &W[0, 0]
     cdef floating* Y_ptr = &Y[0, 0]
-    cdef floating* wii_ptr = &w_ii[0]
 
     if l1_reg == 0:
         warnings.warn("Coordinate descent with l1_reg=0 may lead to unexpected"
@@ -686,15 +683,15 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     with nogil:
         # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0)
         for ii in range(n_features):
-            for jj in range(n_samples):
-                norm_cols_X[ii] += X[jj, ii] ** 2
+            norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2
 
         # R = Y - np.dot(X, W.T)
-        for ii in range(n_samples):
+        _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1)
+        for ii in range(n_features):
             for jj in range(n_tasks):
-                R[ii, jj] = Y[ii, jj] - (
-                    _dot(n_features, X_ptr + ii, n_samples, W_ptr + jj, n_tasks)
-                    )
+                if W[jj, ii] != 0:
+                    _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                          &R[0, jj], 1)
 
         # tol = tol * linalg.norm(Y, ord='fro') ** 2
         tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2
@@ -712,42 +709,59 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                     continue
 
                 # w_ii = W[:, ii] # Store previous value
-                _copy(n_tasks, W_ptr + ii * n_tasks, 1, wii_ptr, 1)
-
-                # if np.sum(w_ii ** 2) != 0.0:  # can do better
-                if _nrm2(n_tasks, wii_ptr, 1) != 0.0:
-                    # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
-                    _ger(RowMajor, n_samples, n_tasks, 1.0,
-                         X_ptr + ii * n_samples, 1,
-                         wii_ptr, 1, &R[0, 0], n_tasks)
-
+                _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1)
+
+                # Using Numpy:
+                # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
+                # Using Blas Level2:
+                # _ger(RowMajor, n_samples, n_tasks, 1.0,
+                #      &X[0, ii], 1,
+                #      &w_ii[0], 1, &R[0, 0], n_tasks)
+                # Using Blas Level1 and for loop to avoid slower threads
+                # for such small vectors
+                for jj in range(n_tasks):
+                    if w_ii[jj] != 0:
+                        _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
+
+                # Using numpy:
                 # tmp = np.dot(X[:, ii][None, :], R).ravel()
-                _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
-                      n_tasks, X_ptr + ii * n_samples, 1, 0.0, &tmp[0], 1)
+                # Using BLAS Level 2:
+                # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
+                #       n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1,
+                                   &R[0, jj], 1)
 
                 # nn = sqrt(np.sum(tmp ** 2))
                 nn = _nrm2(n_tasks, &tmp[0], 1)
 
                 # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg)
-                _copy(n_tasks, &tmp[0], 1, W_ptr + ii * n_tasks, 1)
+                _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1)
                 _scal(n_tasks, fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg),
-                      W_ptr + ii * n_tasks, 1)
-
-                # if np.sum(W[:, ii] ** 2) != 0.0:  # can do better
-                if _nrm2(n_tasks, W_ptr + ii * n_tasks, 1) != 0.0:
-                    # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
-                    # Update residual : rank 1 update
-                    _ger(RowMajor, n_samples, n_tasks, -1.0,
-                         X_ptr + ii * n_samples, 1, W_ptr + ii * n_tasks, 1,
-                         &R[0, 0], n_tasks)
+                      &W[0, ii], 1)
+
+                # Using numpy:
+                # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
+                # Using BLAS Level 2:
+                # Update residual : rank 1 update
+                # _ger(RowMajor, n_samples, n_tasks, -1.0,
+                #      &X[0, ii], 1, &W[0, ii], 1,
+                #      &R[0, 0], n_tasks)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    if W[jj, ii] != 0:
+                        _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
 
                 # update the maximum absolute coefficient update
-                d_w_ii = diff_abs_max(n_tasks, W_ptr + ii * n_tasks, wii_ptr)
+                d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0])
 
                 if d_w_ii > d_w_max:
                     d_w_max = d_w_ii
 
-                W_ii_abs_max = abs_max(n_tasks, W_ptr + ii * n_tasks)
+                W_ii_abs_max = abs_max(n_tasks, &W[0, ii])
                 if W_ii_abs_max > w_max:
                     w_max = W_ii_abs_max
 
@@ -760,16 +774,14 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                 for ii in range(n_features):
                     for jj in range(n_tasks):
                         XtA[ii, jj] = _dot(
-                            n_samples, X_ptr + ii * n_samples, 1,
-                            &R[0, 0] + jj, n_tasks
+                            n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
                             ) - l2_reg * W[jj, ii]
 
                 # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
                 dual_norm_XtA = 0.0
                 for ii in range(n_features):
                     # np.sqrt(np.sum(XtA ** 2, axis=1))
-                    XtA_axis1norm = _nrm2(n_tasks,
-                                          &XtA[0, 0] + ii * n_tasks, 1)
+                    XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
                     if XtA_axis1norm > dual_norm_XtA:
                         dual_norm_XtA = XtA_axis1norm
 
@@ -777,7 +789,7 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                 # R_norm = linalg.norm(R, ord='fro')
                 # w_norm = linalg.norm(W, ord='fro')
                 R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1)
-                w_norm = _nrm2(n_features * n_tasks, W_ptr, 1)
+                w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1)
                 if (dual_norm_XtA > l1_reg):
                     const =  l1_reg / dual_norm_XtA
                     A_norm = R_norm * const
@@ -787,16 +799,12 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                     gap = R_norm ** 2
 
                 # ry_sum = np.sum(R * y)
-                ry_sum = 0.0
-                for ii in range(n_samples):
-                    for jj in range(n_tasks):
-                        ry_sum += R[ii, jj] * Y[ii, jj]
+                ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1)
 
                 # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum()
                 l21_norm = 0.0
                 for ii in range(n_features):
-                    # np.sqrt(np.sum(W ** 2, axis=0))
-                    l21_norm += _nrm2(n_tasks, W_ptr + n_tasks * ii, 1)
+                    l21_norm += _nrm2(n_tasks, &W[0, ii], 1)
 
                 gap += l1_reg * l21_norm - const * ry_sum + \
                      0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2)
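As a side note on this change (not part of the patch): the per-task BLAS Level 1 loops compute exactly the same quantities as the Level 2 ``_ger``/``_gemv`` calls they replace. A minimal NumPy sketch, with arbitrary sizes and data, illustrates the equivalence::

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_tasks = 6, 3
    R = rng.randn(n_samples, n_tasks)    # residual matrix
    x_ii = rng.randn(n_samples)          # column ii of X
    w_ii = rng.randn(n_tasks)            # coefficients of feature ii, one per task

    # rank 1 update R += outer(x_ii, w_ii): _ger vs. one _axpy per task
    R_ger = R + np.outer(x_ii, w_ii)
    R_axpy = R.copy()
    for jj in range(n_tasks):
        if w_ii[jj] != 0:
            R_axpy[:, jj] += w_ii[jj] * x_ii
    assert np.allclose(R_ger, R_axpy)

    # tmp = X[:, ii].T @ R: _gemv vs. one _dot per task
    tmp_gemv = x_ii @ R_ger
    tmp_dot = np.array([x_ii @ R_ger[:, jj] for jj in range(n_tasks)])
    assert np.allclose(tmp_gemv, tmp_dot)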
diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py
index 2d8567b04db56..3ac0d155169af 100644
--- a/sklearn/linear_model/_coordinate_descent.py
+++ b/sklearn/linear_model/_coordinate_descent.py
@@ -1733,9 +1733,9 @@ class MultiTaskElasticNet(Lasso):
 
     Where::
 
-        ||W||_21 = sum_i sqrt(sum_j w_ij ^ 2)
+        ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2)
 
-    i.e. the sum of norm of each row.
+    i.e. the sum of norms of each row.
 
     Read more in the :ref:`User Guide `.
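As a concrete illustration of the mixed norm in this docstring (a standalone sketch, not part of the patch), for a coefficient matrix ``W`` with one row per feature::

    import numpy as np

    # ||W||_21 = sum_i sqrt(sum_j W_ij ** 2), the sum of the l2 norms of the rows
    W = np.array([[3.0, 4.0],
                  [0.0, 0.0],
                  [1.0, 0.0]])
    print(np.sqrt((W ** 2).sum(axis=1)).sum())  # 5.0 + 0.0 + 1.0 = 6.0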
 
@@ -1829,8 +1829,8 @@ class MultiTaskElasticNet(Lasso):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     @_deprecate_positional_args
     def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True,
@@ -1867,12 +1867,11 @@ def fit(self, X, y):
         To avoid memory re-allocation it is advised to allocate the
         initial data in memory directly using that format.
         """
-
         # Need to validate separately here.
         # We can't pass multi_output=True because that would allow y to be csr.
         check_X_params = dict(dtype=[np.float64, np.float32], order='F',
                               copy=self.copy_X and self.fit_intercept)
-        check_y_params = dict(ensure_2d=False)
+        check_y_params = dict(ensure_2d=False, order='F')
         X, y = self._validate_data(X, y, validate_separately=(check_X_params,
                                                               check_y_params))
         y = y.astype(X.dtype)
@@ -2000,13 +1999,13 @@ class MultiTaskLasso(MultiTaskElasticNet):
     --------
     >>> from sklearn import linear_model
     >>> clf = linear_model.MultiTaskLasso(alpha=0.1)
-    >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])
+    >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]])
     MultiTaskLasso(alpha=0.1)
     >>> print(clf.coef_)
-    [[0.89393398 0.        ]
-     [0.89393398 0.        ]]
+    [[0.         0.60809415]
+    [0.         0.94592424]]
     >>> print(clf.intercept_)
-    [0.10606602 0.10606602]
+    [-0.41888636 -0.87382323]
 
     See also
     --------
@@ -2018,8 +2017,8 @@ class MultiTaskLasso(MultiTaskElasticNet):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     @_deprecate_positional_args
     def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False,
@@ -2196,8 +2195,8 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     path = staticmethod(enet_path)
 
@@ -2368,8 +2367,8 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     path = staticmethod(lasso_path)
 
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 142c1e9ac2a47..1b2f7c656f015 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -882,9 +882,9 @@ def test_convergence_warnings():
     X = random_state.standard_normal((1000, 500))
     y = random_state.standard_normal((1000, 3))
 
-    # check that the model fails to converge
+    # check that the model fails to converge (a negative dual gap cannot occur)
     with pytest.warns(ConvergenceWarning):
-        MultiTaskElasticNet(max_iter=1, tol=0).fit(X, y)
+        MultiTaskElasticNet(max_iter=1, tol=-1).fit(X, y)
 
     # check that the model converges w/o warnings
     with pytest.warns(None) as record:

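The updated test relies on the fact that the duality gap is never negative, so with ``tol=-1`` the stopping criterion can never be met and the solver always stops at ``max_iter`` with a ``ConvergenceWarning``. A standalone sketch of the same behaviour (illustrative only, array sizes are arbitrary)::

    import numpy as np
    import pytest
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.linear_model import MultiTaskElasticNet

    X = np.random.RandomState(0).standard_normal((100, 20))
    y = np.random.RandomState(1).standard_normal((100, 3))
    with pytest.warns(ConvergenceWarning):
        MultiTaskElasticNet(max_iter=1, tol=-1).fit(X, y)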
From 5756205920ae072f1f3f3868e4132a85a374a38e Mon Sep 17 00:00:00 2001
From: Thomas J Fan 
Date: Sun, 3 May 2020 11:24:25 -0400
Subject: [PATCH 102/125] DOC Adds release highlights to front page (#17071)

---
 doc/conf.py                             | 18 ++++++++++++++++++
 doc/templates/index.html                |  4 ++--
 doc/themes/scikit-learn-modern/nav.html |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index d459cdfd3f1af..74e37d01307be 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -17,6 +17,7 @@
 import warnings
 import re
 from packaging.version import parse
+from pathlib import Path
 
 # If extensions (or modules to document with autodoc) are in another
 # directory, add these directories to sys.path here. If the directory
@@ -208,6 +209,23 @@
 # If true, the reST sources are included in the HTML build as _sources/name.
 html_copy_source = True
 
+# Adds variables into templates
+html_context = {}
+# find the latest release highlights example and place it into the HTML
+# context for index.html
+release_highlights_dir = Path("..") / "examples" / "release_highlights"
+# Finds the highlight with the latest version number
+latest_highlights = sorted(release_highlights_dir.glob(
+                           "plot_release_highlights_*.py"))[-1]
+latest_highlights = latest_highlights.with_suffix('').name
+html_context["release_highlights"] = \
+    f"auto_examples/release_highlights/{latest_highlights}"
+
+# get version from highlight name assuming highlights have the form
+# plot_release_highlights_0_22_0
+highlight_version = ".".join(latest_highlights.split("_")[-3:-1])
+html_context["release_highlights_version"] = highlight_version
+
 # -- Options for LaTeX output ------------------------------------------------
 latex_elements = {
     # The paper size ('letterpaper' or 'a4paper').
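A quick sanity check of the version parsing added above, assuming highlight files follow the ``plot_release_highlights_<major>_<minor>_<micro>.py`` naming mentioned in the comment::

    name = "plot_release_highlights_0_23_0"
    print(".".join(name.split("_")[-3:-1]))  # prints "0.23"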
diff --git a/doc/templates/index.html b/doc/templates/index.html
index 8d3bdfaec2b28..367e6a3c01902 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -8,7 +8,7 @@
         scikit-learn
         Machine Learning in Python
         Getting Started
-        What's New in {{ release }}
+        Release Highlights for {{ release_highlights_version }}
         GitHub
@@ -160,7 +160,7 @@
       News
         March 2020. scikit-learn 0.22.2 is available for download (Changelog).
         January 2020. scikit-learn 0.22.1 is available for download (Changelog).
-        December 2019. scikit-learn 0.22 is available for download (Changelog).
+        December 2019. scikit-learn 0.22 is available for download (Changelog and Release Highlights).
         Scikit-learn from 0.21 requires Python 3.5 or greater.
diff --git a/doc/themes/scikit-learn-modern/nav.html b/doc/themes/scikit-learn-modern/nav.html
index 57c631f6cbee7..4fbd22f48a4dd 100644
--- a/doc/themes/scikit-learn-modern/nav.html
+++ b/doc/themes/scikit-learn-modern/nav.html
@@ -9,6 +9,7 @@
 {%- set drop_down_navigation = [
   ('Getting Started', pathto('getting_started')),
   ('Tutorial', pathto('tutorial/index')),
+  ("What's new", 'whats_new/v' + version + '.html'),
   ('Glossary', pathto('glossary')),
   ('Development', pathto('developers/index')),
   ('FAQ', pathto('faq')),

From 8b1b281de13138aac954c56f5be95cd7ec9ca44b Mon Sep 17 00:00:00 2001
From: Christian Kastner
Date: Sun, 3 May 2020 23:30:02 +0200
Subject: [PATCH 103/125] EXA Remove stray executable flag from example (#17116)

---
 examples/linear_model/plot_bayesian_ridge_curvefit.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 examples/linear_model/plot_bayesian_ridge_curvefit.py

diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py
old mode 100755
new mode 100644

From 4fae53964b00ece9b32c85298289514d6d646b93 Mon Sep 17 00:00:00 2001
From: Pankaj Jindal <36332727+jindalpankaj@users.noreply.github.com>
Date: Mon, 4 May 2020 08:09:48 -0700
Subject: [PATCH 104/125] DOC Correcting an attribute's name (#17110)

---
 doc/modules/svm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index 23dc7fbf67b65..8acebc79e412e 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -90,7 +90,7 @@ After being fitted, the model can then be used to predict new values::
 SVMs decision function (detailed in the :ref:`svm_mathematical_formulation`)
 depends on some subset of the training data, called the support vectors. Some
 properties of these support vectors can be found in attributes
-``support_vectors_``, ``support_`` and ``n_support``::
+``support_vectors_``, ``support_`` and ``n_support_``::

     >>> # get support vectors
     >>> clf.support_vectors_

From a0c76ce3cdbcb87e6d61348c46cbc12486677354 Mon Sep 17 00:00:00 2001
From: Christoph Deil
Date: Mon, 4 May 2020 21:00:53 +0200
Subject: [PATCH 105/125] MNT Remove sklearn logger default StreamHandler (#16451)

* Remove sklearn logger default StreamHandler

To avoid duplicate log messages

* DOC Adds whats_new

* CLN Address comments

* MNT Remove setLevel

Co-authored-by: Thomas J Fan
---
 doc/whats_new/v0.23.rst | 6 ++++++
 sklearn/__init__.py     | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index bd31752973f6a..d60b16edce903 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -195,6 +195,12 @@ Changelog
   `ValueError` for arguments `n_classes < 1` OR `length < 1`.
   :pr:`16006` by :user:`Rushabh Vasani `.

+- |API| The `StreamHandler` was removed from `sklearn.logger` to avoid
+  double logging of messages in common cases where a handler is attached
+  to the root logger, and to follow the Python logging documentation
+  recommendation for libraries to leave the log message handling to
+  users and application code. :pr:`16451` by :user:`Christoph Deil `.
+
 :mod:`sklearn.decomposition`
 ............................
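With the default handler gone, an application that wants to see scikit-learn's log messages now configures logging itself; a minimal sketch using only the standard library (format and level chosen arbitrarily)::

    import logging

    # attach a handler at the application level; the library no longer does this
    logging.basicConfig(format="%(name)s - %(levelname)s - %(message)s",
                        level=logging.INFO)
    logging.getLogger("sklearn").setLevel(logging.INFO)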
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 7f203a079f22b..4d942319c5eb7 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -19,8 +19,6 @@
 from ._config import get_config, set_config, config_context

 logger = logging.getLogger(__name__)
-logger.addHandler(logging.StreamHandler())
-logger.setLevel(logging.INFO)


 # PEP0440 compatible formatted version, see:

From a670bb9202396065bd093684ebb79b6753901d26 Mon Sep 17 00:00:00 2001
From: Vikas Pandey
Date: Tue, 5 May 2020 01:38:59 +0530
Subject: [PATCH 106/125] add dtreevis to related packages #17105 (#17113)

---
 doc/related_projects.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 89079971ca29a..15c35a51f9b0c 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -73,6 +73,9 @@ enhance the functionality of scikit-learn's estimators.

 **Model inspection and visualisation**

+- `dtreeviz `_ A python library for
+  decision tree visualization and model interpretation.
+
 - `eli5 `_ A library for
   debugging/inspecting machine learning models and explaining their
   predictions.

From 962849aac4b7df13cd7043646e653ccec5021f3b Mon Sep 17 00:00:00 2001
From: Abo7atm <33042538+Abo7atm@users.noreply.github.com>
Date: Tue, 5 May 2020 01:15:13 +0300
Subject: [PATCH 107/125] DOC Add tslearn to related projects (#17109)

---
 doc/related_projects.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 15c35a51f9b0c..17e33ed691eb5 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -121,6 +121,9 @@ and tasks.

 **Structured learning**

+- `tslearn `_ A machine learning library for time series
+  that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression.
+
 - `sktime `_ A scikit-learn compatible toolbox for machine learning
   with time series including time series classification/regression and (supervised/panel) forecasting.

 - `Seqlearn `_ Sequence classification
   using HMMs or structured perceptron.

From 2cad437fc940bfe6fbb7a30b1c4e82d4f670fa25 Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Mon, 4 May 2020 18:19:35 -0400
Subject: [PATCH 108/125] STY Adjust line height of code blocks (#17094)

---
 doc/themes/scikit-learn-modern/static/css/theme.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
index 2b80d6fe2b762..ceda27c6de093 100644
--- a/doc/themes/scikit-learn-modern/static/css/theme.css
+++ b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -90,7 +90,7 @@ div.highlight {

 div.highlight pre {
   margin-bottom: 0;
-  line-height: 1rem;
+  line-height: 1.2rem;
 }

 div.highlight a {

From bd3fb2af30564bf918e506aed5a28c730e9da0e8 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Tue, 5 May 2020 09:01:09 +0200
Subject: [PATCH 109/125] DOC Remove unmaintained related projects (#17112)

---
 doc/related_projects.rst | 75 ++++------------------------------------
 1 file changed, 7 insertions(+), 68 deletions(-)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 17e33ed691eb5..825498d95ce92 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -23,17 +23,12 @@ enhance the functionality of scikit-learn's estimators.

 - `sklearn_pandas `_ bridge for scikit-learn
   pipelines and pandas data frame with dedicated transformers.
- 
+
 - `sklearn_xarray `_ provides compatibility of scikit-learn
   estimators with xarray data structures.

 **Auto-ML**

-- `auto_ml `_
-  Automated machine learning for production and analytics, built on scikit-learn
-  and related projects. Trains a pipeline wth all the standard machine learning
-  steps. Tuned for prediction speed and ease of transfer to production environments.
-
 - `auto-sklearn `_
   An automated machine learning toolkit and a drop-in replacement for a
   scikit-learn estimator
@@ -55,22 +50,11 @@ enhance the functionality of scikit-learn's estimators.
 - `REP `_ Environment for conducting data-driven
   research in a consistent and reproducible way

-- `ML Frontend `_ provides
-  dataset management and SVM fitting/prediction through
-  `web-based `_
-  and `programmatic `_
-  interfaces.
-
 - `Scikit-Learn Laboratory `_ A command-line
   wrapper around scikit-learn that makes it easy to run machine learning
   experiments with multiple learners and large feature sets.

-- `Xcessiv `_ is a notebook-like
-  application for quick, scalable, and automated hyperparameter tuning
-  and stacked ensembling. Provides a framework for keeping track of
-  model-hyperparameter combinations.
-
 **Model inspection and visualisation**

 - `dtreeviz `_ A python library for
@@ -83,9 +67,6 @@ enhance the functionality of scikit-learn's estimators.

 - `mlxtend `_ Includes model visualization
   utilities.

-- `scikit-plot `_ A visualization library
-  for quick and easy generation of common plots in data analysis and machine learning.
-
 - `yellowbrick `_ A suite of
   custom matplotlib visualizers for scikit-learn estimators to support visual feature
   analysis, model selection, evaluation, and diagnostics.
@@ -105,11 +86,6 @@ enhance the functionality of scikit-learn's estimators.
 - `sklearn-porter `_
   Transpile trained scikit-learn models to C, Java, Javascript and others.

-- `sklearn-compiledtrees `_
-  Generate a C++ implementation of the predict function for decision trees (and
-  ensembles) trained by sklearn. Useful for latency-sensitive production
-  environments.
-
 Other estimators and tasks
 --------------------------
@@ -126,9 +102,6 @@ and tasks.
 - `sktime `_ A scikit-learn compatible toolbox for machine learning
   with time series including time series classification/regression and (supervised/panel) forecasting.

-- `Seqlearn `_ Sequence classification
-  using HMMs or structured perceptron.
-
 - `HMMLearn `_ Implementation of hidden
   markov models that was previously part of scikit-learn.
@@ -145,12 +118,6 @@ and tasks.

 **Deep neural networks etc.**

-- `pylearn2 `_ A deep learning and
-  neural network library build on theano with scikit-learn like interface.
-
-- `sklearn_theano `_ scikit-learn compatible
-  estimators, transformers, and datasets which use Theano internally
-
 - `nolearn `_ A number of wrappers and
   abstractions around existing neural network libraries
@@ -159,8 +126,8 @@ and tasks.
 - `lasagne `_ A lightweight library to
   build and train neural networks in Theano.
- 
-- `skorch `_ A scikit-learn compatible
+
+- `skorch `_ A scikit-learn compatible
   neural network library that wraps PyTorch.

 **Broad scope**
@@ -168,9 +135,6 @@ and tasks.
 - `mlxtend `_ Includes a number of additional
   estimators as well as model visualization utilities.

-- `sparkit-learn `_ Scikit-learn
-  API and functionality for PySpark's distributed modelling.
-
 **Other regression and classification**

 - `xgboost `_ Optimised gradient boosted decision
@@ -193,18 +157,15 @@ and tasks.
 - `gplearn `_ Genetic Programming
   for symbolic regression tasks.

-- `multiisotonic `_ Isotonic
-  regression on multidimensional features.
-
-- `scikit-multilearn `_ Multi-label classification with
-  focus on label space manipulation.
+- `scikit-multilearn `_
+  Multi-label classification with focus on label space manipulation.

-- `seglearn `_ Time series and sequence
+- `seglearn `_ Time series and sequence
   learning using sliding window segmentation.

 **Decomposition and clustering**

-- `lda `_: Fast implementation of latent
+- `lda `_: Fast implementation of latent
   Dirichlet allocation in Cython which uses `Gibbs sampling
   `_ to sample from the true posterior distribution.
   (scikit-learn's `_ to sample from a tractable approximation of a topic
   model's posterior distribution.)

-- `Sparse Filtering `_
-  Unsupervised feature learning based on sparse-filtering
-
 - `kmodes `_ k-modes clustering algorithm for
   categorical data, and several of its variations.
@@ -243,9 +201,6 @@ Other packages useful for data analysis and machine learning.
 - `Pandas `_ Tools for working with heterogeneous and
   columnar data, relational queries, time series and basic statistics.

-- `theano `_ A CPU/GPU array
-  processing framework geared towards deep learning research.
-
 - `statsmodels `_ Estimating and analysing
   statistical models. More focused on statistical tests and less on prediction
   than scikit-learn.
@@ -259,17 +214,9 @@ Other packages useful for data analysis and machine learning.
 - `Seaborn `_ Visualization library based on matplotlib.
   It provides a high-level interface for drawing attractive statistical graphics.

-- `Deep Learning `_ A curated list of deep learning
-  software libraries.
-
 Recommendation Engine packages
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- `GraphLab `_
-  Implementation of classical recommendation techniques (in C++, with
-  Python bindings).
-
 - `implicit `_, Library for implicit feedback datasets.
@@ -303,11 +250,3 @@ Domain specific packages

 - `MSMBuilder `_ Machine learning for protein
   conformational dynamics time series.
-
-- `scikit-surprise `_ A scikit for building and
-  evaluating recommender systems.
-
-Snippets and tidbits
---------------------
-
-The `wiki `_ has more!

From b2b88d3ee47cfb2f4b947751ec141d6107717243 Mon Sep 17 00:00:00 2001
From: Adrin Jalali
Date: Tue, 5 May 2020 09:07:25 +0200
Subject: [PATCH 110/125] MNT bump master version to 0.24.dev0 (#17121)

---
 sklearn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 4d942319c5eb7..870d0d9a93f0d 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -37,7 +37,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.23.dev0'
+__version__ = '0.24.dev0'


 # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded

From f23b940dcabdc86b8b71dc8a9a90ef91505407cc Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Tue, 5 May 2020 03:58:10 -0400
Subject: [PATCH 111/125] FIX Adjusts html_repr based on configuration (#17093)

* ENH Adjusts html_repr based on configuration

* CLN Returns None instead

* CLN Uses property hack

* CLN Address comments
---
 sklearn/base.py            | 19 ++++++++++++++++++-
 sklearn/tests/test_base.py | 12 ++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 666574b491594..77c3223ed75e1 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -437,8 +437,25 @@ def _validate_data(self, X, y=None, reset=True,

         return out

+    @property
     def _repr_html_(self):
-        """HTML representation of estimator"""
+        """HTML representation of estimator.
+
+        This is redundant with the logic of `_repr_mimebundle_`. The latter
+        should be favored in the long term; `_repr_html_` is only
+        implemented for consumers who do not interpret `_repr_mimebundle_`.
+        """
+        if get_config()["display"] != 'diagram':
+            raise AttributeError("_repr_html_ is only defined when the "
+                                 "'display' configuration option is set to "
+                                 "'diagram'")
+        return self._repr_html_inner
+
+    def _repr_html_inner(self):
+        """This function is returned by the @property `_repr_html_` to make
+        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
+        on `get_config()["display"]`.
+        """
         return estimator_html_repr(self)

     def _repr_mimebundle_(self, **kwargs):

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index e20fa440d1933..db5c88051346a 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -525,3 +525,15 @@ def test_repr_mimebundle_():
     output = tree._repr_mimebundle_()
     assert "text/plain" in output
     assert "text/html" in output
+
+
+def test_repr_html_wraps():
+    # Checks the display configuration flag controls the html output
+    tree = DecisionTreeClassifier()
+    msg = "_repr_html_ is only defined when"
+    with pytest.raises(AttributeError, match=msg):
+        output = tree._repr_html_()
+
+    with config_context(display='diagram'):
+        output = tree._repr_html_()
+        assert "