diff --git a/Makefile b/Makefile index 43fc5afe63361..b2171d06b6747 100644 --- a/Makefile +++ b/Makefile @@ -67,4 +67,4 @@ code-analysis: pylint -E -i y sklearn/ -d E1103,E0611,E1101 flake8-diff: - ./build_tools/circle/linting.sh + git diff upstream/master -u -- "*.py" | flake8 --diff diff --git a/README.rst b/README.rst index 171a19785dd73..0fac479bba81a 100644 --- a/README.rst +++ b/README.rst @@ -31,12 +31,12 @@ SciPy and is distributed under the 3-Clause BSD license. The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `__ page +the `About us `__ page for a list of core contributors. It is currently maintained by a team of volunteers. -Website: http://scikit-learn.org +Website: https://scikit-learn.org Installation @@ -73,13 +73,13 @@ or ``conda``:: conda install scikit-learn -The documentation includes more detailed `installation instructions `_. +The documentation includes more detailed `installation instructions `_. Changelog --------- -See the `changelog `__ +See the `changelog `__ for a history of notable changes to scikit-learn. Development @@ -87,7 +87,7 @@ Development We welcome new contributors of all experience levels. The scikit-learn community goals are to be helpful, welcoming, and effective. The -`Development Guide `_ +`Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. @@ -120,7 +120,7 @@ source directory (you will need to have ``pytest`` >= 3.3.0 installed):: pytest sklearn -See the web page http://scikit-learn.org/dev/developers/advanced_installation.html#testing +See the web page https://scikit-learn.org/dev/developers/advanced_installation.html#testing for more information. Random number generation can be controlled during testing by setting @@ -131,7 +131,7 @@ Submitting a Pull Request Before opening a Pull Request, have a look at the full Contributing page to make sure your code complies -with our guidelines: http://scikit-learn.org/stable/developers/index.html +with our guidelines: https://scikit-learn.org/stable/developers/index.html Project History @@ -139,7 +139,7 @@ Project History The project was started in 2007 by David Cournapeau as a Google Summer of Code project, and since then many volunteers have contributed. See -the `About us `__ page +the `About us `__ page for a list of core contributors. The project is currently maintained by a team of volunteers. 
@@ -153,9 +153,9 @@ Help and Support Documentation ~~~~~~~~~~~~~ -- HTML documentation (stable release): http://scikit-learn.org -- HTML documentation (development version): http://scikit-learn.org/dev/ -- FAQ: http://scikit-learn.org/stable/faq.html +- HTML documentation (stable release): https://scikit-learn.org +- HTML documentation (development version): https://scikit-learn.org/dev/ +- FAQ: https://scikit-learn.org/stable/faq.html Communication ~~~~~~~~~~~~~ @@ -163,9 +163,9 @@ Communication - Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn - IRC channel: ``#scikit-learn`` at ``webchat.freenode.net`` - Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn -- Website: http://scikit-learn.org +- Website: https://scikit-learn.org Citation ~~~~~~~~ -If you use scikit-learn in a scientific publication, we would appreciate citations: http://scikit-learn.org/stable/about.html#citing-scikit-learn +If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 047988e25f648..196d4ca34f434 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,10 @@ jobs: displayName: Add conda to PATH - bash: sudo chown -R $USER $CONDA displayName: Take ownership of conda installation - - bash: conda create --name flake8_env --yes flake8 + - bash: | + conda create --name flake8_env --yes python=3.8 + conda activate flake8_env + pip install flake8 mypy==0.770 displayName: Install flake8 - bash: | if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then @@ -25,10 +28,20 @@ jobs: echo "Skipping linting" exit 0 else - source activate flake8_env + conda activate flake8_env ./build_tools/circle/linting.sh fi displayName: Run linting + - bash: | + if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[lint\ skip\] ]]; then + # skip linting + echo "Skipping linting" + exit 0 + else + conda activate flake8_env + mypy sklearn/ --ignore-missing-imports + fi + displayName: Run mypy - bash: | if [[ $BUILD_SOURCEVERSIONMESSAGE =~ \[scipy-dev\] ]] || \ [[ $BUILD_REASON == "Schedule" ]]; then diff --git a/benchmarks/bench_hist_gradient_boosting_higgsboson.py b/benchmarks/bench_hist_gradient_boosting_higgsboson.py index ec75760cd39f7..2c74bb8818343 100644 --- a/benchmarks/bench_hist_gradient_boosting_higgsboson.py +++ b/benchmarks/bench_hist_gradient_boosting_higgsboson.py @@ -25,12 +25,14 @@ parser.add_argument('--learning-rate', type=float, default=1.) 
parser.add_argument('--subsample', type=int, default=None) parser.add_argument('--max-bins', type=int, default=255) +parser.add_argument('--no-predict', action="store_true", default=False) +parser.add_argument('--cache-loc', type=str, default='/tmp') args = parser.parse_args() HERE = os.path.dirname(__file__) URL = ("https://archive.ics.uci.edu/ml/machine-learning-databases/00280/" "HIGGS.csv.gz") -m = Memory(location='/tmp', mmap_mode='r') +m = Memory(location=args.cache_loc, mmap_mode='r') n_leaf_nodes = args.n_leaf_nodes n_trees = args.n_trees @@ -56,6 +58,27 @@ def load_data(): return df +def fit(est, data_train, target_train, libname): + print(f"Fitting a {libname} model...") + tic = time() + est.fit(data_train, target_train) + toc = time() + print(f"fitted in {toc - tic:.3f}s") + + +def predict(est, data_test, target_test): + if args.no_predict: + return + tic = time() + predicted_test = est.predict(data_test) + predicted_proba_test = est.predict_proba(data_test) + toc = time() + roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) + acc = accuracy_score(target_test, predicted_test) + print(f"predicted in {toc - tic:.3f}s, " + f"ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + + df = load_data() target = df.values[:, 0] data = np.ascontiguousarray(df.values[:, 1:]) @@ -68,56 +91,28 @@ def load_data(): n_samples, n_features = data_train.shape print(f"Training set with {n_samples} records with {n_features} features.") -print("Fitting a sklearn model...") -tic = time() est = HistGradientBoostingClassifier(loss='binary_crossentropy', learning_rate=lr, max_iter=n_trees, max_bins=max_bins, max_leaf_nodes=n_leaf_nodes, - n_iter_no_change=None, + early_stopping=False, random_state=0, verbose=1) -est.fit(data_train, target_train) -toc = time() -predicted_test = est.predict(data_test) -predicted_proba_test = est.predict_proba(data_test) -roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) -acc = accuracy_score(target_test, predicted_test) -print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") +fit(est, data_train, target_train, 'sklearn') +predict(est, data_test, target_test) if args.lightgbm: - print("Fitting a LightGBM model...") - tic = time() - lightgbm_est = get_equivalent_estimator(est, lib='lightgbm') - lightgbm_est.fit(data_train, target_train) - toc = time() - predicted_test = lightgbm_est.predict(data_test) - predicted_proba_test = lightgbm_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='lightgbm') + fit(est, data_train, target_train, 'lightgbm') + predict(est, data_test, target_test) if args.xgboost: - print("Fitting an XGBoost model...") - tic = time() - xgboost_est = get_equivalent_estimator(est, lib='xgboost') - xgboost_est.fit(data_train, target_train) - toc = time() - predicted_test = xgboost_est.predict(data_test) - predicted_proba_test = xgboost_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='xgboost') + fit(est, data_train, target_train, 'xgboost') + predict(est, data_test, target_test) if args.catboost: - print("Fitting a Catboost model...") - tic = time() - catboost_est = 
get_equivalent_estimator(est, lib='catboost') - catboost_est.fit(data_train, target_train) - toc = time() - predicted_test = catboost_est.predict(data_test) - predicted_proba_test = catboost_est.predict_proba(data_test) - roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1]) - acc = accuracy_score(target_test, predicted_test) - print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}") + est = get_equivalent_estimator(est, lib='catboost') + fit(est, data_train, target_train, 'catboost') + predict(est, data_test, target_test) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 0a3ca4e034efd..d1849a940d96c 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -97,6 +97,10 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist + + # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed + python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1" + python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps diff --git a/conftest.py b/conftest.py index 17c3f4b144346..2b9e87bf9f292 100644 --- a/conftest.py +++ b/conftest.py @@ -87,6 +87,11 @@ def pytest_collection_modifyitems(config, items): def pytest_configure(config): import sys sys._is_pytest_session = True + # declare our custom markers to avoid PytestUnknownMarkWarning + config.addinivalue_line( + "markers", + "network: mark a test for execution if network available." + ) def pytest_unconfigure(config): diff --git a/doc/about.rst b/doc/about.rst index 9926f62dcc824..a6cdd54eb9201 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -13,7 +13,7 @@ this project as part of his thesis. In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent Michel of INRIA took leadership of the project and made the first public release, February the 1st 2010. Since then, several releases have appeared -following a ~3 month cycle, and a thriving international community has +following a ~ 3-month cycle, and a thriving international community has been leading the development. Governance @@ -520,7 +520,7 @@ budget of the project [#f1]_. .. rubric:: Notes -.. [#f1] Regarding the organization budget in particular, we might use some of +.. [#f1] Regarding the organization budget, in particular, we might use some of the donated funds to pay for other project expenses such as DNS, hosting or continuous integration services. diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 00d0c9a240c60..99c59ec3392c6 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -181,12 +181,12 @@ Contributing code If in doubt about duplicated work, or if you want to work on a non-trivial feature, it's recommended to first open an issue in the `issue tracker `_ - to get some feedbacks from core developers. - - One easy way to find an issue to work on is by applying the "help wanted" - label in your search. This lists all the issues that have been unclaimed - so far. In order to claim an issue for yourself, please comment exactly - ``take`` on it for the CI to automatically assign the issue to you. + to get some feedbacks from core developers. + + One easy way to find an issue to work on is by applying the "help wanted" + label in your search. 
This lists all the issues that have been unclaimed + so far. In order to claim an issue for yourself, please comment exactly + ``take`` on it for the CI to automatically assign the issue to you. How to contribute ----------------- @@ -215,7 +215,7 @@ how to set up your git repository: 4. Install the development dependencies:: - $ pip install cython pytest pytest-cov flake8 + $ pip install cython pytest pytest-cov flake8 mypy 5. Install scikit-learn in editable mode:: @@ -224,6 +224,8 @@ how to set up your git repository: for more details about advanced installation, see the :ref:`install_bleeding_edge` section. +.. _upstream: + 6. Add the ``upstream`` remote. This saves a reference to the main scikit-learn repository, which you can use to keep your repository synchronized with the latest changes:: @@ -356,13 +358,17 @@ complies with the following rules before marking a PR as ``[MRG]``. The non-regression tests should fail for the code base in the master branch and pass for the PR code. -5. **Make sure that your PR does not add PEP8 violations**. On a Unix-like - system, you can run `make flake8-diff`. `flake8 path_to_file`, would work - for any system, but please avoid reformatting parts of the file that your - pull request doesn't change, as it distracts from code review. +5. **Make sure that your PR does not add PEP8 violations**. To check the + code that you changed, you can run the following command (see + :ref:`above ` to set up the upstream remote):: + + git diff upstream/master -u -- "*.py" | flake8 --diff + + or `make flake8-diff` which should work on unix-like system. 6. Follow the :ref:`coding-guidelines`. + 7. When applicable, use the validation tools and scripts in the ``sklearn.utils`` submodule. A list of utility routines available for developers can be found in the :ref:`developers-utils` page. @@ -408,6 +414,18 @@ You can check for common programming errors with the following tools: see also :ref:`testing_coverage` +* A moderate use of type annotations is encouraged but is not mandatory. See + [mypy quickstart](https://mypy.readthedocs.io/en/latest/getting_started.html) + for an introduction, as well as [pandas contributing documentation]( + https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#type-hints) + for style guidelines. Whether you add type annotation or not:: + + mypy --ignore-missing-import sklearn + + must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, + - when importing C or Cython modules + - on properties with decorators + Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing list or on the GitHub issue). @@ -662,7 +680,7 @@ In general have the following in mind: 4. 1D or 2D data can be a subset of ``{array-like, ndarray, sparse matrix, dataframe}``. Note that ``array-like`` can also be a ``list``, while ``ndarray`` is explicitly only a ``numpy.ndarray``. - 5. When specifying the data type of a list, use ``of`` as a delimiter: + 5. When specifying the data type of a list, use ``of`` as a delimiter: ``list of int``. 6. When specifying the dtype of an ndarray, use e.g. 
``dtype=np.int32`` after defining the shape: diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index 2a42bee301554..6fdf17ccc927f 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -289,6 +289,14 @@ submodule/subpackage of the public subpackage, e.g. ``sklearn/impute/_iterative.py``. This is needed so that pickles still work in the future when the features aren't experimental anymore +To avoid type checker (e.g. mypy) errors a direct import of experimenal +estimators should be done in the parent module, protected by the +``if typing.TYPE_CHECKING`` check. See `sklearn/ensemble/__init__.py +`_, +or `sklearn/impute/__init__.py +`_ +for an example. + Please also write basic tests following those in `test_enable_hist_gradient_boosting.py `_. diff --git a/doc/faq.rst b/doc/faq.rst index 883ac290a3f16..9f43656ef2262 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -97,7 +97,7 @@ What are the inclusion criteria for new algorithms ? ---------------------------------------------------- We only consider well-established algorithms for inclusion. A rule of thumb is -at least 3 years since publication, 200+ citations and wide use and +at least 3 years since publication, 200+ citations, and wide use and usefulness. A technique that provides a clear-cut improvement (e.g. an enhanced data structure or a more efficient approximation technique) on a widely-used method will also be considered for inclusion. @@ -123,7 +123,7 @@ Inclusion of a new algorithm speeding up an existing model is easier if: n_samples", - benchmarks clearly show a speed up. -Also note that your implementation need not be in scikit-learn to be used +Also, note that your implementation need not be in scikit-learn to be used together with scikit-learn tools. You can implement your favorite algorithm in a scikit-learn compatible way, upload it to GitHub and let us know. We will be happy to list it under :ref:`related_projects`. If you already have @@ -135,7 +135,7 @@ interested to look at `scikit-learn-contrib Why are you so selective on what algorithms you include in scikit-learn? ------------------------------------------------------------------------ -Code is maintenance cost, and we need to balance the amount of +Code comes with maintenance cost, and we need to balance the amount of code we have with the size of the team (and add to this the fact that complexity scales non linearly with the number of features). The package relies on core developers using their free time to @@ -250,7 +250,7 @@ Why do I sometime get a crash/freeze with n_jobs > 1 under OSX or Linux? Several scikit-learn tools such as ``GridSearchCV`` and ``cross_val_score`` rely internally on Python's `multiprocessing` module to parallelize execution -onto several Python processes by passing ``n_jobs > 1`` as argument. +onto several Python processes by passing ``n_jobs > 1`` as an argument. The problem is that Python ``multiprocessing`` does a ``fork`` system call without following it with an ``exec`` system call for performance reasons. Many diff --git a/doc/glossary.rst b/doc/glossary.rst index 70dd230c7ecd3..bfa675c50a21c 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -41,7 +41,7 @@ General Concepts contributor documentation `. The specific interfaces that constitute Scikit-learn's public API are - largely documented in :ref:`api_ref`. However we less formally consider + largely documented in :ref:`api_ref`. 
However, we less formally consider anything as public API if none of the identifiers required to access it begins with ``_``. We generally try to maintain :term:`backwards compatibility` for all objects in the public API. @@ -106,12 +106,12 @@ General Concepts are documented under an estimator's *Parameters* documentation. backwards compatibility - We generally try to maintain backwards compatibility (i.e. interfaces + We generally try to maintain backward compatibility (i.e. interfaces and behaviors may be extended but not changed or removed) from release to release but this comes with some exceptions: Public API only - The behaviour of objects accessed through private identifiers + The behavior of objects accessed through private identifiers (those beginning ``_``) may be changed arbitrarily between versions. As documented @@ -145,8 +145,8 @@ General Concepts assumed but not formally tested. Despite this informal contract with our users, the software is provided - as is, as stated in the licence. When a release inadvertently - introduces changes that are not backwards compatible, these are known + as is, as stated in the license. When a release inadvertently + introduces changes that are not backward compatible, these are known as software regressions. callable @@ -647,7 +647,7 @@ General Concepts first axis and a fixed, finite set of :term:`features` on the second is called rectangular. - This term excludes samples with non-vectorial structure, such as text, + This term excludes samples with non-vectorial structures, such as text, an image of arbitrary size, a time series of arbitrary length, a set of vectors, etc. The purpose of a :term:`vectorizer` is to produce rectangular forms of such data. @@ -684,7 +684,7 @@ General Concepts versions happen via a :ref:`SLEP ` and follows the decision-making process outlined in :ref:`governance`. For all votes, a proposal must have been made public and discussed before the - vote. Such proposal must be a consolidated document, in the form of a + vote. Such a proposal must be a consolidated document, in the form of a ‘Scikit-Learn Enhancement Proposal’ (SLEP), rather than a long discussion on an issue. A SLEP must be submitted as a pull-request to `enhancement proposals `_ using the @@ -881,12 +881,12 @@ Class APIs and Estimator Types In a meta-estimator's :term:`fit` method, any contained estimators should be :term:`cloned` before they are fit (although FIXME: Pipeline and FeatureUnion do not do this currently). An exception to this is - that an estimator may explicitly document that it accepts a prefitted + that an estimator may explicitly document that it accepts a pre-fitted estimator (e.g. using ``prefit=True`` in :class:`feature_selection.SelectFromModel`). One known issue with this - is that the prefitted estimator will lose its model if the + is that the pre-fitted estimator will lose its model if the meta-estimator is cloned. A meta-estimator should have ``fit`` called - before prediction, even if all contained estimators are prefitted. + before prediction, even if all contained estimators are pre-fitted. In cases where a meta-estimator's primary behaviors (e.g. :term:`predict` or :term:`transform` implementation) are functions of @@ -1008,7 +1008,7 @@ Target Types binary A classification problem consisting of two classes. A binary target - may represented as for a :term:`multiclass` problem but with only two + may be represented as for a :term:`multiclass` problem but with only two labels. 
A binary decision function is represented as a 1d array. Semantically, one class is often considered the "positive" class. @@ -1028,7 +1028,7 @@ Target Types continuous A regression problem where each sample's target is a finite floating - point number, represented as a 1-dimensional array of floats (or + point number represented as a 1-dimensional array of floats (or sometimes ints). :func:`~utils.multiclass.type_of_target` will return 'continuous' for @@ -1078,7 +1078,7 @@ Target Types A classification problem where each sample's target consists of ``n_outputs`` :term:`outputs`, each a class label, for a fixed int ``n_outputs > 1`` in a particular dataset. Each output has a - fixed set of available classes, and each sample is labelled with a + fixed set of available classes, and each sample is labeled with a class for each output. An output may be binary or multiclass, and in the case where all outputs are binary, the target is :term:`multilabel`. @@ -1213,10 +1213,10 @@ Methods and ``transform`` separately would be less efficient than together. :class:`base.TransformerMixin` provides a default implementation, providing a consistent interface across transformers where - ``fit_transform`` is or is not specialised. + ``fit_transform`` is or is not specialized. In :term:`inductive` learning -- where the goal is to learn a - generalised model that can be applied to new data -- users should be + generalized model that can be applied to new data -- users should be careful not to apply ``fit_transform`` to the entirety of a dataset (i.e. training and test data together) before further modelling, as this results in :term:`data leakage`. @@ -1225,7 +1225,7 @@ Methods Primarily for :term:`feature extractors`, but also used for other transformers to provide string names for each column in the output of the estimator's :term:`transform` method. It outputs a list of - strings, and may take a list of strings as input, corresponding + strings and may take a list of strings as input, corresponding to the names of input columns from which output column names can be generated. By default input features are named x0, x1, .... @@ -1250,7 +1250,7 @@ Methods ``partial_fit`` Facilitates fitting an estimator in an online fashion. Unlike ``fit``, repeatedly calling ``partial_fit`` does not clear the model, but - updates it with respect to the data provided. The portion of data + updates it with the data provided. The portion of data provided to ``partial_fit`` may be called a mini-batch. Each mini-batch must be of consistent shape, etc. In iterative estimators, ``partial_fit`` often only performs a single iteration. @@ -1322,7 +1322,7 @@ Methods to facilitate numerical stability. ``predict_proba`` - A method in :term:`classifiers` and :term:`clusterers` that are able to + A method in :term:`classifiers` and :term:`clusterers` that can return probability estimates for each class/cluster. Its input is usually only some observed data, :term:`X`. @@ -1381,7 +1381,7 @@ Methods In a :term:`transformer`, transforms the input, usually only :term:`X`, into some transformed space (conventionally notated as :term:`Xt`). Output is an array or sparse matrix of length :term:`n_samples` and - with number of columns fixed after :term:`fitting`. + with the number of columns fixed after :term:`fitting`. If the estimator was not already :term:`fitted`, calling this method should raise a :class:`exceptions.NotFittedError`. @@ -1405,8 +1405,8 @@ functions or non-estimator constructors. 
:term:`multioutput` (including :term:`multilabel`) tasks, the weights are multiplied across outputs (i.e. columns of ``y``). - By default all samples have equal weight such that classes are - effectively weighted by their their prevalence in the training data. + By default, all samples have equal weight such that classes are + effectively weighted by their prevalence in the training data. This could be achieved explicitly with ``class_weight={label1: 1, label2: 1, ...}`` for all class labels. @@ -1541,16 +1541,20 @@ functions or non-estimator constructors. mean that randomization is always used, as it may be dependent on another parameter, e.g. ``shuffle``, being set. - ``random_state``'s value may be: + The passed value will have an effect on the reproducibility of the + results returned by the function (:term:`fit`, :term:`split`, or any + other function like :func:`~sklearn.cluster.k_means`). `random_state`'s + value may be: None (default) - Use the global random state from :mod:`numpy.random`. + Use the global random state instance from :mod:`numpy.random`. + Calling the function multiple times will reuse + the same instance, and will produce different results. An integer Use a new random number generator seeded by the given integer. - To make a randomized algorithm deterministic (i.e. running it - multiple times will produce the same result), an arbitrary - integer ``random_state`` can be used. However, it may be + Using an int will produce the same results across different calls. + However, it may be worthwhile checking that your results are stable across a number of different distinct random seeds. Popular integer random seeds are 0 and `42 @@ -1558,9 +1562,9 @@ functions or non-estimator constructors. A :class:`numpy.random.RandomState` instance Use the provided random state, only affecting other users - of the same random state instance. Calling fit multiple times - will reuse the same instance, and will produce different - results. + of that same random state instance. Calling the function + multiple times will reuse the same instance, and + will produce different results. :func:`utils.check_random_state` is used internally to validate the input ``random_state`` and return a :class:`~numpy.random.RandomState` @@ -1577,10 +1581,11 @@ functions or non-estimator constructors. in the User Guide. Where multiple metrics can be evaluated, ``scoring`` may be given - either as a list of unique strings or a dict with names as keys and - callables as values. Note that this does *not* specify which score - function is to be maximised, and another parameter such as ``refit`` - may be used for this purpose. + either as a list of unique strings or a dictionary with names as keys + and callables as values. Note that this does *not* specify which score + function is to be maximized, and another parameter such as ``refit`` + maybe used for this purpose. + The ``scoring`` parameter is validated and interpreted using :func:`metrics.check_scoring`. @@ -1600,9 +1605,9 @@ functions or non-estimator constructors. When fitting an estimator repeatedly on the same dataset, but for multiple parameter values (such as to find the value maximizing performance as in :ref:`grid search `), it may be possible - to reuse aspects of the model learnt from the previous parameter value, + to reuse aspects of the model learned from the previous parameter value, saving time. 
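The ``random_state`` entry revised above distinguishes between passing ``None``, an integer, and a ``numpy.random.RandomState`` instance. As a minimal sketch of that difference (the toy arrays and the seed value are illustrative only, not part of this changeset)::

    import numpy as np
    from sklearn.model_selection import train_test_split

    X, y = np.arange(10).reshape((5, 2)), range(5)

    # An int seed produces the same split on every call.
    train_test_split(X, y, random_state=0)
    train_test_split(X, y, random_state=0)      # identical split

    # A RandomState instance is shared and consumed across calls:
    # repeated calls reuse the same instance and generally give
    # different splits.
    rng = np.random.RandomState(0)
    train_test_split(X, y, random_state=rng)
    train_test_split(X, y, random_state=rng)    # typically a different split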
When ``warm_start`` is true, the existing :term:`fitted` - model :term:`attributes` are used to initialise the new model + model :term:`attributes` are used to initialize the new model in a subsequent call to :term:`fit`. Note that this is only applicable for some models and some @@ -1697,8 +1702,8 @@ See concept :term:`sample property`. .. glossary:: ``groups`` - Used in cross validation routines to identify samples which are - correlated. Each value is an identifier such that, in a supporting + Used in cross-validation routines to identify samples that are correlated. + Each value is an identifier such that, in a supporting :term:`CV splitter`, samples from some ``groups`` value may not appear in both a training set and its corresponding test set. See :ref:`group_cv`. diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c138f51f6c06f..3d9924638b69b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1346,6 +1346,9 @@ Model validation :no-members: :no-inherited-members: +**User guide:** See the :ref:`combining_estimators` section for further +details. + .. currentmodule:: sklearn .. autosummary:: diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 8145cdef984bb..51a933dcbee47 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -383,11 +383,6 @@ and ignored by setting to ``'drop'``:: ColumnTransformer for heterogeneous data ======================================== -.. warning:: - - The :class:`compose.ColumnTransformer ` - class is experimental and the API is subject to change. - Many datasets contain features of different types, say text, floats, and dates, where each type of feature requires separate preprocessing or feature extraction steps. Often it is easiest to preprocess data before applying diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index b7c0e49f9c477..1416b9d3a6045 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -897,7 +897,7 @@ based on permutation of the features. Histogram-Based Gradient Boosting ================================= -Scikit-learn 0.21 introduces two new experimental implementations of +Scikit-learn 0.21 introduced two new experimental implementations of gradient boosting trees, namely :class:`HistGradientBoostingClassifier` and :class:`HistGradientBoostingRegressor`, inspired by `LightGBM `__ (See [LightGBM]_). @@ -1050,6 +1050,51 @@ multiplying the gradients (and the hessians) by the sample weights. Note that the binning stage (specifically the quantiles computation) does not take the weights into account. +.. _monotonic_cst_gbdt: + +Monotonic Constraints +--------------------- + +Depending on the problem at hand, you may have prior knowledge indicating +that a given feature should in general have a positive (or negative) effect +on the target value. For example, all else being equal, a higher credit +score should increase the probability of getting approved for a loan. +Monotonic constraints allow you to incorporate such prior knowledge into the +model. + +A positive monotonic constraint is a constraint of the form: + +:math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2)`, +where :math:`F` is the predictor with two features. + +Similarly, a negative monotonic constraint is of the form: + +:math:`x_1 \leq x_1' \implies F(x_1, x_2) \geq F(x_1', x_2)`. + +Note that monotonic constraints only constraint the output "all else being +equal". 
Indeed, the following relation **is not enforced** by a positive +constraint: :math:`x_1 \leq x_1' \implies F(x_1, x_2) \leq F(x_1', x_2')`. + +You can specify a monotonic constraint on each feature using the +`monotonic_cst` parameter. For each feature, a value of 0 indicates no +constraint, while -1 and 1 indicate a negative and positive constraint, +respectively:: + + >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa + >>> from sklearn.ensemble import HistGradientBoostingRegressor + + ... # positive, negative, and no constraint on the 3 features + >>> gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0]) + +In a binary classification context, imposing a monotonic constraint means +that the feature is supposed to have a positive / negative effect on the +probability to belong to the positive class. Monotonic constraints are not +supported for multiclass context. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_ensemble_plot_monotonic_constraints.py` + Low-level parallelism --------------------- diff --git a/doc/modules/feature_selection.rst b/doc/modules/feature_selection.rst index 6a319cfb94336..2683a35eaf72b 100644 --- a/doc/modules/feature_selection.rst +++ b/doc/modules/feature_selection.rst @@ -152,7 +152,8 @@ The features are considered unimportant and removed, if the corresponding ``threshold`` parameter. Apart from specifying the threshold numerically, there are built-in heuristics for finding a threshold using a string argument. Available heuristics are "mean", "median" and float multiples of these like -"0.1*mean". +"0.1*mean". In combination with the `threshold` criteria, one can use the +`max_features` parameter to set a limit on the number of features to select. For examples on how it is to be used refer to the sections below. diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 457ec6c630b99..b2dd4cf5a7cd3 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -16,7 +16,7 @@ vector :math:`x_1` through :math:`x_n`, : .. math:: - P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots x_n \mid y)} + P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots, x_n \mid y)} {P(x_1, \dots, x_n)} Using the naive conditional independence assumption that diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 706a9ff559aa8..23dc7fbf67b65 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -4,6 +4,9 @@ Support Vector Machines ======================= +.. TODO: Describe tol parameter +.. TODO: Describe max_iter parameter + .. currentmodule:: sklearn.svm **Support vector machines (SVMs)** are a set of supervised learning @@ -49,7 +52,7 @@ Classification ============== :class:`SVC`, :class:`NuSVC` and :class:`LinearSVC` are classes -capable of performing multi-class classification on a dataset. +capable of performing binary and multi-class classification on a dataset. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_iris_svc_001.png @@ -60,16 +63,16 @@ capable of performing multi-class classification on a dataset. :class:`SVC` and :class:`NuSVC` are similar methods, but accept slightly different sets of parameters and have different mathematical formulations (see section :ref:`svm_mathematical_formulation`). On the -other hand, :class:`LinearSVC` is another implementation of Support +other hand, :class:`LinearSVC` is another (faster) implementation of Support Vector Classification for the case of a linear kernel. 
Note that -:class:`LinearSVC` does not accept keyword ``kernel``, as this is -assumed to be linear. It also lacks some of the members of +:class:`LinearSVC` does not accept parameter ``kernel``, as this is +assumed to be linear. It also lacks some of the attributes of :class:`SVC` and :class:`NuSVC`, like ``support_``. As other classifiers, :class:`SVC`, :class:`NuSVC` and -:class:`LinearSVC` take as input two arrays: an array X of size ``[n_samples, -n_features]`` holding the training samples, and an array y of class labels -(strings or integers), size ``[n_samples]``:: +:class:`LinearSVC` take as input two arrays: an array `X` of shape +`(n_samples, n_features)` holding the training samples, and an array `y` of +class labels (strings or integers), of shape `(n_samples)`:: >>> from sklearn import svm @@ -84,10 +87,10 @@ After being fitted, the model can then be used to predict new values:: >>> clf.predict([[2., 2.]]) array([1]) -SVMs decision function depends on some subset of the training data, -called the support vectors. Some properties of these support vectors -can be found in members ``support_vectors_``, ``support_`` and -``n_support``:: +SVMs decision function (detailed in the :ref:`svm_mathematical_formulation`) +depends on some subset of the training data, called the support vectors. Some +properties of these support vectors can be found in attributes +``support_vectors_``, ``support_`` and ``n_support``:: >>> # get support vectors >>> clf.support_vectors_ @@ -100,19 +103,25 @@ can be found in members ``support_vectors_``, ``support_`` and >>> clf.n_support_ array([1, 1]...) +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py`, + * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` + * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`, + .. _svm_multi_class: Multi-class classification -------------------------- -:class:`SVC` and :class:`NuSVC` implement the "one-against-one" -approach (Knerr et al., 1990) for multi- class classification. If -``n_class`` is the number of classes, then ``n_class * (n_class - 1) / 2`` +:class:`SVC` and :class:`NuSVC` implement the "one-versus-one" +approach for multi-class classification. In total, +``n_classes * (n_classes - 1) / 2`` classifiers are constructed and each one trains data from two classes. To provide a consistent interface with other classifiers, the -``decision_function_shape`` option allows to monotically transform the results of the -"one-against-one" classifiers to a decision function of shape ``(n_samples, -n_classes)``. +``decision_function_shape`` option allows to monotonically transform the +results of the "one-versus-one" classifiers to a "one-vs-rest" decision +function of shape ``(n_samples, n_classes)``. >>> X = [[0], [1], [2], [3]] >>> Y = [0, 1, 2, 3] @@ -128,8 +137,7 @@ n_classes)``. 4 On the other hand, :class:`LinearSVC` implements "one-vs-the-rest" -multi-class strategy, thus training n_class models. If there are only -two classes, only one model is trained:: +multi-class strategy, thus training `n_classes` models. >>> lin_clf = svm.LinearSVC() >>> lin_clf.fit(X, Y) @@ -142,39 +150,37 @@ See :ref:`svm_mathematical_formulation` for a complete description of the decision function. Note that the :class:`LinearSVC` also implements an alternative multi-class -strategy, the so-called multi-class SVM formulated by Crammer and Singer, by -using the option ``multi_class='crammer_singer'``. This method is consistent, -which is not true for one-vs-rest classification. 
-In practice, one-vs-rest classification is usually preferred, since the results -are mostly similar, but the runtime is significantly less. +strategy, the so-called multi-class SVM formulated by Crammer and Singer +[#8]_, by using the option ``multi_class='crammer_singer'``. In practice, +one-vs-rest classification is usually preferred, since the results are mostly +similar, but the runtime is significantly less. For "one-vs-rest" :class:`LinearSVC` the attributes ``coef_`` and ``intercept_`` -have the shape ``[n_class, n_features]`` and ``[n_class]`` respectively. -Each row of the coefficients corresponds to one of the ``n_class`` many +have the shape ``(n_classes, n_features)`` and ``(n_classes,)`` respectively. +Each row of the coefficients corresponds to one of the ``n_classes`` "one-vs-rest" classifiers and similar for the intercepts, in the order of the "one" class. -In the case of "one-vs-one" :class:`SVC`, the layout of the attributes -is a little more involved. In the case of having a linear kernel, the -attributes ``coef_`` and ``intercept_`` have the shape -``[n_class * (n_class - 1) / 2, n_features]`` and -``[n_class * (n_class - 1) / 2]`` respectively. This is similar to the -layout for :class:`LinearSVC` described above, with each row now corresponding +In the case of "one-vs-one" :class:`SVC` and :class:`NuSVC`, the layout of +the attributes is a little more involved. In the case of a linear +kernel, the attributes ``coef_`` and ``intercept_`` have the shape +``(n_classes * (n_classes - 1) / 2, n_features)`` and ``(n_classes * +(n_classes - 1) / 2)`` respectively. This is similar to the layout for +:class:`LinearSVC` described above, with each row now corresponding to a binary classifier. The order for classes 0 to n is "0 vs 1", "0 vs 2" , ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", . . . "n-1 vs n". -The shape of ``dual_coef_`` is ``[n_class-1, n_SV]`` with +The shape of ``dual_coef_`` is ``(n_classes-1, n_SV)`` with a somewhat hard to grasp layout. The columns correspond to the support vectors involved in any -of the ``n_class * (n_class - 1) / 2`` "one-vs-one" classifiers. -Each of the support vectors is used in ``n_class - 1`` classifiers. -The ``n_class - 1`` entries in each row correspond to the dual coefficients +of the ``n_classes * (n_classes - 1) / 2`` "one-vs-one" classifiers. +Each of the support vectors is used in ``n_classes - 1`` classifiers. +The ``n_classes - 1`` entries in each row correspond to the dual coefficients for these classifiers. -This might be made more clear by an example: - -Consider a three class problem with class 0 having three support vectors +This might be clearer with an example: consider a three class problem with +class 0 having three support vectors :math:`v^{0}_0, v^{1}_0, v^{2}_0` and class 1 and 2 having two support vectors :math:`v^{0}_1, v^{1}_1` and :math:`v^{0}_2, v^{1}_2` respectively. For each support vector :math:`v^{j}_i`, there are two dual coefficients. Let's call @@ -198,6 +204,9 @@ Then ``dual_coef_`` looks like this: |:math:`\alpha^{1}_{2,0}`|:math:`\alpha^{1}_{2,1}`| | +------------------------+------------------------+------------------+ +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py`, .. _scores_probabilities: @@ -209,18 +218,29 @@ per-class scores for each sample (or a single score per sample in the binary case). 
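As a rough illustration of the shapes returned by the per-class scores described above (the synthetic dataset and parameter values below are illustrative assumptions only; the ``probability`` option is discussed in the next paragraph)::

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=100, n_classes=3, n_informative=4,
                               random_state=0)
    clf = SVC(decision_function_shape='ovr', probability=True,
              random_state=0).fit(X, y)

    clf.decision_function(X[:2]).shape   # (2, 3): one score per class
    clf.predict_proba(X[:2]).shape       # (2, 3): see the next paragraph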
When the constructor option ``probability`` is set to ``True``, class membership probability estimates (from the methods ``predict_proba`` and ``predict_log_proba``) are enabled. In the binary case, the probabilities are -calibrated using Platt scaling: logistic regression on the SVM's scores, +calibrated using Platt scaling [#1]_: logistic regression on the SVM's scores, fit by an additional cross-validation on the training data. -In the multiclass case, this is extended as per Wu et al. (2004). +In the multiclass case, this is extended as per [#2]_. + +.. note:: -Needless to say, the cross-validation involved in Platt scaling + The same probability calibration procedure is available for all estimators + via the :class:`~sklearn.calibration.CalibratedClassifierCV` (see + :ref:`calibration`). In the case of :class:`SVC` and :class:`NuSVC`, this + procedure is builtin in `libsvm`_ which is used under the hood, so it does + not rely on scikit-learn's + :class:`~sklearn.calibration.CalibratedClassifierCV`. + +The cross-validation involved in Platt scaling is an expensive operation for large datasets. -In addition, the probability estimates may be inconsistent with the scores, -in the sense that the "argmax" of the scores -may not be the argmax of the probabilities. -(E.g., in binary classification, -a sample may be labeled by ``predict`` as belonging to a class -that has probability <½ according to ``predict_proba``.) +In addition, the probability estimates may be inconsistent with the scores: + +- the "argmax" of the scores may not be the argmax of the probabilities +- in binary classification, a sample may be labeled by ``predict`` as + belonging to the positive class even if the output of `predict_proba` is + less than 0.5; and similarly, it could be labeled as negative even if the + output of `predict_proba` is more than 0.5. + Platt's method is also known to have theoretical issues. If confidence scores are required, but these do not have to be probabilities, then it is advisable to set ``probability=False`` @@ -231,35 +251,23 @@ unlike ``decision_function``, the ``predict`` method does not try to break ties by default. You can set ``break_ties=True`` for the output of ``predict`` to be the same as ``np.argmax(clf.decision_function(...), axis=1)``, otherwise the first class among the tied classes will always be returned; but have in mind -that it comes with a computational cost. - -.. figure:: ../auto_examples/svm/images/sphx_glr_plot_svm_tie_breaking_001.png - :target: ../auto_examples/svm/plot_svm_tie_breaking.html - :align: center - -.. topic:: References: - - * Wu, Lin and Weng, - `"Probability estimates for multi-class classification by pairwise coupling" - `_, - JMLR 5:975-1005, 2004. - - - * Platt - `"Probabilistic outputs for SVMs and comparisons to regularized likelihood methods" - `_. +that it comes with a computational cost. See +:ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an example on +tie breaking. Unbalanced problems -------------------- In problems where it is desired to give more importance to certain -classes or certain individual samples keywords ``class_weight`` and +classes or certain individual samples, the parameters ``class_weight`` and ``sample_weight`` can be used. -:class:`SVC` (but not :class:`NuSVC`) implement a keyword +:class:`SVC` (but not :class:`NuSVC`) implements the parameter ``class_weight`` in the ``fit`` method. 
It's a dictionary of the form ``{class_label : value}``, where value is a floating point number > 0 that sets the parameter ``C`` of class ``class_label`` to ``C * value``. +The figure below illustrates the decision boundary of an unbalanced problem, +with and without weight correction. .. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_unbalanced_001.png :target: ../auto_examples/svm/plot_separating_hyperplane_unbalanced.html @@ -269,24 +277,21 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``. :class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`, :class:`LinearSVC`, :class:`LinearSVR` and :class:`OneClassSVM` implement also weights for -individual samples in method ``fit`` through keyword ``sample_weight``. Similar -to ``class_weight``, these set the parameter ``C`` for the i-th example to -``C * sample_weight[i]``. - +individual samples in the `fit` method through the ``sample_weight`` parameter. +Similar to ``class_weight``, this sets the parameter ``C`` for the i-th +example to ``C * sample_weight[i]``, which will encourage the classifier to +get these samples right. The figure below illustrates the effect of sample +weighting on the decision boundary. The size of the circles is proportional +to the sample weights: .. figure:: ../auto_examples/svm/images/sphx_glr_plot_weighted_samples_001.png :target: ../auto_examples/svm/plot_weighted_samples.html :align: center :scale: 75 - .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_svm_plot_iris_svc.py`, - * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane.py`, * :ref:`sphx_glr_auto_examples_svm_plot_separating_hyperplane_unbalanced.py` - * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py`, - * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` * :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py`, @@ -303,13 +308,13 @@ above) depends only on a subset of the training data, because the cost function for building the model does not care about training points that lie beyond the margin. Analogously, the model produced by Support Vector Regression depends only on a subset of the training data, -because the cost function for building the model ignores any training -data close to the model prediction. +because the cost function ignores samples whose prediction is close to their +target. There are three different implementations of Support Vector Regression: :class:`SVR`, :class:`NuSVR` and :class:`LinearSVR`. :class:`LinearSVR` provides a faster implementation than :class:`SVR` but only considers -linear kernels, while :class:`NuSVR` implements a slightly different +the linear kernel, while :class:`NuSVR` implements a slightly different formulation than :class:`SVR` and :class:`LinearSVR`. See :ref:`svm_implementation_details` for further details. @@ -331,8 +336,6 @@ floating point values instead of integer values:: * :ref:`sphx_glr_auto_examples_svm_plot_svm_regression.py` - - .. _svm_outlier_detection: Density estimation, novelty detection @@ -350,14 +353,14 @@ Support Vector Machines are powerful tools, but their compute and storage requirements increase rapidly with the number of training vectors. The core of an SVM is a quadratic programming problem (QP), separating support vectors from the rest of the training data. 
The QP -solver used by this `libsvm`_-based implementation scales between +solver used by the `libsvm`_-based implementation scales between :math:`O(n_{features} \times n_{samples}^2)` and :math:`O(n_{features} \times n_{samples}^3)` depending on how efficiently the `libsvm`_ cache is used in practice (dataset dependent). If the data is very sparse :math:`n_{features}` should be replaced by the average number of non-zero features in a sample vector. -Also note that for the linear case, the algorithm used in +For the linear case, the algorithm used in :class:`LinearSVC` by the `liblinear`_ implementation is much more efficient than its `libsvm`_-based :class:`SVC` counterpart and can scale almost linearly to millions of samples and/or features. @@ -369,16 +372,16 @@ Tips on Practical Use * **Avoiding data copy**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and :class:`NuSVR`, if the data passed to certain methods is not C-ordered - contiguous, and double precision, it will be copied before calling the + contiguous and double precision, it will be copied before calling the underlying C implementation. You can check whether a given numpy array is C-contiguous by inspecting its ``flags`` attribute. For :class:`LinearSVC` (and :class:`LogisticRegression `) any input passed as a numpy - array will be copied and converted to the liblinear internal sparse data + array will be copied and converted to the `liblinear`_ internal sparse data representation (double precision floats and int32 indices of non-zero components). If you want to fit a large-scale linear classifier without - copying a dense numpy C-contiguous double precision array as input we + copying a dense numpy C-contiguous double precision array as input, we suggest to use the :class:`SGDClassifier ` class instead. The objective function can be configured to be almost the same as the :class:`LinearSVC` @@ -390,26 +393,44 @@ Tips on Practical Use recommended to set ``cache_size`` to a higher value than the default of 200(MB), such as 500(MB) or 1000(MB). + * **Setting C**: ``C`` is ``1`` by default and it's a reasonable default - choice. If you have a lot of noisy observations you should decrease it. - It corresponds to regularize more the estimation. + choice. If you have a lot of noisy observations you should decrease it: + decreasing C corresponds to more regularization. :class:`LinearSVC` and :class:`LinearSVR` are less sensitive to ``C`` when it becomes large, and prediction results stop improving after a certain threshold. Meanwhile, larger ``C`` values will take more time to train, - sometimes up to 10 times longer, as shown by Fan et al. (2008) + sometimes up to 10 times longer, as shown in [#3]_. * Support Vector Machine algorithms are not scale invariant, so **it is highly recommended to scale your data**. For example, scale each attribute on the input vector X to [0,1] or [-1,+1], or standardize it to have mean 0 and variance 1. Note that the *same* scaling must be - applied to the test vector to obtain meaningful results. See section - :ref:`preprocessing` for more details on scaling and normalization. + applied to the test vector to obtain meaningful results. This can be done + easily by using a :class:`~sklearn.pipeline.Pipeline`:: + + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler + >>> from sklearn.svm import SVC + + >>> clf = make_pipeline(StandardScaler(), SVC()) + + See section :ref:`preprocessing` for more details on scaling and + normalization. + + .. 
_shrinking_svm: + + * Regarding the `shrinking` parameter, quoting [#4]_: *We found that if the + number of iterations is large, then shrinking can shorten the training + time. However, if we loosely solve the optimization problem (e.g., by + using a large stopping tolerance), the code without using shrinking may + be much faster* * Parameter ``nu`` in :class:`NuSVC`/:class:`OneClassSVM`/:class:`NuSVR` approximates the fraction of training errors and support vectors. - * In :class:`SVC`, if data for classification are unbalanced (e.g. many + * In :class:`SVC`, if the data is unbalanced (e.g. many positive and few negative), set ``class_weight='balanced'`` and/or try different penalty parameters ``C``. @@ -425,9 +446,9 @@ Tips on Practical Use The underlying :class:`LinearSVC` implementation uses a random number generator to select features when fitting the model with a dual coordinate - descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon, + descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon to have slightly different results for the same input data. If that - happens, try with a smaller tol parameter. This randomness can also be + happens, try with a smaller `tol` parameter. This randomness can also be controlled with the ``random_state`` parameter. When ``dual`` is set to ``False`` the underlying implementation of :class:`LinearSVC` is not random and ``random_state`` has no effect on the results. @@ -435,18 +456,11 @@ Tips on Practical Use * Using L1 penalization as provided by ``LinearSVC(loss='l2', penalty='l1', dual=False)`` yields a sparse solution, i.e. only a subset of feature weights is different from zero and contribute to the decision function. - Increasing ``C`` yields a more complex model (more feature are selected). + Increasing ``C`` yields a more complex model (more features are selected). The ``C`` value that yields a "null" model (all weights equal to zero) can be calculated using :func:`l1_min_c`. -.. topic:: References: - - * Fan, Rong-En, et al., - `"LIBLINEAR: A library for large linear classification." - `_, - Journal of machine learning research 9.Aug (2008): 1871-1874. - .. _svm_kernels: Kernel functions @@ -457,15 +471,15 @@ The *kernel function* can be any of the following: * linear: :math:`\langle x, x'\rangle`. * polynomial: :math:`(\gamma \langle x, x'\rangle + r)^d`, where - :math:`d` is specified by keyword ``degree``, :math:`r` by ``coef0``. + :math:`d` is specified by parameter ``degree``, :math:`r` by ``coef0``. * rbf: :math:`\exp(-\gamma \|x-x'\|^2)`, where :math:`\gamma` is - specified by keyword ``gamma``, must be greater than 0. + specified by parameter ``gamma``, must be greater than 0. * sigmoid :math:`\tanh(\gamma \langle x,x'\rangle + r)`, where :math:`r` is specified by ``coef0``. -Different kernels are specified by keyword kernel at initialization:: +Different kernels are specified by the `kernel` parameter:: >>> linear_svc = svm.SVC(kernel='linear') >>> linear_svc.kernel @@ -474,6 +488,26 @@ Different kernels are specified by keyword kernel at initialization:: >>> rbf_svc.kernel 'rbf' +Parameters of the RBF Kernel +---------------------------- + +When training an SVM with the *Radial Basis Function* (RBF) kernel, two +parameters must be considered: ``C`` and ``gamma``. The parameter ``C``, +common to all SVM kernels, trades off misclassification of training examples +against simplicity of the decision surface. 
A low ``C`` makes the decision +surface smooth, while a high ``C`` aims at classifying all training examples +correctly. ``gamma`` defines how much influence a single training example has. +The larger ``gamma`` is, the closer other examples must be to be affected. + +Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One +is advised to use :class:`sklearn.model_selection.GridSearchCV` with +``C`` and ``gamma`` spaced exponentially far apart to choose good values. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` + * :ref:`sphx_glr_auto_examples_svm_plot_svm_nonlinear.py` + Custom Kernels -------------- @@ -495,8 +529,8 @@ classifiers, except that: Using Python functions as kernels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can also use your own defined kernels by passing a function to the -keyword ``kernel`` in the constructor. +You can use your own defined kernels by passing a function to the +``kernel`` parameter. Your kernel must take as arguments two matrices of shape ``(n_samples_1, n_features)``, ``(n_samples_2, n_features)`` @@ -519,77 +553,81 @@ instance that will use that kernel:: Using the Gram matrix ~~~~~~~~~~~~~~~~~~~~~ -Set ``kernel='precomputed'`` and pass the Gram matrix instead of X in the fit -method. At the moment, the kernel values between *all* training vectors and the -test vectors must be provided. +You can pass pre-computed kernels by using the ``kernel='precomputed'`` +option. You should then pass Gram matrix instead of X to the `fit` and +`predict` methods. The kernel values between *all* training vectors and the +test vectors must be provided: >>> import numpy as np + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split >>> from sklearn import svm - >>> X = np.array([[0, 0], [1, 1]]) - >>> y = [0, 1] + >>> X, y = make_classification(n_samples=10, random_state=0) + >>> X_train , X_test , y_train, y_test = train_test_split(X, y, random_state=0) >>> clf = svm.SVC(kernel='precomputed') >>> # linear kernel computation - >>> gram = np.dot(X, X.T) - >>> clf.fit(gram, y) + >>> gram_train = np.dot(X_train, X_train.T) + >>> clf.fit(gram_train, y_train) SVC(kernel='precomputed') >>> # predict on training examples - >>> clf.predict(gram) - array([0, 1]) - -Parameters of the RBF Kernel -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When training an SVM with the *Radial Basis Function* (RBF) kernel, two -parameters must be considered: ``C`` and ``gamma``. The parameter ``C``, -common to all SVM kernels, trades off misclassification of training examples -against simplicity of the decision surface. A low ``C`` makes the decision -surface smooth, while a high ``C`` aims at classifying all training examples -correctly. ``gamma`` defines how much influence a single training example has. -The larger ``gamma`` is, the closer other examples must be to be affected. + >>> gram_test = np.dot(X_test, X_train.T) + >>> clf.predict(gram_test) + array([0, 1, 0]) -Proper choice of ``C`` and ``gamma`` is critical to the SVM's performance. One -is advised to use :class:`sklearn.model_selection.GridSearchCV` with -``C`` and ``gamma`` spaced exponentially far apart to choose good values. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_svm_plot_rbf_parameters.py` .. 
_svm_mathematical_formulation: Mathematical formulation ======================== -A support vector machine constructs a hyper-plane or set of hyper-planes -in a high or infinite dimensional space, which can be used for +A support vector machine constructs a hyper-plane or set of hyper-planes in a +high or infinite dimensional space, which can be used for classification, regression or other tasks. Intuitively, a good separation is achieved by the hyper-plane that has the largest distance to the nearest training data points of any class (so-called functional margin), since in general the larger the margin the lower the -generalization error of the classifier. - +generalization error of the classifier. The figure below shows the decision +function for a linearly separable problem, with three samples on the +margin boundaries, called "support vectors": .. figure:: ../auto_examples/svm/images/sphx_glr_plot_separating_hyperplane_001.png :align: center :scale: 75 +In general, when the problem isn't linearly separable, the support vectors +are the samples *within* the margin boundaries. + +We recommend [#5]_ and [#6]_ as good references for the theory and +practicalities of SVMs. + SVC --- Given training vectors :math:`x_i \in \mathbb{R}^p`, i=1,..., n, in two classes, and a -vector :math:`y \in \{1, -1\}^n`, SVC solves the following primal problem: +vector :math:`y \in \{1, -1\}^n`, our goal is to find :math:`w \in +\mathbb{R}^p` and :math:`b \in \mathbb{R}` such that the prediction given by +:math:`\text{sign} (w^T\phi(x) + b)` is correct for most samples. +SVC solves the following primal problem: .. math:: \min_ {w, b, \zeta} \frac{1}{2} w^T w + C \sum_{i=1}^{n} \zeta_i - - \textrm {subject to } & y_i (w^T \phi (x_i) + b) \geq 1 - \zeta_i,\\ & \zeta_i \geq 0, i=1, ..., n -Its dual is +Intuitively, we're trying to maximize the margin (by minimizing +:math:`||w||^2 = w^Tw`), while incurring a penalty when a sample is +misclassified or within the margin boundary. Ideally, the value :math:`y_i +(w^T \phi (x_i) + b)` would be :math:`\geq 1` for all samples, which +indicates a perfect prediction. But problems are usually not always perfectly +separable with a hyperplane, so we allow some samples to be at a distance :math:`\zeta_i` from +their correct margin boundary. The penalty term `C` controls the strengh of +this penalty, and as a result, acts as an inverse regularization parameter +(see note below). + +The dual problem to the primal is .. math:: @@ -599,16 +637,29 @@ Its dual is \textrm {subject to } & y^T \alpha = 0\\ & 0 \leq \alpha_i \leq C, i=1, ..., n -where :math:`e` is the vector of all ones, :math:`C > 0` is the upper bound, -:math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, +where :math:`e` is the vector of all ones, +and :math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, :math:`Q_{ij} \equiv y_i y_j K(x_i, x_j)`, where :math:`K(x_i, x_j) = \phi (x_i)^T \phi (x_j)` -is the kernel. Here training vectors are implicitly mapped into a higher -(maybe infinite) dimensional space by the function :math:`\phi`. +is the kernel. The terms :math:`\alpha_i` are called the dual coefficients, +and they are upper-bounded by :math:`C`. +This dual representation highlights the fact that training vectors are +implicitly mapped into a higher (maybe infinite) +dimensional space by the function :math:`\phi`: see `kernel trick +`_. 
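As an illustrative aside rather than part of the patch, the box constraint :math:`0 \leq \alpha_i \leq C` can be checked directly on a fitted estimator, since ``dual_coef_`` stores the products :math:`y_i \alpha_i` for the support vectors only (samples with :math:`\alpha_i = 0` are dropped). A minimal sketch, assuming any small binary classification problem::

    >>> import numpy as np
    >>> from sklearn.datasets import make_classification
    >>> from sklearn.svm import SVC
    >>> X, y = make_classification(n_samples=50, random_state=0)
    >>> clf = SVC(kernel='rbf', C=0.5).fit(X, y)
    >>> bool(np.all(np.abs(clf.dual_coef_) <= 0.5))
    True

This is also why the decision function described next only needs a sum over the support vectors.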
+Once the optimization problem is solved, the output of +:term:`decision_function` for a given sample :math:`x` becomes: -The decision function is: +.. math:: \sum_{i\in SV} y_i \alpha_i K(x_i, x) + b, -.. math:: \operatorname{sgn}(\sum_{i=1}^n y_i \alpha_i K(x_i, x) + \rho) +and the predicted class correspond to its sign. We only need to sum over the +support vectors (i.e. the samples that lie within the margin) because the +dual coefficients :math:`\alpha_i` are zero for the other samples. + +These parameters can be accessed through the attributes ``dual_coef_`` +which holds the product :math:`y_i \alpha_i`, ``support_vectors_`` which +holds the support vectors, and ``intercept_`` which holds the independent +term :math:`b` .. note:: @@ -619,37 +670,37 @@ The decision function is: estimator used is :class:`sklearn.linear_model.Ridge ` regression, the relation between them is given as :math:`C = \frac{1}{alpha}`. -.. TODO multiclass case ?/ +LinearSVC +--------- -This parameters can be accessed through the members ``dual_coef_`` -which holds the product :math:`y_i \alpha_i`, ``support_vectors_`` which -holds the support vectors, and ``intercept_`` which holds the independent -term :math:`\rho` : - -.. topic:: References: - - * `"Automatic Capacity Tuning of Very Large VC-dimension Classifiers" - `_, - I. Guyon, B. Boser, V. Vapnik - Advances in neural information - processing 1993. +The primal problem can be equivalently formulated as +.. math:: - * `"Support-vector networks" - `_, - C. Cortes, V. Vapnik - Machine Learning, 20, 273-297 (1995). + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, y_i (w^T \phi(x_i) + b)), +where we make use of the `hinge loss +`_. This is the form that is +directly optimized by :class:`LinearSVC`, but unlike the dual form, this one +does not involve inner products between samples, so the famous kernel trick +cannot be applied. This is why only the linear kernel is supported by +:class:`LinearSVC` (:math:`\phi` is the identity function). +.. _nu_svc: NuSVC ----- -We introduce a new parameter :math:`\nu` which controls the number of -support vectors and training errors. The parameter :math:`\nu \in (0, -1]` is an upper bound on the fraction of training errors and a lower -bound of the fraction of support vectors. +The :math:`\nu`-SVC formulation [#7]_ is a reparameterization of the +:math:`C`-SVC and therefore mathematically equivalent. -It can be shown that the :math:`\nu`-SVC formulation is a reparameterization -of the :math:`C`-SVC and therefore mathematically equivalent. +We introduce a new parameter :math:`\nu` (instead of :math:`C`) which +controls the number of support vectors and *margin errors*: +:math:`\nu \in (0, 1]` is an upper bound on the fraction of margin errors and +a lower bound of the fraction of support vectors. A margin error corresponds +to a sample that lies on the wrong side of its margin boundary: it is either +misclassified, or it is correctly classified but does not lie beyond the +margin. SVR @@ -669,7 +720,12 @@ vector :math:`y \in \mathbb{R}^n` :math:`\varepsilon`-SVR solves the following p & w^T \phi (x_i) + b - y_i \leq \varepsilon + \zeta_i^*,\\ & \zeta_i, \zeta_i^* \geq 0, i=1, ..., n -Its dual is +Here, we are penalizing samples whose prediction is at least :math:`\varepsilon` +away from their true target. These samples penalize the objective by +:math:`\zeta_i` or :math:`\zeta_i^*`, depending on whether their predictions +lie above or below the :math:`\varepsilon` tube. + +The dual problem is .. 
math:: @@ -679,49 +735,80 @@ Its dual is \textrm {subject to } & e^T (\alpha - \alpha^*) = 0\\ & 0 \leq \alpha_i, \alpha_i^* \leq C, i=1, ..., n -where :math:`e` is the vector of all ones, :math:`C > 0` is the upper bound, +where :math:`e` is the vector of all ones, :math:`Q` is an :math:`n` by :math:`n` positive semidefinite matrix, :math:`Q_{ij} \equiv K(x_i, x_j) = \phi (x_i)^T \phi (x_j)` is the kernel. Here training vectors are implicitly mapped into a higher (maybe infinite) dimensional space by the function :math:`\phi`. -The decision function is: +The prediction is: -.. math:: \sum_{i=1}^n (\alpha_i - \alpha_i^*) K(x_i, x) + \rho +.. math:: \sum_{i \in SV}(\alpha_i - \alpha_i^*) K(x_i, x) + b -These parameters can be accessed through the members ``dual_coef_`` +These parameters can be accessed through the attributes ``dual_coef_`` which holds the difference :math:`\alpha_i - \alpha_i^*`, ``support_vectors_`` which holds the support vectors, and ``intercept_`` which holds the independent -term :math:`\rho` +term :math:`b` -.. topic:: References: +LinearSVR +--------- + +The primal problem can be equivalently formulated as - * `"A Tutorial on Support Vector Regression" - `_, - Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive - Volume 14 Issue 3, August 2004, p. 199-222. +.. math:: + + \min_ {w, b} \frac{1}{2} w^T w + C \sum_{i=1}\max(0, |y_i - (w^T \phi(x_i) + b)| - \varepsilon), +where we make use of the epsilon-insensitive loss, i.e. errors of less than +:math:`\varepsilon` are ignored. This is the form that is directly optimized +by :class:`LinearSVR`. .. _svm_implementation_details: Implementation details ====================== -Internally, we use `libsvm`_ and `liblinear`_ to handle all +Internally, we use `libsvm`_ [#4]_ and `liblinear`_ [#3]_ to handle all computations. These libraries are wrapped using C and Cython. +For a description of the implementation and details of the algorithms +used, please refer to their respective papers. + .. _`libsvm`: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ .. _`liblinear`: https://www.csie.ntu.edu.tw/~cjlin/liblinear/ .. topic:: References: - For a description of the implementation and details of the algorithms - used, please refer to + .. [#1] Platt `"Probabilistic outputs for SVMs and comparisons to + regularized likelihood methods" + `_. - - `LIBSVM: A Library for Support Vector Machines + .. [#2] Wu, Lin and Weng, `"Probability estimates for multi-class + classification by pairwise coupling" + `_, JMLR + 5:975-1005, 2004. + + .. [#3] Fan, Rong-En, et al., + `"LIBLINEAR: A library for large linear classification." + `_, + Journal of machine learning research 9.Aug (2008): 1871-1874. + + .. [#4] Chang and Lin, `LIBSVM: A Library for Support Vector Machines `_. - - `LIBLINEAR -- A Library for Large Linear Classification - `_. + .. [#5] Bishop, `Pattern recognition and machine learning + `_, + chapter 7 Sparse Kernel Machines + .. [#6] `"A Tutorial on Support Vector Regression" + `_, + Alex J. Smola, Bernhard Schölkopf - Statistics and Computing archive + Volume 14 Issue 3, August 2004, p. 199-222. + .. [#7] Schölkopf et. al `New Support Vector Algorithms + `_ + + .. [#8] Crammer and Singer `On the Algorithmic Implementation ofMulticlass + Kernel-based Vector Machines + `_, + JMLR 2001. diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6141dcce41609..4c489c1887815 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -25,6 +25,12 @@ random sampling procedures. 
- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, and :class:`ensemble.IsolationForest`. |Fix| +- Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, + including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, + :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, + :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`. + |Efficiency| |Fix| + Details are listed in the changelog below. (While we are trying to better inform users by providing this information, we @@ -133,10 +139,12 @@ Changelog - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` -- |Fix| :func:`decomposition._pca._assess_dimension` now correctly handles small - eigenvalues. :pr: `4441` by :user:`Lisa Schwetlick `, and - :user:`Gelavizh Ahmadi ` and - :user:`Marija Vlajic Wheeler `. + +- |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly + handles small eigenvalues, and does not infer 0 as the correct number of + components. :pr: `4441` by :user:`Lisa Schwetlick `, and + :user:`Gelavizh Ahmadi ` and :user:`Marija Vlajic Wheeler + ` and :pr:`16841` by `Nicolas Hug`_. - |Enhancement| :class:`decomposition.NMF` and :func:`decomposition.non_negative_factorization` now preserves float32 dtype. @@ -184,6 +192,11 @@ Changelog samples in the training set. :pr:`14516` by :user:`Johann Faouzi `. +- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support monotonic + constraints, useful when features are supposed to have a positive/negative + effect on the target. :pr:`15582` by `Nicolas Hug`_. + - |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` where the attribute `estimators_samples_` did not generate the proper indices @@ -291,7 +304,7 @@ Changelog - |API| Changed the formatting of values in :meth:`metrics.ConfusionMatrixDisplay.plot` and :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' - or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and `Thomas Fan`_. - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows @@ -358,7 +371,7 @@ Changelog - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of - each feature with two categories. :pr:`#16245` + each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. - |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly @@ -373,6 +386,26 @@ Changelog :mod:`sklearn.svm` .................. +- |Fix| |Efficiency| Improved ``libsvm`` and ``liblinear`` random number + generators used to randomly select coordinates in the coordinate descent + algorithms. Platform-dependent C ``rand()`` was used, which is only able to + generate numbers up to ``32767`` on windows platform (see this `blog + post `) and also has poor + randomization power as suggested by `this presentation + `. + It was replaced with C++11 ``mt19937``, a Mersenne Twister that correctly + generates 31bits/63bits random numbers on all platforms. 
In addition, the + crude "modulo" postprocessor used to get a random number in a bounded + interval was replaced by the tweaked Lemire method as suggested by `this blog + post `. + Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, + including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, + :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, + :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`, + is affected. In particular users can expect a better convergence when the + number of samples (LibSVM) or the number of features (LibLinear) is large. + :pr:`13511` by :user:`Sylvain Marié `. + - |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and `probB_`, are now deprecated as they were not useful. :pr:`15558` by `Thomas Fan`_. @@ -408,6 +441,10 @@ Changelog pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. +- |Enhancement| :func:`utils.validation.check_array` now constructs a sparse + matrix from a pandas DataFrame that contains only `SparseArray`s. + :pr:`16728` by `Thomas Fan`_. + :mod:`sklearn.cluster` ...................... diff --git a/examples/applications/plot_outlier_detection_housing.py b/examples/applications/plot_outlier_detection_wine.py similarity index 60% rename from examples/applications/plot_outlier_detection_housing.py rename to examples/applications/plot_outlier_detection_wine.py index 41c697e2e2d2b..6f245b7e6c1cb 100644 --- a/examples/applications/plot_outlier_detection_housing.py +++ b/examples/applications/plot_outlier_detection_wine.py @@ -24,26 +24,13 @@ First example ------------- -The first example illustrates how robust covariance estimation can help -concentrating on a relevant cluster when another one exists. Here, many -observations are confounded into one and break down the empirical covariance -estimation. -Of course, some screening tools would have pointed out the presence of two -clusters (Support Vector Machines, Gaussian Mixture Models, univariate -outlier detection, ...). But had it been a high-dimensional example, none -of these could be applied that easily. - -Second example --------------- -The second example shows the ability of the Minimum Covariance Determinant -robust estimator of covariance to concentrate on the main mode of the data -distribution: the location seems to be well estimated, although the covariance -is hard to estimate due to the banana-shaped distribution. Anyway, we can -get rid of some outlying observations. -The One-Class SVM is able to capture the real data structure, but the -difficulty is to adjust its kernel bandwidth parameter so as to obtain -a good compromise between the shape of the data scatter matrix and the -risk of over-fitting the data. +The first example illustrates how the Minimum Covariance Determinant +robust estimator can help concentrate on a relevant cluster when outlying +points exist. Here the empirical covariance estimation is skewed by points +outside of the main cluster. Of course, some screening tools would have pointed +out the presence of two clusters (Support Vector Machines, Gaussian Mixture +Models, univariate outlier detection, ...). But had it been a high-dimensional +example, none of these could be applied that easily. 
""" print(__doc__) @@ -56,26 +43,24 @@ from sklearn.svm import OneClassSVM import matplotlib.pyplot as plt import matplotlib.font_manager -from sklearn.datasets import load_boston - -# Get data -X1 = load_boston()['data'][:, [8, 10]] # two clusters -X2 = load_boston()['data'][:, [5, 12]] # "banana"-shaped +from sklearn.datasets import load_wine # Define "classifiers" to be used classifiers = { "Empirical Covariance": EllipticEnvelope(support_fraction=1., - contamination=0.261), + contamination=0.25), "Robust Covariance (Minimum Covariance Determinant)": - EllipticEnvelope(contamination=0.261), - "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)} + EllipticEnvelope(contamination=0.25), + "OCSVM": OneClassSVM(nu=0.25, gamma=0.35)} colors = ['m', 'g', 'b'] legend1 = {} legend2 = {} +# Get data +X1 = load_wine()['data'][:, [1, 2]] # two clusters + # Learn a frontier for outlier detection with several classifiers -xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500)) -xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500)) +xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500)) for i, (clf_name, clf) in enumerate(classifiers.items()): plt.figure(1) clf.fit(X1) @@ -83,25 +68,19 @@ Z1 = Z1.reshape(xx1.shape) legend1[clf_name] = plt.contour( xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]) - plt.figure(2) - clf.fit(X2) - Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()]) - Z2 = Z2.reshape(xx2.shape) - legend2[clf_name] = plt.contour( - xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]) legend1_values_list = list(legend1.values()) legend1_keys_list = list(legend1.keys()) # Plot the results (= shape of the data points cloud) plt.figure(1) # two clusters -plt.title("Outlier detection on a real data set (boston housing)") +plt.title("Outlier detection on a real data set (wine recognition)") plt.scatter(X1[:, 0], X1[:, 1], color='black') bbox_args = dict(boxstyle="round", fc="0.8") arrow_args = dict(arrowstyle="->") -plt.annotate("several confounded points", xy=(24, 19), +plt.annotate("outlying points", xy=(4, 2), xycoords="data", textcoords="data", - xytext=(13, 10), bbox=bbox_args, arrowprops=arrow_args) + xytext=(3, 1.25), bbox=bbox_args, arrowprops=arrow_args) plt.xlim((xx1.min(), xx1.max())) plt.ylim((yy1.min(), yy1.max())) plt.legend((legend1_values_list[0].collections[0], @@ -109,15 +88,43 @@ legend1_values_list[2].collections[0]), (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]), loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=12)) -plt.ylabel("accessibility to radial highways") -plt.xlabel("pupil-teacher ratio by town") + prop=matplotlib.font_manager.FontProperties(size=11)) +plt.ylabel("ash") +plt.xlabel("malic_acid") + +plt.show() + +############################################################################## +# Second example +# -------------- +# The second example shows the ability of the Minimum Covariance Determinant +# robust estimator of covariance to concentrate on the main mode of the data +# distribution: the location seems to be well estimated, although the +# covariance is hard to estimate due to the banana-shaped distribution. Anyway, +# we can get rid of some outlying observations. The One-Class SVM is able to +# capture the real data structure, but the difficulty is to adjust its kernel +# bandwidth parameter so as to obtain a good compromise between the shape of +# the data scatter matrix and the risk of over-fitting the data. 
+ +# Get data +X2 = load_wine()['data'][:, [6, 9]] # "banana"-shaped + +# Learn a frontier for outlier detection with several classifiers +xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500)) +for i, (clf_name, clf) in enumerate(classifiers.items()): + plt.figure(2) + clf.fit(X2) + Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()]) + Z2 = Z2.reshape(xx2.shape) + legend2[clf_name] = plt.contour( + xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]) legend2_values_list = list(legend2.values()) legend2_keys_list = list(legend2.keys()) +# Plot the results (= shape of the data points cloud) plt.figure(2) # "banana" shape -plt.title("Outlier detection on a real data set (boston housing)") +plt.title("Outlier detection on a real data set (wine recognition)") plt.scatter(X2[:, 0], X2[:, 1], color='black') plt.xlim((xx2.min(), xx2.max())) plt.ylim((yy2.min(), yy2.max())) @@ -126,8 +133,8 @@ legend2_values_list[2].collections[0]), (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]), loc="upper center", - prop=matplotlib.font_manager.FontProperties(size=12)) -plt.ylabel("% lower status of the population") -plt.xlabel("average number of rooms per dwelling") + prop=matplotlib.font_manager.FontProperties(size=11)) +plt.ylabel("color_intensity") +plt.xlabel("flavanoids") plt.show() diff --git a/examples/ensemble/plot_gradient_boosting_quantile.py b/examples/ensemble/plot_gradient_boosting_quantile.py index 343bae08ef4a6..ef40a2247bcc5 100644 --- a/examples/ensemble/plot_gradient_boosting_quantile.py +++ b/examples/ensemble/plot_gradient_boosting_quantile.py @@ -61,7 +61,7 @@ def f(x): # Make the prediction on the meshed x-axis y_pred = clf.predict(xx) -# Plot the function, the prediction and the 90% confidence interval based on +# Plot the function, the prediction and the 95% confidence interval based on # the MSE fig = plt.figure() plt.plot(xx, f(xx), 'g:', label=r'$f(x) = x\,\sin(x)$') @@ -71,7 +71,7 @@ def f(x): plt.plot(xx, y_lower, 'k-') plt.fill(np.concatenate([xx, xx[::-1]]), np.concatenate([y_upper, y_lower[::-1]]), - alpha=.5, fc='b', ec='None', label='90% prediction interval') + alpha=.5, fc='b', ec='None', label='95% prediction interval') plt.xlabel('$x$') plt.ylabel('$f(x)$') plt.ylim(-10, 20) diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index ae1f0ab42c258..3dbe7dbaac296 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -6,25 +6,25 @@ This example demonstrates Gradient Boosting to produce a predictive model from an ensemble of weak predictive models. Gradient boosting can be used for regression and classification problems. Here, we will train a model to -tackle a diabetes regression task. - -We will obtain the results from +tackle a diabetes regression task. We will obtain the results from :class:`~sklearn.ensemble.GradientBoostingRegressor` with least squares loss and 500 regression trees of depth 4. 
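As an illustrative aside rather than part of the example file, the configuration described in the previous sentence corresponds roughly to the estimator below; the remaining keyword values used by the example are set in its ``params`` dict further down and are not repeated here::

    from sklearn.ensemble import GradientBoostingRegressor

    reg = GradientBoostingRegressor(loss='ls', n_estimators=500, max_depth=4)

In the scikit-learn version targeted by this diff, ``loss='ls'`` selects the least-squares loss mentioned above.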
+Note: For larger datasets (n_samples >= 10000), please refer to +:class:`sklearn.ensemble.HistGradientBoostingRegressor` """ print(__doc__) # Author: Peter Prettenhofer # Maria Telenczuk +# Katrina Ni # # License: BSD 3 clause -import numpy as np import matplotlib.pyplot as plt - -from sklearn import ensemble -from sklearn import datasets +import numpy as np +from sklearn import datasets, ensemble +from sklearn.inspection import permutation_importance from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split @@ -47,23 +47,25 @@ # regression model. You can play with those parameters to see how the # results change: # -# Here: -# n_estimators : is the number of boosting stages which will be performed. -# Later, we will plot and see how the deviance changes with those boosting -# operations. -# max_depth : this limits the number of nodes in the tree. The best value -# depends on the interaction of the input variables. -# min_samples_split : is the minimum number of samples required to split an -# internal node. -# learning_rate: tells how much the contribution of each tree will shrink -# loss: here, we decided to use least squeares as a loss function, however -# there are many other options (check -# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are -# other possibilities) - -X_train, X_test, y_train, y_test = train_test_split(X, y, - test_size=0.1, - random_state=13) +# n_estimators : the number of boosting stages which will be performed. +# Later, we will plot and see how the deviance changes with those boosting +# operations. +# +# max_depth : limits the number of nodes in the tree. +# The best value depends on the interaction of the input variables. +# +# min_samples_split : the minimum number of samples required to split an +# internal node. +# +# learning_rate : how much the contribution of each tree will shrink +# +# loss : here, we decided to use least squeares as a loss function. +# However there are many other options (check +# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are +# other possibilities) + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.1, random_state=13) params = {'n_estimators': 500, 'max_depth': 4, @@ -92,13 +94,11 @@ # test set deviance and then plot it. test_score = np.zeros((params['n_estimators'],), dtype=np.float64) - for i, y_pred in enumerate(clf.staged_predict(X_test)): test_score[i] = clf.loss_(y_test, y_pred) -fig = plt.figure(figsize=(12, 8)) - -plt.subplot(1, 2, 1) +fig = plt.figure(figsize=(6, 6)) +plt.subplot(1, 1, 1) plt.title('Deviance') plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', label='Training Set Deviance') @@ -107,25 +107,39 @@ plt.legend(loc='upper right') plt.xlabel('Boosting Iterations') plt.ylabel('Deviance') +fig.tight_layout() +plt.show() ############################################################################## -# Plot impurity-based feature importance +# Plot feature importance # ------------------------------------- # # Careful, impurity-based feature importances can be misleading for -# high cardinality features (many unique values). See -# :func:`sklearn.inspection.permutation_importance` as an alternative. +# high cardinality features (many unique values). As an alternative, +# the permutation importances of ``clf`` are computed on a +# held out test set. See :ref:`permutation_importance` for more details. 
+# +# In this case, the two methods agree to identify the same top 2 features +# as strongly predictive features but not in the same order. The third most +# predictive feature, "bp", is also the same for the 2 methods. The remaining +# features are less predictive and the error bars of the permutation plot +# show that they overlap with 0. feature_importance = clf.feature_importances_ -# make importances relative to max importance -feature_importance = 100.0 * (feature_importance / feature_importance.max()) sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 -plt.subplot(1, 2, 2) +fig = plt.figure(figsize=(12, 6)) +plt.subplot(1, 2, 1) plt.barh(pos, feature_importance[sorted_idx], align='center') plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) -plt.xlabel('Relative Importance') -plt.title('Variable Importance') -fig.tight_layout() +plt.title('Feature Importance (MDI)') +result = permutation_importance(clf, X_test, y_test, n_repeats=10, + random_state=42, n_jobs=2) +sorted_idx = result.importances_mean.argsort() +plt.subplot(1, 2, 2) +plt.boxplot(result.importances[sorted_idx].T, + vert=False, labels=np.array(diabetes.feature_names)[sorted_idx]) +plt.title("Permutation Importance (test set)") +fig.tight_layout() plt.show() diff --git a/examples/ensemble/plot_monotonic_constraints.py b/examples/ensemble/plot_monotonic_constraints.py new file mode 100644 index 0000000000000..887c2f2bbe2ed --- /dev/null +++ b/examples/ensemble/plot_monotonic_constraints.py @@ -0,0 +1,70 @@ +""" +===================== +Monotonic Constraints +===================== + +This example illustrates the effect of monotonic constraints on a gradient +boosting estimator. + +We build an artificial dataset where the target value is in general +positively correlated with the first feature (with some random and +non-random variations), and in general negatively correlated with the second +feature. + +By imposing a positive (increasing) or negative (decreasing) constraint on +the features during the learning process, the estimator is able to properly +follow the general trend instead of being subject to the variations. + +This example was inspired by the `XGBoost documentation +`_. 
+""" +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.inspection import plot_partial_dependence +import numpy as np +import matplotlib.pyplot as plt + + +print(__doc__) + +rng = np.random.RandomState(0) + +n_samples = 5000 +f_0 = rng.rand(n_samples) # positive correlation with y +f_1 = rng.rand(n_samples) # negative correlation with y +X = np.c_[f_0, f_1] +noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) +y = (5 * f_0 + np.sin(10 * np.pi * f_0) - + 5 * f_1 - np.cos(10 * np.pi * f_1) + + noise) + +fig, ax = plt.subplots() + + +# Without any constraint +gbdt = HistGradientBoostingRegressor() +gbdt.fit(X, y) +disp = plot_partial_dependence( + gbdt, X, features=[0, 1], + line_kw={'linewidth': 4, 'label': 'unconstrained'}, + ax=ax) + +# With positive and negative constraints +gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]) +gbdt.fit(X, y) + +plot_partial_dependence( + gbdt, X, features=[0, 1], + feature_names=('First feature\nPositive constraint', + 'Second feature\nNegtive constraint'), + line_kw={'linewidth': 4, 'label': 'constrained'}, + ax=disp.axes_) + +for f_idx in (0, 1): + disp.axes_[0, f_idx].plot(X[:, f_idx], y, 'o', alpha=.3, zorder=-1) + disp.axes_[0, f_idx].set_ylim(-6, 6) + +plt.legend() +fig.suptitle("Monotonic constraints illustration") + +plt.show() diff --git a/examples/feature_selection/plot_feature_selection.py b/examples/feature_selection/plot_feature_selection.py index c0c849def9b3a..1e4ef6a81bba8 100644 --- a/examples/feature_selection/plot_feature_selection.py +++ b/examples/feature_selection/plot_feature_selection.py @@ -62,8 +62,7 @@ scores = -np.log10(selector.pvalues_) scores /= scores.max() plt.bar(X_indices - .45, scores, width=.2, - label=r'Univariate score ($-Log(p_{value})$)', color='darkorange', - edgecolor='black') + label=r'Univariate score ($-Log(p_{value})$)') # ############################################################################# # Compare to the weights of an SVM @@ -75,8 +74,7 @@ svm_weights = np.abs(clf[-1].coef_).sum(axis=0) svm_weights /= svm_weights.sum() -plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', - color='navy', edgecolor='black') +plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight') clf_selected = make_pipeline( SelectKBest(f_classif, k=4), MinMaxScaler(), LinearSVC() @@ -89,8 +87,7 @@ svm_weights_selected /= svm_weights_selected.sum() plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, - width=.2, label='SVM weights after selection', color='c', - edgecolor='black') + width=.2, label='SVM weights after selection') plt.title("Comparing feature selection") diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 90e8e4cad1a9b..7e2fae467b7c5 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -119,7 +119,7 @@ keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1 ) -# plot boston results +# plot california housing results fig, ax = plt.subplots(figsize=(13, 6)) means = -scores.mean() errors = scores.std() diff --git a/pyproject.toml b/pyproject.toml index 2547baae5874d..6011f5d2ea1ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,11 @@ requires = [ "setuptools", "wheel", "Cython>=0.28.5", - "numpy>=1.13.3", + "numpy==1.13.3; python_version=='3.6' and 
platform_system!='AIX'", + "numpy==1.14.5; python_version=='3.7' and platform_system!='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", + "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", + "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'", "scipy>=0.19.1", ] diff --git a/sklearn/__init__.py b/sklearn/__init__.py index cb8b37692618f..7f203a079f22b 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -60,7 +60,8 @@ # This variable is injected in the __builtins__ by the build # process. It is used to enable importing subpackages of sklearn when # the binaries are not built - __SKLEARN_SETUP__ + # mypy error: Cannot determine type of '__SKLEARN_SETUP__' + __SKLEARN_SETUP__ # type: ignore except NameError: __SKLEARN_SETUP__ = False diff --git a/sklearn/_build_utils/deprecated_modules.py b/sklearn/_build_utils/deprecated_modules.py index 045dc3d297be0..1586f9e991a8d 100644 --- a/sklearn/_build_utils/deprecated_modules.py +++ b/sklearn/_build_utils/deprecated_modules.py @@ -271,7 +271,8 @@ _FILE_CONTENT_TEMPLATE = """ # THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py import sys -from . import {new_module_name} +# mypy error: Module X has no attribute y (typically for C extensions) +from . import {new_module_name} # type: ignore from {relative_dots}externals._pep562 import Pep562 from {relative_dots}utils.deprecation import _raise_dep_warning_if_not_pytest diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 74e0b84477ad6..5422b10cc4dd7 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -29,19 +29,19 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Parameters ---------- - X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ - array of shape (n_samples, n_samples) + X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \ + (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. - eps : float, optional + eps : float, default=0.5 The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. - min_samples : int, optional + min_samples : int, default=5 The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. @@ -55,33 +55,33 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, X may be a :term:`sparse graph `, in which case only "nonzero" elements may be considered neighbors. - metric_params : dict, optional + metric_params : dict, default=None Additional keyword arguments for the metric function. .. versionadded:: 0.19 - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. - leaf_size : int, optional (default = 30) + leaf_size : int, default=30 Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. 
The optimal value depends on the nature of the problem. - p : float, optional + p : float, default=2 The power of the Minkowski metric to be used to calculate distance between points. - sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. @@ -90,10 +90,10 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, Returns ------- - core_samples : array [n_core_samples] + core_samples : ndarray of shape (n_core_samples,) Indices of core samples. - labels : array [n_samples] + labels : ndarray of shape (n_samples,) Cluster labels for each point. Noisy samples are given the label -1. See also @@ -200,7 +200,7 @@ class DBSCAN(ClusterMixin, BaseEstimator): The power of the Minkowski metric to be used to calculate distance between points. - n_jobs : int or None, default=None + n_jobs : int, default=None The number of parallel jobs to run. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` @@ -208,13 +208,13 @@ class DBSCAN(ClusterMixin, BaseEstimator): Attributes ---------- - core_sample_indices_ : array, shape = [n_core_samples] + core_sample_indices_ : ndarray of shape (n_core_samples,) Indices of core samples. - components_ : array, shape = [n_core_samples, n_features] + components_ : ndarray of shape (n_core_samples, n_features) Copy of each core sample found by training. - labels_ : array, shape = [n_samples] + labels_ : ndarray of shape (n_samples) Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1. @@ -288,13 +288,13 @@ def fit(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ (n_samples, n_samples) Training instances to cluster, or distances between instances if ``metric='precomputed'``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. - sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with a negative weight may inhibit its eps-neighbor from being core. @@ -367,13 +367,13 @@ def fit_predict(self, X, y=None, sample_weight=None): Parameters ---------- - X : array-like or sparse matrix, shape (n_samples, n_features), or \ + X : {array-like, sparse matrix} of shape (n_samples, n_features), or \ (n_samples, n_samples) Training instances to cluster, or distances between instances if ``metric='precomputed'``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. 
- sample_weight : array, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with a negative weight may inhibit its eps-neighbor from being core. @@ -384,7 +384,7 @@ def fit_predict(self, X, y=None, sample_weight=None): Returns ------- - labels : ndarray, shape (n_samples,) + labels : ndarray of shape (n_samples,) Cluster labels. Noisy samples are given the label -1. """ self.fit(X, sample_weight=sample_weight) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index e95c8fe0490a4..65c8871fbb456 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -10,6 +10,7 @@ import numpy as np cimport numpy as np +from threadpoolctl import threadpool_limits cimport cython from cython cimport floating from cython.parallel import prange, parallel @@ -29,7 +30,19 @@ from ._k_means_fast cimport _center_shift np.import_array() -def _init_bounds_dense( +# Threadpoolctl wrappers to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubsciption. +def elkan_iter_chunked_dense(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _elkan_iter_chunked_dense(*args, **kwargs) + + +def elkan_iter_chunked_sparse(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _elkan_iter_chunked_sparse(*args, **kwargs) + + +def init_bounds_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[:, ::1] centers, # IN floating[:, ::1] center_half_distances, # IN @@ -99,7 +112,7 @@ def _init_bounds_dense( upper_bounds[i] = min_dist -def _init_bounds_sparse( +def init_bounds_sparse( X, # IN floating[:, ::1] centers, # IN floating[:, ::1] center_half_distances, # IN diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx index 93e2c6f0b9c89..4a33b0c5c8a02 100644 --- a/sklearn/cluster/_k_means_lloyd.pyx +++ b/sklearn/cluster/_k_means_lloyd.pyx @@ -8,6 +8,7 @@ import numpy as np cimport numpy as np +from threadpoolctl import threadpool_limits from cython cimport floating from cython.parallel import prange, parallel from libc.stdlib cimport malloc, calloc, free @@ -25,6 +26,18 @@ from ._k_means_fast cimport _average_centers, _center_shift np.import_array() +# Threadpoolctl wrappers to limit the number of threads in second level of +# nested parallelism (i.e. BLAS) to avoid oversubsciption. 
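# As an illustrative aside rather than part of the patch: ``threadpool_limits``
# comes from the ``threadpoolctl`` package imported at the top of this module.
# Used as a context manager it temporarily caps the thread count of the selected
# native libraries, so the wrappers defined next keep BLAS single-threaded while
# the outer OpenMP loop stays parallel.  A standalone sketch (variable names are
# illustrative only):
import numpy as np
from threadpoolctl import threadpool_limits

a = np.random.rand(2000, 2000)
with threadpool_limits(limits=1, user_api="blas"):
    a @ a  # this matrix product now runs on a single BLAS thread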
+def lloyd_iter_chunked_dense(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _lloyd_iter_chunked_dense(*args, **kwargs) + + +def lloyd_iter_chunked_sparse(*args, **kwargs): + with threadpool_limits(limits=1, user_api="blas"): + _lloyd_iter_chunked_sparse(*args, **kwargs) + + def _lloyd_iter_chunked_dense( np.ndarray[floating, ndim=2, mode='c'] X, # IN floating[::1] sample_weight, # IN diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 27ec0e5f388f6..b185983c4b0f9 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -15,7 +15,6 @@ import numpy as np import scipy.sparse as sp -from threadpoolctl import threadpool_limits from ..base import BaseEstimator, ClusterMixin, TransformerMixin from ..metrics.pairwise import euclidean_distances @@ -32,12 +31,12 @@ from ._k_means_fast import _inertia_dense from ._k_means_fast import _inertia_sparse from ._k_means_fast import _mini_batch_update_csr -from ._k_means_lloyd import _lloyd_iter_chunked_dense -from ._k_means_lloyd import _lloyd_iter_chunked_sparse -from ._k_means_elkan import _init_bounds_dense -from ._k_means_elkan import _init_bounds_sparse -from ._k_means_elkan import _elkan_iter_chunked_dense -from ._k_means_elkan import _elkan_iter_chunked_sparse +from ._k_means_lloyd import lloyd_iter_chunked_dense +from ._k_means_lloyd import lloyd_iter_chunked_sparse +from ._k_means_elkan import init_bounds_dense +from ._k_means_elkan import init_bounds_sparse +from ._k_means_elkan import elkan_iter_chunked_dense +from ._k_means_elkan import elkan_iter_chunked_sparse ############################################################################### @@ -420,12 +419,12 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300, center_shift = np.zeros(n_clusters, dtype=X.dtype) if sp.issparse(X): - init_bounds = _init_bounds_sparse - elkan_iter = _elkan_iter_chunked_sparse + init_bounds = init_bounds_sparse + elkan_iter = elkan_iter_chunked_sparse _inertia = _inertia_sparse else: - init_bounds = _init_bounds_dense - elkan_iter = _elkan_iter_chunked_dense + init_bounds = init_bounds_dense + elkan_iter = elkan_iter_chunked_dense _inertia = _inertia_dense init_bounds(X, centers, center_half_distances, @@ -559,10 +558,10 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, center_shift = np.zeros(n_clusters, dtype=X.dtype) if sp.issparse(X): - lloyd_iter = _lloyd_iter_chunked_sparse + lloyd_iter = lloyd_iter_chunked_sparse _inertia = _inertia_sparse else: - lloyd_iter = _lloyd_iter_chunked_dense + lloyd_iter = lloyd_iter_chunked_dense _inertia = _inertia_dense for i in range(max_iter): @@ -594,7 +593,8 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300, return labels, inertia, centers, i + 1 -def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): +def _labels_inertia(X, sample_weight, x_squared_norms, centers, + n_threads=None): """E step of the K-means EM algorithm. Compute the labels and the inertia of the given samples and centers. @@ -615,7 +615,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): centers : ndarray, shape (n_clusters, n_features) The cluster centers. - n_threads : int, default=1 + n_threads : int, default=None The number of OpenMP threads to use for the computation. Parallelism is sample-wise on the main cython loop which assigns each sample to its closest center. 
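As an illustrative aside rather than part of the patch, the E step described in this docstring is easy to state in plain NumPy, which can help when reading the chunked Cython implementation. The helper name below is hypothetical and meant only as a reference sketch::

    import numpy as np

    def labels_inertia_reference(X, sample_weight, centers):
        # squared distance from every sample to every center: (n_samples, n_clusters)
        d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
        labels = d2.argmin(axis=1)
        # inertia: weighted sum of squared distances to the assigned centers
        inertia = float((sample_weight * d2[np.arange(X.shape[0]), labels]).sum())
        return labels, inertia

The actual implementation processes the data in chunks and relies on BLAS plus OpenMP, but it computes the same labels and inertia.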
@@ -631,16 +631,18 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers, n_threads=1): n_samples = X.shape[0] n_clusters = centers.shape[0] + n_threads = _openmp_effective_n_threads(n_threads) + sample_weight = _check_normalize_sample_weight(sample_weight, X) labels = np.full(n_samples, -1, dtype=np.int32) weight_in_clusters = np.zeros(n_clusters, dtype=centers.dtype) center_shift = np.zeros_like(weight_in_clusters) if sp.issparse(X): - _labels = _lloyd_iter_chunked_sparse + _labels = lloyd_iter_chunked_sparse _inertia = _inertia_sparse else: - _labels = _lloyd_iter_chunked_dense + _labels = lloyd_iter_chunked_dense _inertia = _inertia_dense _labels(X, sample_weight, x_squared_norms, centers, centers, @@ -1033,22 +1035,19 @@ def fit(self, X, y=None, sample_weight=None): # seeds for the initializations of the kmeans runs. seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) - # limit number of threads in second level of nested parallelism - # (i.e. BLAS) to avoid oversubsciption. - with threadpool_limits(limits=1, user_api="blas"): - for seed in seeds: - # run a k-means once - labels, inertia, centers, n_iter_ = kmeans_single( - X, sample_weight, self.n_clusters, max_iter=self.max_iter, - init=init, verbose=self.verbose, tol=tol, - x_squared_norms=x_squared_norms, random_state=seed, - n_threads=self._n_threads) - # determine if these results are the best so far - if best_inertia is None or inertia < best_inertia: - best_labels = labels.copy() - best_centers = centers.copy() - best_inertia = inertia - best_n_iter = n_iter_ + for seed in seeds: + # run a k-means once + labels, inertia, centers, n_iter_ = kmeans_single( + X, sample_weight, self.n_clusters, max_iter=self.max_iter, + init=init, verbose=self.verbose, tol=tol, + x_squared_norms=x_squared_norms, random_state=seed, + n_threads=self._n_threads) + # determine if these results are the best so far + if best_inertia is None or inertia < best_inertia: + best_labels = labels.copy() + best_centers = centers.copy() + best_inertia = inertia + best_n_iter = n_iter_ if not sp.issparse(X): if not self.copy_x: diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 39aa77db417b9..1d0d93db75101 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -20,7 +20,8 @@ from ..exceptions import ConvergenceWarning from ..utils.validation import check_random_state from ..utils.validation import _deprecate_positional_args -from ..linear_model import _cd_fast as cd_fast +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from ..linear_model import _cd_fast as cd_fast # type: ignore from ..linear_model import lars_path_gram from ..model_selection import check_cv, cross_val_score diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index b5efd68adbd1c..3dc3833db3417 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -393,7 +393,7 @@ def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, Download it if necessary. 
================= ======================= - Classes 5749 + Classes 2 Samples total 13233 Dimensionality 5828 Features real, between 0 and 255 diff --git a/sklearn/datasets/descr/breast_cancer.rst b/sklearn/datasets/descr/breast_cancer.rst index fea6b6f017c16..bc4d60b9a363d 100644 --- a/sklearn/datasets/descr/breast_cancer.rst +++ b/sklearn/datasets/descr/breast_cancer.rst @@ -18,13 +18,13 @@ Breast cancer wisconsin (diagnostic) dataset - compactness (perimeter^2 / area - 1.0) - concavity (severity of concave portions of the contour) - concave points (number of concave portions of the contour) - - symmetry + - symmetry - fractal dimension ("coastline approximation" - 1) The mean, standard error, and "worst" or largest (mean of the three - largest values) of these features were computed for each image, - resulting in 30 features. For instance, field 3 is Mean Radius, field - 13 is Radius SE, field 23 is Worst Radius. + worst/largest values) of these features were computed for each image, + resulting in 30 features. For instance, field 0 is Mean Radius, field + 10 is Radius SE, field 20 is Worst Radius. - class: - WDBC-Malignant diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 9b7ad28f9f235..6b2d76ce2143b 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -17,7 +17,7 @@ from ..utils import (check_array, check_random_state, gen_even_slices, gen_batches) from ..utils.extmath import randomized_svd, row_norms -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..linear_model import Lasso, orthogonal_mp_gram, LassoLars, Lars @@ -1013,7 +1013,8 @@ class SparseCoder(SparseCodingMixin, BaseEstimator): """ _required_parameters = ["dictionary"] - def __init__(self, dictionary, transform_algorithm='omp', + @_deprecate_positional_args + def __init__(self, dictionary, *, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, split_sign=False, n_jobs=None, positive_code=False, transform_max_iter=1000): @@ -1183,7 +1184,8 @@ class DictionaryLearning(SparseCodingMixin, BaseEstimator): SparsePCA MiniBatchSparsePCA """ - def __init__(self, n_components=None, alpha=1, max_iter=1000, tol=1e-8, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, max_iter=1000, tol=1e-8, fit_algorithm='lars', transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, n_jobs=None, code_init=None, dict_init=None, verbose=False, @@ -1388,7 +1390,8 @@ class MiniBatchDictionaryLearning(SparseCodingMixin, BaseEstimator): MiniBatchSparsePCA """ - def __init__(self, n_components=None, alpha=1, n_iter=1000, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, n_iter=1000, fit_algorithm='lars', n_jobs=None, batch_size=3, shuffle=True, dict_init=None, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 7147fd452559c..a09b89bda6d6e 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -28,7 +28,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, check_random_state from ..utils.extmath import fast_logdet, randomized_svd, squared_norm -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, 
_deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -138,7 +138,9 @@ class FactorAnalysis(TransformerMixin, BaseEstimator): FastICA: Independent component analysis, a latent variable model with non-Gaussian latent variables. """ - def __init__(self, n_components=None, tol=1e-2, copy=True, max_iter=1000, + @_deprecate_positional_args + def __init__(self, n_components=None, *, tol=1e-2, copy=True, + max_iter=1000, noise_variance_init=None, svd_method='randomized', iterated_power=3, random_state=0): self.n_components = n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index f9e3a148f6860..7329fbbe4be1f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -20,6 +20,7 @@ from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args __all__ = ['fastica', 'FastICA'] @@ -390,7 +391,8 @@ def my_g(x): pp. 411-430* """ - def __init__(self, n_components=None, algorithm='parallel', whiten=True, + @_deprecate_positional_args + def __init__(self, n_components=None, *, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, w_init=None, random_state=None): super().__init__() diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index ac535b58e7f5e..bc34c17326f19 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -10,6 +10,7 @@ from ._base import _BasePCA from ..utils import check_array, gen_batches from ..utils.extmath import svd_flip, _incremental_mean_and_var +from ..utils.validation import _deprecate_positional_args class IncrementalPCA(_BasePCA): @@ -163,8 +164,8 @@ class IncrementalPCA(_BasePCA): SparsePCA TruncatedSVD """ - - def __init__(self, n_components=None, whiten=False, copy=True, + @_deprecate_positional_args + def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=None): self.n_components = n_components self.whiten = whiten diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 212686ae50d5a..527f78d34bbb5 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, TransformerMixin from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels +from ..utils.validation import _deprecate_positional_args class KernelPCA(TransformerMixin, BaseEstimator): @@ -138,8 +139,8 @@ class KernelPCA(TransformerMixin, BaseEstimator): component analysis. In Advances in kernel methods, MIT Press, Cambridge, MA, USA 327-352. 
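As an illustrative aside rather than part of the patch, the recurring change in this part of the diff adds a bare ``*`` to the constructor signatures and applies ``_deprecate_positional_args``. The intended user-visible effect is that arguments after the ``*`` must be passed by keyword, with positional use going through a deprecation warning rather than failing outright; a minimal sketch::

    from sklearn.decomposition import FastICA

    FastICA(n_components=7, random_state=0)  # keyword form: always fine
    # FastICA(7, 'parallel')  # passing 'parallel' positionally is what gets deprecated

Only the keyword form is future-proof; the commented-out positional call is shown for contrast.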
""" - - def __init__(self, n_components=None, kernel="linear", + @_deprecate_positional_args + def __init__(self, n_components=None, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, alpha=1.0, fit_inverse_transform=False, eigen_solver='auto', tol=0, max_iter=None, remove_zero_eig=False, diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 1458891ce1c8c..641e68cd7fc8b 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -20,6 +20,7 @@ from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._online_lda_fast import (mean_change, _dirichlet_expectation_1d, _dirichlet_expectation_2d) @@ -280,8 +281,8 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): https://github.com/blei-lab/onlineldavb """ - - def __init__(self, n_components=10, doc_topic_prior=None, + @_deprecate_positional_args + def __init__(self, n_components=10, *, doc_topic_prior=None, topic_word_prior=None, learning_method='batch', learning_decay=.7, learning_offset=10., max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1e6, diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index a454e887c9986..f1385d21596e3 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ..utils import check_random_state, check_array from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm from ..utils.validation import check_is_fitted, check_non_negative +from ..utils.validation import _deprecate_positional_args EPSILON = np.finfo(np.float32).eps @@ -1232,8 +1233,8 @@ class NMF(TransformerMixin, BaseEstimator): Fevotte, C., & Idier, J. (2011). Algorithms for nonnegative matrix factorization with the beta-divergence. Neural Computation, 23(9). """ - - def __init__(self, n_components=None, init=None, solver='cd', + @_deprecate_positional_args + def __init__(self, n_components=None, *, init=None, solver='cd', beta_loss='frobenius', tol=1e-4, max_iter=200, random_state=None, alpha=0., l1_ratio=0., verbose=0, shuffle=False): diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 7a0140b01fc9b..87092d7ccd17e 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -25,24 +25,25 @@ from ..utils.extmath import fast_logdet, randomized_svd, svd_flip from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args -def _assess_dimension(spectrum, rank, n_samples, n_features): - """Compute the likelihood of a rank ``rank`` dataset. +def _assess_dimension(spectrum, rank, n_samples): + """Compute the log-likelihood of a rank ``rank`` dataset. The dataset is assumed to be embedded in gaussian noise of shape(n, dimf) having spectrum ``spectrum``. Parameters ---------- - spectrum : array of shape (n) + spectrum : array of shape (n_features) Data spectrum. rank : int - Tested rank value. + Tested rank value. It should be strictly lower than n_features, + otherwise the method isn't specified (division by zero in equation + (31) from the paper). n_samples : int Number of samples. - n_features : int - Number of features. Returns ------- @@ -54,45 +55,39 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): This implements the method of `Thomas P. 
Minka: Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604` """ - if rank > len(spectrum): - raise ValueError("The tested rank cannot exceed the rank of the" - " dataset") - spectrum_threshold = np.finfo(type(spectrum[0])).eps + n_features = spectrum.shape[0] + if not 1 <= rank < n_features: + raise ValueError("the tested rank should be in [1, n_features - 1]") + + eps = 1e-15 + + if spectrum[rank - 1] < eps: + # When the tested rank is associated with a small eigenvalue, there's + # no point in computing the log-likelihood: it's going to be very + # small and won't be the max anyway. Also, it can lead to numerical + # issues below when computing pa, in particular in log((spectrum[i] - + # spectrum[j]) because this will take the log of something very small. + return -np.inf pu = -rank * log(2.) - for i in range(rank): - pu += (gammaln((n_features - i) / 2.) - - log(np.pi) * (n_features - i) / 2.) + for i in range(1, rank + 1): + pu += (gammaln((n_features - i + 1) / 2.) - + log(np.pi) * (n_features - i + 1) / 2.) pl = np.sum(np.log(spectrum[:rank])) pl = -pl * n_samples / 2. - if rank == n_features: - # TODO: this line is never executed because _infer_dimension's - # for loop is off by one - pv = 0 - v = 1 - else: - v = np.sum(spectrum[rank:]) / (n_features - rank) - if spectrum_threshold > v: - return -np.inf - pv = -np.log(v) * n_samples * (n_features - rank) / 2. + v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank)) + pv = -np.log(v) * n_samples * (n_features - rank) / 2. m = n_features * rank - rank * (rank + 1.) / 2. - pp = log(2. * np.pi) * (m + rank + 1.) / 2. + pp = log(2. * np.pi) * (m + rank) / 2. pa = 0. spectrum_ = spectrum.copy() spectrum_[rank:n_features] = v for i in range(rank): - if spectrum_[i] < spectrum_threshold: - # TODO: this line is never executed - # (off by one in _infer_dimension) - # this break only happens when rank == n_features and - # spectrum_[i] < spectrum_threshold, otherwise the early return - # above catches this case. - break for j in range(i + 1, len(spectrum)): pa += log((spectrum[i] - spectrum[j]) * (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples) @@ -102,15 +97,15 @@ def _assess_dimension(spectrum, rank, n_samples, n_features): return ll -def _infer_dimension(spectrum, n_samples, n_features): - """Infers the dimension of a dataset of shape (n_samples, n_features) +def _infer_dimension(spectrum, n_samples): + """Infers the dimension of a dataset with a given spectrum. - The dataset is described by its spectrum `spectrum`. + The returned value will be in [1, n_features - 1]. """ - n_spectrum = len(spectrum) - ll = np.empty(n_spectrum) - for rank in range(n_spectrum): - ll[rank] = _assess_dimension(spectrum, rank, n_samples, n_features) + ll = np.empty_like(spectrum) + ll[0] = -np.inf # we don't want to return n_components = 0 + for rank in range(1, spectrum.shape[0]): + ll[rank] = _assess_dimension(spectrum, rank, n_samples) return ll.argmax() @@ -324,7 +319,7 @@ class PCA(_BasePCA): >>> print(pca.singular_values_) [6.30061...] 
""" - + @_deprecate_positional_args def __init__(self, n_components=None, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): @@ -471,7 +466,7 @@ def _fit_full(self, X, n_components): # Postprocess the number of components required if n_components == 'mle': n_components = \ - _infer_dimension(explained_variance_, n_samples, n_features) + _infer_dimension(explained_variance_, n_samples) elif 0 < n_components < 1.0: # number of components for which the cumulated explained # variance percentage is superior to the desired threshold diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 158bbefc22e92..888d5d79e1e4b 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -8,6 +8,7 @@ from ..utils import check_random_state, check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin from ._dict_learning import dict_learning, dict_learning_online @@ -131,7 +132,8 @@ class SparsePCA(TransformerMixin, BaseEstimator): MiniBatchSparsePCA DictionaryLearning """ - def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, max_iter=1000, tol=1e-8, method='lars', n_jobs=None, U_init=None, V_init=None, verbose=False, random_state=None, normalize_components='deprecated'): @@ -340,7 +342,8 @@ class MiniBatchSparsePCA(SparsePCA): SparsePCA DictionaryLearning """ - def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01, + @_deprecate_positional_args + def __init__(self, n_components=None, *, alpha=1, ridge_alpha=0.01, n_iter=100, callback=None, batch_size=3, verbose=False, shuffle=True, n_jobs=None, method='lars', random_state=None, normalize_components='deprecated'): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 940eab56feea8..c0057ad6287a1 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -14,6 +14,9 @@ from ..utils import check_array, check_random_state from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis +from ..utils.validation import _deprecate_positional_args +from ..utils.validation import check_is_fitted + __all__ = ["TruncatedSVD"] @@ -116,7 +119,8 @@ class TruncatedSVD(TransformerMixin, BaseEstimator): class to data once, then keep the instance around to do transformations. """ - def __init__(self, n_components=2, algorithm="randomized", n_iter=5, + @_deprecate_positional_args + def __init__(self, n_components=2, *, algorithm="randomized", n_iter=5, random_state=None, tol=0.): self.algorithm = algorithm self.n_components = n_components @@ -208,7 +212,8 @@ def transform(self, X): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. 
""" - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse=['csr', 'csc']) + check_is_fitted(self) return safe_sparse_dot(X, self.components_.T) def inverse_transform(self, X): diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index b128c25c5f7ce..a7ef1243d8e25 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -63,7 +63,7 @@ def test_parameter_checking(): msg = ("init = '{}' can only be used when " "n_components <= min(n_samples, n_features)" .format(init)) - assert_raise_message(ValueError, msg, NMF(3, init).fit, A) + assert_raise_message(ValueError, msg, NMF(3, init=init).fit, A) assert_raise_message(ValueError, msg, nmf._initialize_nmf, A, 3, init) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 438478a55f6fa..0123e169ce9c0 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -295,7 +295,7 @@ def test_n_components_mle(svd_solver): X = rng.randn(n_samples, n_features) pca = PCA(n_components='mle', svd_solver=svd_solver) pca.fit(X) - assert pca.n_components_ == 0 + assert pca.n_components_ == 1 @pytest.mark.parametrize("svd_solver", ["arpack", "randomized"]) @@ -333,7 +333,7 @@ def test_infer_dim_1(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - ll = np.array([_assess_dimension(spect, k, n, p) for k in range(p)]) + ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)]) assert ll[1] > ll.max() - .01 * n @@ -348,7 +348,7 @@ def test_infer_dim_2(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension(spect, n, p) > 1 + assert _infer_dimension(spect, n) > 1 def test_infer_dim_3(): @@ -361,7 +361,7 @@ def test_infer_dim_3(): pca = PCA(n_components=p, svd_solver='full') pca.fit(X) spect = pca.explained_variance_ - assert _infer_dimension(spect, n, p) > 2 + assert _infer_dimension(spect, n) > 2 @pytest.mark.parametrize( @@ -570,51 +570,43 @@ def test_pca_n_components_mostly_explained_variance_ratio(): assert pca2.n_components_ == X.shape[1] -def test_infer_dim_bad_spec(): - # Test a spectrum that drops to near zero for PR #16224 +def test_assess_dimension_bad_rank(): + # Test error when tested rank not in [1, n_features - 1] spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) n_samples = 10 - n_features = 5 - ret = _infer_dimension(spectrum, n_samples, n_features) - assert ret == 0 + for rank in (0, 5): + with pytest.raises(ValueError, + match=r"should be in \[1, n_features - 1\]"): + _assess_dimension(spectrum, rank, n_samples) -def test_assess_dimension_error_rank_greater_than_features(): - # Test error when tested rank is greater than the number of features - # for PR #16224 +def test_small_eigenvalues_mle(): + # Test rank associated with tiny eigenvalues are given a log-likelihood of + # -inf. 
The inferred rank will be 1 spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) - n_samples = 10 - n_features = 4 - rank = 5 - with pytest.raises(ValueError, match="The tested rank cannot exceed " - "the rank of the dataset"): - _assess_dimension(spectrum, rank, n_samples, n_features) + assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf -def test_assess_dimension_small_eigenvalues(): - # Test tiny eigenvalues appropriately when using 'mle' - # for PR #16224 - spectrum = np.array([1, 1e-30, 1e-30, 1e-30]) - n_samples = 10 - n_features = 5 - rank = 3 - ret = _assess_dimension(spectrum, rank, n_samples, n_features) - assert ret == -np.inf + for rank in (2, 3): + assert _assess_dimension(spectrum, rank, 10) == -np.inf + + assert _infer_dimension(spectrum, 10) == 1 -def test_infer_dim_mle(): - # Test small eigenvalues when 'mle' with pathological 'X' dataset - # for PR #16224 - X, _ = datasets.make_classification(n_informative=1, n_repeated=18, +def test_mle_redundant_data(): + # Test 'mle' with pathological X: only one relevant feature should give a + # rank of 1 + X, _ = datasets.make_classification(n_features=20, + n_informative=1, n_repeated=18, n_redundant=1, n_clusters_per_class=1, random_state=42) pca = PCA(n_components='mle').fit(X) - assert pca.n_components_ == 0 + assert pca.n_components_ == 1 def test_fit_mle_too_few_samples(): # Tests that an error is raised when the number of samples is smaller - # than the number of features during an mle fit for PR #16224 + # than the number of features during an mle fit X, _ = datasets.make_classification(n_samples=20, n_features=21, random_state=42) @@ -623,3 +615,26 @@ def test_fit_mle_too_few_samples(): "supported if " "n_samples >= n_features"): pca.fit(X) + + +def test_mle_simple_case(): + # non-regression test for issue + # https://github.com/scikit-learn/scikit-learn/issues/16730 + n_samples, n_dim = 1000, 10 + X = np.random.RandomState(0).randn(n_samples, n_dim) + X[:, -1] = np.mean(X[:, :-1], axis=-1) # true X dim is ndim - 1 + pca_skl = PCA('mle', svd_solver='full') + pca_skl.fit(X) + assert pca_skl.n_components_ == n_dim - 1 + + +def test_assess_dimesion_rank_one(): + # Make sure assess_dimension works properly on a matrix of rank 1 + n_samples, n_features = 9, 6 + X = np.ones((n_samples, n_features)) # rank 1 matrix + _, s, _ = np.linalg.svd(X, full_matrices=True) + assert sum(s[1:]) == 0 # except for rank 1, all eigenvalues are 0 + + assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples)) + for rank in range(2, n_features): + assert _assess_dimension(s, rank, n_samples) == -np.inf diff --git a/sklearn/dummy.py b/sklearn/dummy.py index daa2c1ff0da11..634943231860f 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -395,7 +395,8 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The outputs_2d_ attribute is deprecated in version 0.22 " "and will be removed in version 0.24. It is equivalent to " "n_outputs_ > 1." @@ -622,7 +623,8 @@ def score(self, X, y, sample_weight=None): X = np.zeros(shape=(len(y), 1)) return super().score(X, y, sample_weight) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The outputs_2d_ attribute is deprecated in version 0.22 " "and will be removed in version 0.24. It is equivalent to " "n_outputs_ > 1." 
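The `@_deprecate_positional_args` / `*` changes that recur throughout the decomposition and ensemble modules above all implement the same idea: hyper-parameters become keyword-only, and the private helper turns positional calls into a soft deprecation warning during the transition period instead of a hard `TypeError`. The sketch below is only an illustration of that mechanism under stated assumptions; `deprecate_positional_args` and `ToyEstimator` are made-up names, not scikit-learn's actual private helper.

import warnings
from functools import wraps
from inspect import signature, Parameter


def deprecate_positional_args(f):
    # Collect the keyword-only parameter names (those declared after `*`).
    sig = signature(f)
    kwonly = [name for name, p in sig.parameters.items()
              if p.kind == Parameter.KEYWORD_ONLY]
    # Everything before `*` (including `self`) may still be passed positionally.
    n_positional = len(sig.parameters) - len(kwonly)

    @wraps(f)
    def wrapper(*args, **kwargs):
        n_extra = len(args) - n_positional
        if n_extra > 0:
            # Map the overflowing positional values onto the keyword-only
            # names and warn, instead of raising a TypeError right away.
            extra_names = kwonly[:n_extra]
            warnings.warn("Pass {} as keyword args; passing them positionally "
                          "is deprecated.".format(", ".join(extra_names)),
                          FutureWarning)
            kwargs.update(zip(extra_names, args[n_positional:]))
            args = args[:n_positional]
        return f(*args, **kwargs)
    return wrapper


class ToyEstimator:
    @deprecate_positional_args
    def __init__(self, n_components=None, *, whiten=False):
        self.n_components = n_components
        self.whiten = whiten


ToyEstimator(2, whiten=True)   # fine: keyword-only parameter passed by name
ToyEstimator(2, True)          # still works, but emits a FutureWarning

Calling the toy class positionally still succeeds, which mirrors the soft-deprecation approach taken in the patch: existing user code keeps working while the warning nudges callers toward keyword arguments.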
diff --git a/sklearn/ensemble/__init__.py b/sklearn/ensemble/__init__.py index 27acb2fbcf00a..ae86349ad9af0 100644 --- a/sklearn/ensemble/__init__.py +++ b/sklearn/ensemble/__init__.py @@ -2,6 +2,7 @@ The :mod:`sklearn.ensemble` module includes ensemble-based methods for classification, regression and anomaly detection. """ +import typing from ._base import BaseEnsemble from ._forest import RandomForestClassifier @@ -21,6 +22,12 @@ from ._stacking import StackingClassifier from ._stacking import StackingRegressor +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._hist_gradient_boosting.gradient_boosting import ( # noqa + HistGradientBoostingRegressor, HistGradientBoostingClassifier + ) __all__ = ["BaseEnsemble", "RandomForestClassifier", "RandomForestRegressor", diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 6a98d79bfac7e..162979373602b 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -22,7 +22,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import sample_without_replacement from ..utils.validation import has_fit_parameter, check_is_fitted, \ - _check_sample_weight + _check_sample_weight, _deprecate_positional_args __all__ = ["BaggingClassifier", @@ -193,7 +193,7 @@ class BaseBagging(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, @@ -577,9 +577,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ + @_deprecate_positional_args def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, @@ -975,10 +976,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging): .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine Learning and Knowledge Discovery in Databases, 346-361, 2012. """ - + @_deprecate_positional_args def __init__(self, base_estimator=None, - n_estimators=10, + n_estimators=10, *, max_samples=1.0, max_features=1.0, bootstrap=True, diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 23db107874c9b..a91f28b0710b2 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -6,6 +6,7 @@ from abc import ABCMeta, abstractmethod import numbers import warnings +from typing import List import numpy as np @@ -53,10 +54,10 @@ def _set_random_states(estimator, random_state=None): parameters. random_state : int or RandomState, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + integers. Pass an int for reproducible output across multiple function + calls. + See :term:`Glossary `. 
Notes ----- @@ -106,10 +107,10 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ # overwrite _required_parameters from MetaEstimatorMixin - _required_parameters = [] + _required_parameters: List[str] = [] @abstractmethod - def __init__(self, base_estimator, n_estimators=10, + def __init__(self, base_estimator, *, n_estimators=10, estimator_params=tuple()): # Set parameters self.base_estimator = base_estimator diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 432f628e58dc7..40a1c2434316c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -62,6 +62,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.fixes import _joblib_parallel_args from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args __all__ = ["RandomForestClassifier", @@ -180,7 +181,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -480,7 +481,7 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -735,7 +736,7 @@ class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @abstractmethod def __init__(self, base_estimator, - n_estimators=100, + n_estimators=100, *, estimator_params=tuple(), bootstrap=False, oob_score=False, @@ -884,9 +885,9 @@ class RandomForestClassifier(ForestClassifier): A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. - The sub-sample size is always the same as the original - input sample size but the samples are drawn with replacement if - `bootstrap=True` (default). + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. Read more in the :ref:`User Guide `. @@ -1146,8 +1147,9 @@ class labels (multi-output problem). >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, @@ -1202,9 +1204,9 @@ class RandomForestRegressor(ForestRegressor): A random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. - The sub-sample size is always the same as the original - input sample size but the samples are drawn with replacement if - `bootstrap=True` (default). + The sub-sample size is controlled with the `max_samples` parameter if + `bootstrap=True` (default), otherwise the whole dataset is used to build + each tree. Read more in the :ref:`User Guide `. @@ -1436,8 +1438,9 @@ class RandomForestRegressor(ForestRegressor): >>> print(regr.predict([[0, 0, 0, 0]])) [-8.32987858] """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="mse", max_depth=None, min_samples_split=2, @@ -1746,8 +1749,9 @@ class labels (multi-output problem). 
>>> clf.predict([[0, 0, 0, 0]]) array([1]) """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="gini", max_depth=None, min_samples_split=2, @@ -2026,8 +2030,9 @@ class ExtraTreesRegressor(ForestRegressor): >>> reg.score(X_test, y_test) 0.2708... """ + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, criterion="mse", max_depth=None, min_samples_split=2, @@ -2222,8 +2227,9 @@ class RandomTreesEmbedding(BaseForest): criterion = 'mse' max_features = 1 + @_deprecate_positional_args def __init__(self, - n_estimators=100, + n_estimators=100, *, max_depth=5, min_samples_split=2, min_samples_leaf=1, diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index d55499c92bac4..32e534fdc8517 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -53,6 +53,7 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.multiclass import check_classification_targets from ..exceptions import NotFittedError +from ..utils.validation import _deprecate_positional_args class VerboseReporter: @@ -65,7 +66,6 @@ class VerboseReporter: (when iteration mod verbose_mod is zero).; if larger than 1 then output is printed for each update. """ - def __init__(self, verbose): self.verbose = verbose @@ -134,7 +134,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): """Abstract base class for Gradient Boosting. """ @abstractmethod - def __init__(self, loss, learning_rate, n_estimators, criterion, + def __init__(self, *, loss, learning_rate, n_estimators, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, min_impurity_decrease, min_impurity_split, init, subsample, max_features, ccp_alpha, @@ -526,7 +526,7 @@ def _fit_stages(self, X, y, raw_predictions, sample_weight, random_state, loss_ = self.loss_ if self.verbose: - verbose_reporter = VerboseReporter(self.verbose) + verbose_reporter = VerboseReporter(verbose=self.verbose) verbose_reporter.init(self, begin_at_stage) X_csc = csc_matrix(X) if issparse(X) else None @@ -1067,7 +1067,8 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('deviance', 'exponential') - def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100, + @_deprecate_positional_args + def __init__(self, *, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., @@ -1565,7 +1566,8 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting): _SUPPORTED_LOSS = ('ls', 'lad', 'huber', 'quantile') - def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100, + @_deprecate_positional_args + def __init__(self, *, loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0., max_depth=3, min_impurity_decrease=0., diff --git a/sklearn/ensemble/_hist_gradient_boosting/binning.py b/sklearn/ensemble/_hist_gradient_boosting/binning.py index 83c338d89633e..84a76dd252064 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/binning.py +++ b/sklearn/ensemble/_hist_gradient_boosting/binning.py @@ -49,7 +49,7 @@ def _find_binning_thresholds(data, max_bins, subsample, random_state): """ rng = check_random_state(random_state) if subsample is not None and data.shape[0] > subsample: - subset = rng.choice(np.arange(data.shape[0]), 
subsample, replace=False) + subset = rng.choice(data.shape[0], subsample, replace=False) data = data.take(subset, axis=0) binning_thresholds = [] diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index fa78f2024aa5c..60399c2fbdd70 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -30,3 +30,9 @@ cdef packed struct node_struct: unsigned int depth unsigned char is_leaf X_BINNED_DTYPE_C bin_threshold + + +cpdef enum MonotonicConstraint: + NO_CST = 0 + POS = 1 + NEG = -1 diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 09906f7f4f215..796f4f060dda5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -10,7 +10,9 @@ is_classifier) from ...utils import check_random_state, check_array, resample from ...utils.validation import (check_is_fitted, - check_consistent_length, _check_sample_weight) + check_consistent_length, + _check_sample_weight, + _deprecate_positional_args) from ...utils.multiclass import check_classification_targets from ...metrics import check_scoring from ...model_selection import train_test_split @@ -27,10 +29,11 @@ class BaseHistGradientBoosting(BaseEstimator, ABC): """Base class for histogram-based gradient boosting estimators.""" @abstractmethod - def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, + def __init__(self, loss, *, learning_rate, max_iter, max_leaf_nodes, max_depth, min_samples_leaf, l2_regularization, max_bins, - warm_start, early_stopping, scoring, validation_fraction, - n_iter_no_change, tol, verbose, random_state): + monotonic_cst, warm_start, early_stopping, scoring, + validation_fraction, n_iter_no_change, tol, verbose, + random_state): self.loss = loss self.learning_rate = learning_rate self.max_iter = max_iter @@ -39,6 +42,7 @@ def __init__(self, loss, learning_rate, max_iter, max_leaf_nodes, self.min_samples_leaf = min_samples_leaf self.l2_regularization = l2_regularization self.max_bins = max_bins + self.monotonic_cst = monotonic_cst self.warm_start = warm_start self.early_stopping = early_stopping self.scoring = scoring @@ -82,6 +86,12 @@ def _validate_parameters(self): raise ValueError('max_bins={} should be no smaller than 2 ' 'and no larger than 255.'.format(self.max_bins)) + if self.monotonic_cst is not None and self.n_trees_per_iteration_ != 1: + raise ValueError( + 'monotonic constraints are not supported for ' + 'multiclass classification.' + ) + def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -217,14 +227,6 @@ def fit(self, X, y, sample_weight=None): ) raw_predictions += self._baseline_prediction - # initialize gradients and hessians (empty arrays). - # shape = (n_trees_per_iteration, n_samples). 
- gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_, - sample_weight=sample_weight_train - ) - # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) self._predictors = predictors = [] @@ -315,13 +317,6 @@ def fit(self, X, y, sample_weight=None): X_binned_train, y_train, sample_weight_train, self._random_seed) - # Initialize the gradients and hessians - gradients, hessians = self.loss_.init_gradients_and_hessians( - n_samples=n_samples, - sample_weight=sample_weight_train, - prediction_dim=self.n_trees_per_iteration_ - ) - # Get the predictors from the previous fit predictors = self._predictors @@ -352,12 +347,12 @@ def fit(self, X, y, sample_weight=None): # Build `n_trees_per_iteration` trees. for k in range(self.n_trees_per_iteration_): - grower = TreeGrower( X_binned_train, gradients[k, :], hessians[k, :], n_bins=n_bins, n_bins_non_missing=self.bin_mapper_.n_bins_non_missing_, has_missing_values=has_missing_values, + monotonic_cst=self.monotonic_cst, max_leaf_nodes=self.max_leaf_nodes, max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf, @@ -790,6 +785,11 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a negative constraint, positive + constraint and no constraint. Read more in the :ref:`User Guide + `. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. For results to be valid, the @@ -855,28 +855,31 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): >>> # To use this experimental feature, we need to explicitly ask for it: >>> from sklearn.experimental import enable_hist_gradient_boosting # noqa >>> from sklearn.ensemble import HistGradientBoostingRegressor - >>> from sklearn.datasets import load_boston - >>> X, y = load_boston(return_X_y=True) + >>> from sklearn.datasets import load_diabetes + >>> X, y = load_diabetes(return_X_y=True) >>> est = HistGradientBoostingRegressor().fit(X, y) >>> est.score(X, y) - 0.98... + 0.92... 
""" _VALID_LOSSES = ('least_squares', 'least_absolute_deviation') - def __init__(self, loss='least_squares', learning_rate=0.1, + @_deprecate_positional_args + def __init__(self, loss='least_squares', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, l2_regularization=0., max_bins=255, - warm_start=False, early_stopping='auto', scoring='loss', - validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, + monotonic_cst=None, warm_start=False, early_stopping='auto', + scoring='loss', validation_fraction=0.1, + n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingRegressor, self).__init__( loss=loss, learning_rate=learning_rate, max_iter=max_iter, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, - scoring=scoring, validation_fraction=validation_fraction, + monotonic_cst=monotonic_cst, early_stopping=early_stopping, + warm_start=warm_start, scoring=scoring, + validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) @@ -978,6 +981,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255. + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a positive constraint, negative + constraint and no constraint. Read more in the :ref:`User Guide + `. warm_start : bool, optional (default=False) When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble. 
For results to be valid, the @@ -1056,10 +1064,11 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, _VALID_LOSSES = ('binary_crossentropy', 'categorical_crossentropy', 'auto') - def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, + @_deprecate_positional_args + def __init__(self, loss='auto', *, learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, - l2_regularization=0., max_bins=255, warm_start=False, - early_stopping='auto', scoring='loss', + l2_regularization=0., max_bins=255, monotonic_cst=None, + warm_start=False, early_stopping='auto', scoring='loss', validation_fraction=0.1, n_iter_no_change=10, tol=1e-7, verbose=0, random_state=None): super(HistGradientBoostingClassifier, self).__init__( @@ -1067,8 +1076,9 @@ def __init__(self, loss='auto', learning_rate=0.1, max_iter=100, max_leaf_nodes=max_leaf_nodes, max_depth=max_depth, min_samples_leaf=min_samples_leaf, l2_regularization=l2_regularization, max_bins=max_bins, - warm_start=warm_start, early_stopping=early_stopping, - scoring=scoring, validation_fraction=validation_fraction, + monotonic_cst=monotonic_cst, warm_start=warm_start, + early_stopping=early_stopping, scoring=scoring, + validation_fraction=validation_fraction, n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose, random_state=random_state) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index bbee8f6c4585c..e0b54550d3082 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -17,6 +17,7 @@ from .utils import sum_parallel from .common import PREDICTOR_RECORD_DTYPE from .common import Y_DTYPE +from .common import MonotonicConstraint EPS = np.finfo(Y_DTYPE).eps # to avoid zero division errors @@ -71,7 +72,6 @@ class TreeNode: split_info = None left_child = None right_child = None - value = None histograms = None sibling = None parent = None @@ -88,13 +88,25 @@ class TreeNode: partition_stop = 0 def __init__(self, depth, sample_indices, sum_gradients, - sum_hessians, parent=None): + sum_hessians, parent=None, value=None): self.depth = depth self.sample_indices = sample_indices self.n_samples = sample_indices.shape[0] self.sum_gradients = sum_gradients self.sum_hessians = sum_hessians self.parent = parent + self.value = value + self.is_leaf = False + self.set_children_bounds(float('-inf'), float('+inf')) + + def set_children_bounds(self, lower, upper): + """Set children values bounds to respect monotonic constraints.""" + + # These are bounds for the node's *children* values, not the node's + # value. The bounds are used in the splitter when considering potential + # left and right child. + self.children_lower_bound = lower + self.children_upper_bound = upper def __lt__(self, other_node): """Comparison for priority queue. 
@@ -167,8 +179,8 @@ class TreeGrower: def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, max_depth=None, min_samples_leaf=20, min_gain_to_split=0., n_bins=256, n_bins_non_missing=None, has_missing_values=False, - l2_regularization=0., min_hessian_to_split=1e-3, - shrinkage=1.): + monotonic_cst=None, l2_regularization=0., + min_hessian_to_split=1e-3, shrinkage=1.): self._validate_parameters(X_binned, max_leaf_nodes, max_depth, min_samples_leaf, min_gain_to_split, @@ -189,17 +201,42 @@ def __init__(self, X_binned, gradients, hessians, max_leaf_nodes=None, has_missing_values = [has_missing_values] * X_binned.shape[1] has_missing_values = np.asarray(has_missing_values, dtype=np.uint8) + if monotonic_cst is None: + self.with_monotonic_cst = False + monotonic_cst = np.full(shape=X_binned.shape[1], + fill_value=MonotonicConstraint.NO_CST, + dtype=np.int8) + else: + self.with_monotonic_cst = True + monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) + + if monotonic_cst.shape[0] != X_binned.shape[1]: + raise ValueError( + "monotonic_cst has shape {} but the input data " + "X has {} features.".format( + monotonic_cst.shape[0], X_binned.shape[1] + ) + ) + if np.any(monotonic_cst < -1) or np.any(monotonic_cst > 1): + raise ValueError( + "monotonic_cst must be None or an array-like of " + "-1, 0 or 1." + ) + hessians_are_constant = hessians.shape[0] == 1 self.histogram_builder = HistogramBuilder( X_binned, n_bins, gradients, hessians, hessians_are_constant) missing_values_bin_idx = n_bins - 1 self.splitter = Splitter( X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, min_hessian_to_split, + has_missing_values, monotonic_cst, + l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) self.n_bins_non_missing = n_bins_non_missing self.max_leaf_nodes = max_leaf_nodes self.has_missing_values = has_missing_values + self.monotonic_cst = monotonic_cst + self.l2_regularization = l2_regularization self.n_features = X_binned.shape[1] self.max_depth = max_depth self.min_samples_leaf = min_samples_leaf @@ -252,6 +289,20 @@ def grow(self): while self.splittable_nodes: self.split_next() + self._apply_shrinkage() + + def _apply_shrinkage(self): + """Multiply leaves values by shrinkage parameter. + + This must be done at the very end of the growing process. If this were + done during the growing process e.g. in finalize_leaf(), then a leaf + would be shrunk but its sibling would potentially not be (if it's a + non-leaf), which would lead to a wrong computation of the 'middle' + value needed to enforce the monotonic constraints. 
+ """ + for leaf in self.finalized_leaves: + leaf.value *= self.shrinkage + def _intilialize_root(self, gradients, hessians, hessians_are_constant): """Initialize root node and finalize it if needed.""" n_samples = self.X_binned.shape[0] @@ -265,7 +316,8 @@ def _intilialize_root(self, gradients, hessians, hessians_are_constant): depth=depth, sample_indices=self.splitter.partition, sum_gradients=sum_gradients, - sum_hessians=sum_hessians + sum_hessians=sum_hessians, + value=0 ) self.root.partition_start = 0 @@ -294,7 +346,8 @@ def _compute_best_split_and_push(self, node): node.split_info = self.splitter.find_node_split( node.n_samples, node.histograms, node.sum_gradients, - node.sum_hessians) + node.sum_hessians, node.value, node.children_lower_bound, + node.children_upper_bound) if node.split_info.gain <= 0: # no valid split self._finalize_leaf(node) @@ -329,12 +382,17 @@ def split_next(self): sample_indices_left, node.split_info.sum_gradient_left, node.split_info.sum_hessian_left, - parent=node) + parent=node, + value=node.split_info.value_left, + ) right_child_node = TreeNode(depth, sample_indices_right, node.split_info.sum_gradient_right, node.split_info.sum_hessian_right, - parent=node) + parent=node, + value=node.split_info.value_right, + ) + left_child_node.sibling = right_child_node right_child_node.sibling = left_child_node node.right_child = right_child_node @@ -372,10 +430,29 @@ def split_next(self): if right_child_node.n_samples < self.min_samples_leaf * 2: self._finalize_leaf(right_child_node) - # Compute histograms of childs, and compute their best possible split + if self.with_monotonic_cst: + # Set value bounds for respecting monotonic constraints + # See test_nodes_values() for details + if (self.monotonic_cst[node.split_info.feature_idx] == + MonotonicConstraint.NO_CST): + lower_left = lower_right = node.children_lower_bound + upper_left = upper_right = node.children_upper_bound + else: + mid = (left_child_node.value + right_child_node.value) / 2 + if (self.monotonic_cst[node.split_info.feature_idx] == + MonotonicConstraint.POS): + lower_left, upper_left = node.children_lower_bound, mid + lower_right, upper_right = mid, node.children_upper_bound + else: # NEG + lower_left, upper_left = mid, node.children_upper_bound + lower_right, upper_right = node.children_lower_bound, mid + left_child_node.set_children_bounds(lower_left, upper_left) + right_child_node.set_children_bounds(lower_right, upper_right) + + # Compute histograms of children, and compute their best possible split # (if needed) - should_split_left = left_child_node.value is None # node isn't a leaf - should_split_right = right_child_node.value is None + should_split_left = not left_child_node.is_leaf + should_split_right = not right_child_node.is_leaf if should_split_left or should_split_right: # We will compute the histograms of both nodes even if one of them @@ -412,17 +489,9 @@ def split_next(self): return left_child_node, right_child_node def _finalize_leaf(self, node): - """Compute the prediction value that minimizes the objective function. + """Make node a leaf of the tree being grown.""" - This sets the node.value attribute (node is a leaf iff node.value is - not None). - - See Equation 5 of: - XGBoost: A Scalable Tree Boosting System, T. Chen, C. 
Guestrin, 2016 - https://arxiv.org/abs/1603.02754 - """ - node.value = -self.shrinkage * node.sum_gradients / ( - node.sum_hessians + self.splitter.l2_regularization + EPS) + node.is_leaf = True self.finalized_leaves.append(node) def _finalize_splittable_nodes(self): @@ -464,10 +533,11 @@ def _fill_predictor_node_array(predictor_nodes, grower_node, else: node['gain'] = -1 - if grower_node.value is not None: + node['value'] = grower_node.value + + if grower_node.is_leaf: # Leaf node node['is_leaf'] = True - node['value'] = grower_node.value return next_free_idx + 1 else: # Decision node diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 0e74d6ba38c71..43405551ef357 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -19,11 +19,13 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: from openmp cimport omp_get_max_threads from libc.stdlib cimport malloc, free from libc.string cimport memcpy +from numpy.math cimport INFINITY from .common cimport X_BINNED_DTYPE_C from .common cimport Y_DTYPE_C from .common cimport hist_struct from .common import HISTOGRAM_DTYPE +from .common cimport MonotonicConstraint cdef struct split_info_struct: @@ -39,6 +41,8 @@ cdef struct split_info_struct: Y_DTYPE_C sum_hessian_right unsigned int n_samples_left unsigned int n_samples_right + Y_DTYPE_C value_left + Y_DTYPE_C value_right class SplitInfo: @@ -70,7 +74,7 @@ class SplitInfo: def __init__(self, gain, feature_idx, bin_idx, missing_go_to_left, sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, n_samples_left, - n_samples_right): + n_samples_right, value_left, value_right): self.gain = gain self.feature_idx = feature_idx self.bin_idx = bin_idx @@ -81,6 +85,8 @@ class SplitInfo: self.sum_hessian_right = sum_hessian_right self.n_samples_left = n_samples_left self.n_samples_right = n_samples_right + self.value_left = value_left + self.value_right = value_right @cython.final @@ -126,6 +132,7 @@ cdef class Splitter: const unsigned int [::1] n_bins_non_missing unsigned char missing_values_bin_idx const unsigned char [::1] has_missing_values + const char [::1] monotonic_cst unsigned char hessians_are_constant Y_DTYPE_C l2_regularization Y_DTYPE_C min_hessian_to_split @@ -141,6 +148,7 @@ cdef class Splitter: const unsigned int [::1] n_bins_non_missing, const unsigned char missing_values_bin_idx, const unsigned char [::1] has_missing_values, + const char [::1] monotonic_cst, Y_DTYPE_C l2_regularization, Y_DTYPE_C min_hessian_to_split=1e-3, unsigned int min_samples_leaf=20, @@ -152,6 +160,7 @@ cdef class Splitter: self.n_bins_non_missing = n_bins_non_missing self.missing_values_bin_idx = missing_values_bin_idx self.has_missing_values = has_missing_values + self.monotonic_cst = monotonic_cst self.l2_regularization = l2_regularization self.min_hessian_to_split = min_hessian_to_split self.min_samples_leaf = min_samples_leaf @@ -350,7 +359,11 @@ cdef class Splitter: unsigned int n_samples, hist_struct [:, ::1] histograms, # IN const Y_DTYPE_C sum_gradients, - const Y_DTYPE_C sum_hessians): + const Y_DTYPE_C sum_hessians, + const Y_DTYPE_C value, + const Y_DTYPE_C lower_bound=-INFINITY, + const Y_DTYPE_C upper_bound=INFINITY, + ): """For each feature, find the best bin to split on at a given node. Return the best split info among all features. @@ -366,6 +379,22 @@ cdef class Splitter: The sum of the gradients for each sample at the node. 
sum_hessians : float The sum of the hessians for each sample at the node. + value : float + The bounded value of the current node. We directly pass the value + instead of re-computing it from sum_gradients and sum_hessians, + because we need to compute the loss and the gain based on the + *bounded* value: computing the value from + sum_gradients / sum_hessians would give the unbounded value, and + the interaction with min_gain_to_split would not be correct + anymore. Side note: we can't use the lower_bound / upper_bound + parameters either because these refer to the bounds of the + children, not the bounds of the current node. + lower_bound : float + Lower bound for the children values for respecting the monotonic + constraints. + upper_bound : float + Upper bound for the children values for respecting the monotonic + constraints. Returns ------- @@ -378,7 +407,8 @@ cdef class Splitter: int n_features = self.n_features split_info_struct split_info split_info_struct * split_infos - const unsigned char [:] has_missing_values = self.has_missing_values + const unsigned char [::1] has_missing_values = self.has_missing_values + const char [::1] monotonic_cst = self.monotonic_cst with nogil: @@ -386,6 +416,8 @@ cdef class Splitter: self.n_features * sizeof(split_info_struct)) for feature_idx in prange(n_features, schedule='static'): + split_infos[feature_idx].feature_idx = feature_idx + # For each feature, find best bin to split on # Start with a gain of -1 (if no better split is found, that # means one of the constraints isn't respected @@ -404,7 +436,8 @@ cdef class Splitter: self._find_best_bin_to_split_left_to_right( feature_idx, has_missing_values[feature_idx], histograms, n_samples, sum_gradients, sum_hessians, - &split_infos[feature_idx]) + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[feature_idx]) if has_missing_values[feature_idx]: # We need to explore both directions to check whether @@ -412,7 +445,9 @@ cdef class Splitter: # gain self._find_best_bin_to_split_right_to_left( feature_idx, histograms, n_samples, - sum_gradients, sum_hessians, &split_infos[feature_idx]) + sum_gradients, sum_hessians, + value, monotonic_cst[feature_idx], + lower_bound, upper_bound, &split_infos[feature_idx]) # then compute best possible split among all features best_feature_idx = self._find_best_feature_to_split_helper( @@ -430,6 +465,8 @@ cdef class Splitter: split_info.sum_hessian_right, split_info.n_samples_left, split_info.n_samples_right, + split_info.value_left, + split_info.value_right, ) free(split_infos) return out @@ -456,6 +493,10 @@ cdef class Splitter: unsigned int n_samples, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. @@ -481,15 +522,20 @@ cdef class Splitter: Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right - Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C loss_current_node Y_DTYPE_C gain + unsigned char found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = -1 sum_gradient_left, sum_hessian_left = 0., 0. 
n_samples_left = 0 - negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, - self.l2_regularization) + loss_current_node = _loss_from_value(value, sum_gradients) for bin_idx in range(end): n_samples_left += histograms[feature_idx, bin_idx].count @@ -519,21 +565,40 @@ cdef class Splitter: gain = _split_gain(sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, - negative_loss_current_node, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, self.l2_regularization) - if gain > split_info.gain and gain > self.min_gain_to_split: - split_info.gain = gain - split_info.feature_idx = feature_idx - split_info.bin_idx = bin_idx - # we scan from left to right so missing values go to the right - split_info.missing_go_to_left = False - split_info.sum_gradient_left = sum_gradient_left - split_info.sum_gradient_right = sum_gradient_right - split_info.sum_hessian_left = sum_hessian_left - split_info.sum_hessian_right = sum_hessian_right - split_info.n_samples_left = n_samples_left - split_info.n_samples_right = n_samples_right + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from left to right so missing values go to the right + split_info.missing_go_to_left = False + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) cdef void _find_best_bin_to_split_right_to_left( self, @@ -542,6 +607,10 @@ cdef class Splitter: unsigned int n_samples, Y_DTYPE_C sum_gradients, Y_DTYPE_C sum_hessians, + Y_DTYPE_C value, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, split_info_struct * split_info) nogil: # OUT """Find best bin to split on for a given feature. @@ -565,15 +634,21 @@ cdef class Splitter: Y_DTYPE_C sum_hessian_right Y_DTYPE_C sum_gradient_left Y_DTYPE_C sum_gradient_right - Y_DTYPE_C negative_loss_current_node + Y_DTYPE_C loss_current_node Y_DTYPE_C gain unsigned int start = self.n_bins_non_missing[feature_idx] - 2 + unsigned char found_better_split = False + + Y_DTYPE_C best_sum_hessian_left + Y_DTYPE_C best_sum_gradient_left + unsigned int best_bin_idx + unsigned int best_n_samples_left + Y_DTYPE_C best_gain = split_info.gain # computed during previous scan sum_gradient_right, sum_hessian_right = 0., 0. 
n_samples_right = 0 - negative_loss_current_node = negative_loss(sum_gradients, - sum_hessians, - self.l2_regularization) + + loss_current_node = _loss_from_value(value, sum_gradients) for bin_idx in range(start, -1, -1): n_samples_right += histograms[feature_idx, bin_idx + 1].count @@ -604,28 +679,51 @@ cdef class Splitter: gain = _split_gain(sum_gradient_left, sum_hessian_left, sum_gradient_right, sum_hessian_right, - negative_loss_current_node, + loss_current_node, + monotonic_cst, + lower_bound, + upper_bound, self.l2_regularization) - if gain > split_info.gain and gain > self.min_gain_to_split: - split_info.gain = gain - split_info.feature_idx = feature_idx - split_info.bin_idx = bin_idx - # we scan from right to left so missing values go to the left - split_info.missing_go_to_left = True - split_info.sum_gradient_left = sum_gradient_left - split_info.sum_gradient_right = sum_gradient_right - split_info.sum_hessian_left = sum_hessian_left - split_info.sum_hessian_right = sum_hessian_right - split_info.n_samples_left = n_samples_left - split_info.n_samples_right = n_samples_right + if gain > best_gain and gain > self.min_gain_to_split: + found_better_split = True + best_gain = gain + best_bin_idx = bin_idx + best_sum_gradient_left = sum_gradient_left + best_sum_hessian_left = sum_hessian_left + best_n_samples_left = n_samples_left + + if found_better_split: + split_info.gain = best_gain + split_info.bin_idx = best_bin_idx + # we scan from right to left so missing values go to the left + split_info.missing_go_to_left = True + split_info.sum_gradient_left = best_sum_gradient_left + split_info.sum_gradient_right = sum_gradients - best_sum_gradient_left + split_info.sum_hessian_left = best_sum_hessian_left + split_info.sum_hessian_right = sum_hessians - best_sum_hessian_left + split_info.n_samples_left = best_n_samples_left + split_info.n_samples_right = n_samples - best_n_samples_left + + # We recompute best values here but it's cheap + split_info.value_left = compute_node_value( + split_info.sum_gradient_left, split_info.sum_hessian_left, + lower_bound, upper_bound, self.l2_regularization) + + split_info.value_right = compute_node_value( + split_info.sum_gradient_right, split_info.sum_hessian_right, + lower_bound, upper_bound, self.l2_regularization) + cdef inline Y_DTYPE_C _split_gain( Y_DTYPE_C sum_gradient_left, Y_DTYPE_C sum_hessian_left, Y_DTYPE_C sum_gradient_right, Y_DTYPE_C sum_hessian_right, - Y_DTYPE_C negative_loss_current_node, + Y_DTYPE_C loss_current_node, + char monotonic_cst, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, Y_DTYPE_C l2_regularization) nogil: """Loss reduction @@ -638,18 +736,44 @@ cdef inline Y_DTYPE_C _split_gain( """ cdef: Y_DTYPE_C gain - gain = negative_loss(sum_gradient_left, sum_hessian_left, - l2_regularization) - gain += negative_loss(sum_gradient_right, sum_hessian_right, - l2_regularization) - gain -= negative_loss_current_node + Y_DTYPE_C value_left + Y_DTYPE_C value_right + + # Compute values of potential left and right children + value_left = compute_node_value(sum_gradient_left, sum_hessian_left, + lower_bound, upper_bound, + l2_regularization) + value_right = compute_node_value(sum_gradient_right, sum_hessian_right, + lower_bound, upper_bound, + l2_regularization) + + if ((monotonic_cst == MonotonicConstraint.POS and value_left > value_right) or + (monotonic_cst == MonotonicConstraint.NEG and value_left < value_right)): + # don't consider this split since it does not respect the monotonic + # constraints. 
Note that these comparisons need to be done on values + # that have already been clipped to take the monotonic constraints into + # account (if any). + return -1 + + gain = loss_current_node + gain -= _loss_from_value(value_left, sum_gradient_left) + gain -= _loss_from_value(value_right, sum_gradient_right) + # Note that for the gain to be correct (and for min_gain_to_split to work + # as expected), we need all values to be bounded (current node, left child + # and right child). + return gain -cdef inline Y_DTYPE_C negative_loss( - Y_DTYPE_C gradient, - Y_DTYPE_C hessian, - Y_DTYPE_C l2_regularization) nogil: - return (gradient * gradient) / (hessian + l2_regularization) +cdef inline Y_DTYPE_C _loss_from_value( + Y_DTYPE_C value, + Y_DTYPE_C sum_gradient) nogil: + """Return loss of a node from its (bounded) value + + See Equation 6 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + return sum_gradient * value cdef inline unsigned char sample_goes_left( unsigned char missing_go_to_left, @@ -666,3 +790,32 @@ cdef inline unsigned char sample_goes_left( or ( bin_value <= split_bin_idx )) + + +cpdef inline Y_DTYPE_C compute_node_value( + Y_DTYPE_C sum_gradient, + Y_DTYPE_C sum_hessian, + Y_DTYPE_C lower_bound, + Y_DTYPE_C upper_bound, + Y_DTYPE_C l2_regularization) nogil: + """Compute a node's value. + + The value is capped in the [lower_bound, upper_bound] interval to respect + monotonic constraints. Shrinkage is ignored. + + See Equation 5 of: + XGBoost: A Scalable Tree Boosting System, T. Chen, C. Guestrin, 2016 + https://arxiv.org/abs/1603.02754 + """ + + cdef: + Y_DTYPE_C value + + value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) + + if value < lower_bound: + value = lower_bound + elif value > upper_bound: + value = upper_bound + + return value \ No newline at end of file diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 13cd8eac1cb7d..1b61e65793422 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -659,3 +659,25 @@ def test_early_stopping_on_test_set_with_warm_start(): # does not raise on second call gb.set_params(max_iter=2) gb.fit(X, y) + + +@pytest.mark.parametrize('Est', (HistGradientBoostingClassifier, + HistGradientBoostingRegressor)) +def test_single_node_trees(Est): + # Make sure it's still possible to build single-node trees. In that case + # the value of the root is set to 0. That's a correct value: if the tree is + # single-node that's because min_gain_to_split is not respected right from + # the root, so we don't want the tree to have any impact on the + # predictions. 
+ + X, y = make_classification(random_state=0) + y[:] = 1 # constant target will lead to a single root node + + est = Est(max_iter=20) + est.fit(X, y) + + assert all(len(predictor[0].nodes) == 1 for predictor in est._predictors) + assert all(predictor[0].nodes[0]['value'] == 0 + for predictor in est._predictors) + # Still gives correct predictions thanks to the baseline prediction + assert_allclose(est.predict(X), y) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index d770b50e7aa30..73be2e4f4d155 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -134,6 +134,8 @@ def test_grow_tree(n_bins, constant_hessian, stopping_param, shrinkage): # All the leafs are pure, it is not possible to split any further: assert not grower.splittable_nodes + grower._apply_shrinkage() + # Check the values of the leaves: assert grower.root.left_child.value == approx(shrinkage) assert grower.root.right_child.left_child.value == approx(shrinkage) @@ -393,5 +395,5 @@ def test_split_on_nan_with_infinite_values(): predictions = predictor.predict(X) predictions_binned = predictor.predict_binned( X_binned, missing_values_bin_idx=bin_mapper.missing_values_bin_idx_) - assert np.all(predictions == -gradients) - assert np.all(predictions_binned == -gradients) + np.testing.assert_allclose(predictions, -gradients) + np.testing.assert_allclose(predictions_binned, -gradients) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py new file mode 100644 index 0000000000000..d4e4c8976caed --- /dev/null +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_contraints.py @@ -0,0 +1,341 @@ +import numpy as np +import pytest + +from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE +from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value +) +from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.ensemble import HistGradientBoostingClassifier + + +def is_increasing(a): + return (np.diff(a) >= 0.0).all() + + +def is_decreasing(a): + return (np.diff(a) <= 0.0).all() + + +def assert_leaves_values_monotonic(predictor, monotonic_cst): + # make sure leaves values (from left to right) are either all increasing + # or all decreasing (or neither) depending on the monotonic constraint. 
+ nodes = predictor.nodes + + def get_leaves_values(): + """get leaves values from left to right""" + values = [] + + def depth_first_collect_leaf_values(node_idx): + node = nodes[node_idx] + if node['is_leaf']: + values.append(node['value']) + return + depth_first_collect_leaf_values(node['left']) + depth_first_collect_leaf_values(node['right']) + + depth_first_collect_leaf_values(0) # start at root (0) + return values + + values = get_leaves_values() + + if monotonic_cst == MonotonicConstraint.NO_CST: + # some increasing, some decreasing + assert not is_increasing(values) and not is_decreasing(values) + elif monotonic_cst == MonotonicConstraint.POS: + # all increasing + assert is_increasing(values) + else: # NEG + # all decreasing + assert is_decreasing(values) + + +def assert_children_values_monotonic(predictor, monotonic_cst): + # Make sure siblings values respect the monotonic constraints. Left should + # be lower (resp greater) than right child if constraint is POS (resp. + # NEG). + # Note that this property alone isn't enough to ensure full monotonicity, + # since we also need to guanrantee that all the descendents of the left + # child won't be greater (resp. lower) than the right child, or its + # descendents. That's why we need to bound the predicted values (this is + # tested in assert_children_values_bounded) + nodes = predictor.nodes + left_lower = [] + left_greater = [] + for node in nodes: + if node['is_leaf']: + continue + + left_idx = node['left'] + right_idx = node['right'] + + if nodes[left_idx]['value'] < nodes[right_idx]['value']: + left_lower.append(node) + elif nodes[left_idx]['value'] > nodes[right_idx]['value']: + left_greater.append(node) + + if monotonic_cst == MonotonicConstraint.NO_CST: + assert left_lower and left_greater + elif monotonic_cst == MonotonicConstraint.POS: + assert left_lower and not left_greater + else: # NEG + assert not left_lower and left_greater + + +def assert_children_values_bounded(grower, monotonic_cst): + # Make sure that the values of the children of a node are bounded by the + # middle value between that node and its sibling (if there is a monotonic + # constraint). 
+ # As a bonus, we also check that the siblings values are properly ordered + # which is slightly redundant with assert_children_values_monotonic (but + # this check is done on the grower nodes whereas + # assert_children_values_monotonic is done on the predictor nodes) + + if monotonic_cst == MonotonicConstraint.NO_CST: + return + + def recursively_check_children_node_values(node): + if node.is_leaf: + return + if node is not grower.root and node is node.parent.left_child: + sibling = node.sibling # on the right + middle = (node.value + sibling.value) / 2 + if monotonic_cst == MonotonicConstraint.POS: + assert (node.left_child.value <= + node.right_child.value <= + middle) + if not sibling.is_leaf: + assert (middle <= + sibling.left_child.value <= + sibling.right_child.value) + else: # NEG + assert (node.left_child.value >= + node.right_child.value >= + middle) + if not sibling.is_leaf: + assert (middle >= + sibling.left_child.value >= + sibling.right_child.value) + + recursively_check_children_node_values(node.left_child) + recursively_check_children_node_values(node.right_child) + + recursively_check_children_node_values(grower.root) + + +@pytest.mark.parametrize('seed', range(3)) +@pytest.mark.parametrize('monotonic_cst', ( + MonotonicConstraint.NO_CST, + MonotonicConstraint.POS, + MonotonicConstraint.NEG, +)) +def test_nodes_values(monotonic_cst, seed): + # Build a single tree with only one feature, and make sure the nodes + # values respect the monotonic constraints. + + # Considering the following tree with a monotonic POS constraint, we + # should have: + # + # root + # / \ + # 5 10 # middle = 7.5 + # / \ / \ + # a b c d + # + # a <= b and c <= d (assert_children_values_monotonic) + # a, b <= middle <= c, d (assert_children_values_bounded) + # a <= b <= c <= d (assert_leaves_values_monotonic) + # + # The last one is a consequence of the others, but can't hurt to check + + rng = np.random.RandomState(seed) + n_samples = 1000 + n_features = 1 + X_binned = rng.randint(0, 255, size=(n_samples, n_features), + dtype=np.uint8) + X_binned = np.asfortranarray(X_binned) + + gradients = rng.normal(size=n_samples).astype(G_H_DTYPE) + hessians = np.ones(shape=1, dtype=G_H_DTYPE) + + grower = TreeGrower(X_binned, gradients, hessians, + monotonic_cst=[monotonic_cst], + shrinkage=.1) + grower.grow() + + # grow() will shrink the leaves values at the very end. For our comparison + # tests, we need to revert the shrinkage of the leaves, else we would + # compare the value of a leaf (shrunk) with a node (not shrunk) and the + # test would not be correct. + for leave in grower.finalized_leaves: + leave.value /= grower.shrinkage + + # The consistency of the bounds can only be checked on the tree grower + # as the node bounds are not copied into the predictor tree. The + # consistency checks on the values of node children and leaves can be + # done either on the grower tree or on the predictor tree. We only + # do those checks on the predictor tree as the latter is derived from + # the former. + predictor = grower.make_predictor() + assert_children_values_monotonic(predictor, monotonic_cst) + assert_children_values_bounded(grower, monotonic_cst) + assert_leaves_values_monotonic(predictor, monotonic_cst) + + +@pytest.mark.parametrize('seed', range(3)) +def test_predictions(seed): + # Train a model with a POS constraint on the first feature and a NEG + # constraint on the second feature, and make sure the constraints are + # respected by checking the predictions. 
+ # test adapted from lightgbm's test_monotone_constraint(), itself inspired + # by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html + + rng = np.random.RandomState(seed) + + n_samples = 1000 + f_0 = rng.rand(n_samples) # positive correlation with y + f_1 = rng.rand(n_samples) # negative correslation with y + X = np.c_[f_0, f_1] + noise = rng.normal(loc=0.0, scale=0.01, size=n_samples) + y = (5 * f_0 + np.sin(10 * np.pi * f_0) - + 5 * f_1 - np.cos(10 * np.pi * f_1) + + noise) + + gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1]) + gbdt.fit(X, y) + + linspace = np.linspace(0, 1, 100) + sin = np.sin(linspace) + constant = np.full_like(linspace, fill_value=.5) + + # We now assert the predictions properly respect the constraints, on each + # feature. When testing for a feature we need to set the other one to a + # constant, because the monotonic constraints are only a "all else being + # equal" type of constraints: + # a constraint on the first feature only means that + # x0 < x0' => f(x0, x1) < f(x0', x1) + # while x1 stays constant. + # The constraint does not guanrantee that + # x0 < x0' => f(x0, x1) < f(x0', x1') + + # First feature (POS) + # assert pred is all increasing when f_0 is all increasing + X = np.c_[linspace, constant] + pred = gbdt.predict(X) + assert is_increasing(pred) + # assert pred actually follows the variations of f_0 + X = np.c_[sin, constant] + pred = gbdt.predict(X) + assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0)) + + # Second feature (NEG) + # assert pred is all decreasing when f_1 is all increasing + X = np.c_[constant, linspace] + pred = gbdt.predict(X) + assert is_decreasing(pred) + # assert pred actually follows the inverse variations of f_1 + X = np.c_[constant, sin] + pred = gbdt.predict(X) + assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all() + + +def test_input_error(): + X = [[1, 2], [2, 3], [3, 4]] + y = [0, 1, 2] + + gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, 0, -1]) + with pytest.raises(ValueError, + match='monotonic_cst has shape 3 but the input data'): + gbdt.fit(X, y) + + for monotonic_cst in ([1, 3], [1, -3]): + gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst) + with pytest.raises(ValueError, + match='must be None or an array-like of ' + '-1, 0 or 1'): + gbdt.fit(X, y) + + gbdt = HistGradientBoostingClassifier(monotonic_cst=[0, 1]) + with pytest.raises( + ValueError, + match='monotonic constraints are not supported ' + 'for multiclass classification' + ): + gbdt.fit(X, y) + + +def test_bounded_value_min_gain_to_split(): + # The purpose of this test is to show that when computing the gain at a + # given split, the value of the current node should be properly bounded to + # respect the monotonic constraints, because it strongly interacts with + # min_gain_to_split. We build a simple example where gradients are [1, 1, + # 100, 1, 1] (hessians are all ones). The best split happens on the 3rd + # bin, and depending on whether the value of the node is bounded or not, + # the min_gain_to_split constraint is or isn't satisfied. 
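Worked numbers behind that setup, using loss(node) = sum_gradient * value and value = -sum_gradient / (sum_hessian + l2_regularization) as defined in the splitting code above; only the figures quoted in the comments (about 1307, -104/5, 2000) come from the test, the intermediate steps are derived here for illustration:

    unbounded root:  value = -104 / 5 = -20.8     ->  loss(root)  = 104 * -20.8 = -2163.2
    best split: gradients {1, 1, 100} on the left, {1, 1} on the right
        value(left)  = -102 / 3 = -34             ->  loss(left)  = 102 * -34 = -3468
        value(right) = -2 / 2   = -1              ->  loss(right) =   2 * -1  = -2
    gain = loss(root) - loss(left) - loss(right)
         = -2163.2 + 3468 + 2 = about 1307, which is below min_gain_to_split = 2000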
+ l2_regularization = 0 + min_hessian_to_split = 0 + min_samples_leaf = 1 + n_bins = n_samples = 5 + X_binned = np.arange(n_samples).reshape(-1, 1).astype(X_BINNED_DTYPE) + sample_indices = np.arange(n_samples, dtype=np.uint32) + all_hessians = np.ones(n_samples, dtype=G_H_DTYPE) + all_gradients = np.array([1, 1, 100, 1, 1], dtype=G_H_DTYPE) + sum_gradients = all_gradients.sum() + sum_hessians = all_hessians.sum() + hessians_are_constant = False + + builder = HistogramBuilder(X_binned, n_bins, all_gradients, + all_hessians, hessians_are_constant) + n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], + dtype=np.uint32) + has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) + missing_values_bin_idx = n_bins - 1 + children_lower_bound, children_upper_bound = -np.inf, np.inf + + min_gain_to_split = 2000 + splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, + has_missing_values, monotonic_cst, l2_regularization, + min_hessian_to_split, min_samples_leaf, + min_gain_to_split, hessians_are_constant) + + histograms = builder.compute_histograms_brute(sample_indices) + + # Since the gradient array is [1, 1, 100, 1, 1] + # the max possible gain happens on the 3rd bin (or equivalently in the 2nd) + # and is equal to about 1307, which less than min_gain_to_split = 2000, so + # the node is considered unsplittable (gain = -1) + current_lower_bound, current_upper_bound = -np.inf, np.inf + value = compute_node_value(sum_gradients, sum_hessians, + current_lower_bound, current_upper_bound, + l2_regularization) + # the unbounded value is equal to -sum_gradients / sum_hessians + assert value == pytest.approx(-104 / 5) + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians, value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound) + assert split_info.gain == -1 # min_gain_to_split not respected + + # here again the max possible gain is on the 3rd bin but we now cap the + # value of the node into [-10, inf]. + # This means the gain is now about 2430 which is more than the + # min_gain_to_split constraint. 
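Continuing the worked numbers from above for illustration: clipping only changes the current node's term, the children losses stay the same:

    bounded root:  value = clip(-20.8, lower=-10) = -10  ->  loss(root) = 104 * -10 = -1040
    gain = -1040 + 3468 + 2 = 2430, which exceeds min_gain_to_split = 2000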
+ current_lower_bound, current_upper_bound = -10, np.inf + value = compute_node_value(sum_gradients, sum_hessians, + current_lower_bound, current_upper_bound, + l2_regularization) + assert value == -10 + split_info = splitter.find_node_split(n_samples, histograms, + sum_gradients, sum_hessians, value, + lower_bound=children_lower_bound, + upper_bound=children_upper_bound) + assert split_info.gain > min_gain_to_split diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py index 5f80f99f05116..bcc19d750d9d3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_splitting.py @@ -4,7 +4,11 @@ from sklearn.ensemble._hist_gradient_boosting.common import HISTOGRAM_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import X_BINNED_DTYPE -from sklearn.ensemble._hist_gradient_boosting.splitting import Splitter +from sklearn.ensemble._hist_gradient_boosting.common import MonotonicConstraint +from sklearn.ensemble._hist_gradient_boosting.splitting import ( + Splitter, + compute_node_value +) from sklearn.ensemble._hist_gradient_boosting.histogram import HistogramBuilder from sklearn.utils._testing import skip_if_32bit @@ -43,20 +47,26 @@ def test_histogram_split(n_bins): dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, + monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split( sample_indices.shape[0], histograms, sum_gradients, - sum_hessians) + sum_hessians, value) assert split_info.bin_idx == true_bin assert split_info.gain >= 0 @@ -106,26 +116,40 @@ def test_gradient_and_hessian_sanity(constant_hessian): n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, constant_hessian) hists_parent = builder.compute_histograms_brute(sample_indices) + value_parent = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) si_parent = splitter.find_node_split(n_samples, hists_parent, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, + value_parent) sample_indices_left, sample_indices_right, _ = splitter.split_indices( si_parent, sample_indices) hists_left = builder.compute_histograms_brute(sample_indices_left) + value_left = compute_node_value(si_parent.sum_gradient_left, + si_parent.sum_hessian_left, + -np.inf, np.inf, l2_regularization) hists_right = builder.compute_histograms_brute(sample_indices_right) + value_right = 
compute_node_value(si_parent.sum_gradient_right, + si_parent.sum_hessian_right, + -np.inf, np.inf, l2_regularization) si_left = splitter.find_node_split(n_samples, hists_left, si_parent.sum_gradient_left, - si_parent.sum_hessian_left) + si_parent.sum_hessian_left, + value_left) si_right = splitter.find_node_split(n_samples, hists_right, si_parent.sum_gradient_right, - si_parent.sum_hessian_right) + si_parent.sum_hessian_right, + value_right) # make sure that si.sum_gradient_left + si.sum_gradient_right have their # expected value, same for hessians @@ -206,17 +230,22 @@ def test_split_indices(): n_bins_non_missing = np.array([n_bins] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) assert np.all(sample_indices == splitter.partition) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) si_root = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) # sanity checks for best split assert si_root.feature_idx == 1 @@ -263,15 +292,20 @@ def test_min_gain_to_split(): n_bins_non_missing = np.array([n_bins - 1] * X_binned.shape[1], dtype=np.uint32) has_missing_values = np.array([False] * X_binned.shape[1], dtype=np.uint8) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, - has_missing_values, l2_regularization, + has_missing_values, monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) assert split_info.gain == -1 @@ -347,7 +381,7 @@ def test_min_gain_to_split(): 3, # cut on bin_idx=3 False), # missing values go to right - ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 4 <=> missing + ([6, 6, 6, 6, 0, 1, 2, 3, 4, 5], # 6 <=> missing [1, 1, 1, 1, 5, 5, 5, 5, 5, 5], True, # missing values 6, # n_bins_non_missing @@ -400,16 +434,22 @@ def test_splitting_missing_values(X_binned, all_gradients, hessians_are_constant) n_bins_non_missing = np.array([n_bins_non_missing], dtype=np.uint32) + monotonic_cst = np.array( + [MonotonicConstraint.NO_CST] * X_binned.shape[1], + dtype=np.int8) missing_values_bin_idx = n_bins - 1 splitter = Splitter(X_binned, n_bins_non_missing, missing_values_bin_idx, has_missing_values, + monotonic_cst, l2_regularization, min_hessian_to_split, min_samples_leaf, min_gain_to_split, hessians_are_constant) histograms = builder.compute_histograms_brute(sample_indices) + value = compute_node_value(sum_gradients, sum_hessians, + -np.inf, np.inf, l2_regularization) split_info = splitter.find_node_split(n_samples, histograms, - sum_gradients, sum_hessians) + sum_gradients, sum_hessians, value) assert 
split_info.bin_idx == expected_bin_idx if has_missing_values: diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 501f2425541e8..9cec1c08efc9e 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -16,6 +16,7 @@ ) from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted, _num_samples +from ..utils.validation import _deprecate_positional_args from ..base import OutlierMixin from ._bagging import BaseBagging @@ -93,7 +94,7 @@ class IsolationForest(OutlierMixin, BaseBagging): processors. See :term:`Glossary ` for more details. behaviour : str, default='deprecated' - This parameter has not effect, is deprecated, and will be removed. + This parameter has no effect, is deprecated, and will be removed. .. versionadded:: 0.20 ``behaviour`` is added in 0.20 for back-compatibility purpose. @@ -181,8 +182,8 @@ class IsolationForest(OutlierMixin, BaseBagging): >>> clf.predict([[0.1], [0], [90]]) array([ 1, 1, -1]) """ - - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, n_estimators=100, max_samples="auto", contamination="auto", diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index ba817613523f6..a75e9236f1612 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -30,6 +30,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, @@ -37,7 +38,7 @@ class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble, """Base class for stacking method.""" @abstractmethod - def __init__(self, estimators, final_estimator=None, cv=None, + def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, verbose=0, passthrough=False): super().__init__(estimators=estimators) @@ -366,7 +367,8 @@ class StackingClassifier(ClassifierMixin, _BaseStacking): 0.9... """ - def __init__(self, estimators, final_estimator=None, cv=None, + @_deprecate_positional_args + def __init__(self, estimators, final_estimator=None, *, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0): super().__init__( @@ -603,8 +605,9 @@ class StackingRegressor(RegressorMixin, _BaseStacking): 0.3... 
""" - def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None, - passthrough=False, verbose=0): + @_deprecate_positional_args + def __init__(self, estimators, final_estimator=None, *, cv=None, + n_jobs=None, passthrough=False, verbose=0): super().__init__( estimators=estimators, final_estimator=final_estimator, diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 0da6dc86c30fa..cab321702c85d 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,6 +30,7 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError @@ -205,8 +206,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): >>> print(eclf3.transform(X).shape) (6, 6) """ - - def __init__(self, estimators, voting='hard', weights=None, + @_deprecate_positional_args + def __init__(self, estimators, *, voting='hard', weights=None, n_jobs=None, flatten_transform=True, verbose=False): super().__init__(estimators=estimators) self.voting = voting @@ -410,8 +411,9 @@ class VotingRegressor(RegressorMixin, _BaseVoting): >>> print(er.fit(X, y).predict(X)) [ 3.3 5.7 11.8 19.7 28. 40.3] """ - - def __init__(self, estimators, weights=None, n_jobs=None, verbose=False): + @_deprecate_positional_args + def __init__(self, estimators, *, weights=None, n_jobs=None, + verbose=False): super().__init__(estimators=estimators) self.weights = weights self.n_jobs = n_jobs diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 8b1e05502deac..7fc8f898a5ae0 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -41,6 +41,7 @@ from ..utils.validation import _check_sample_weight from ..utils.validation import has_fit_parameter from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args __all__ = [ 'AdaBoostClassifier', @@ -57,7 +58,7 @@ class BaseWeightBoosting(BaseEnsemble, metaclass=ABCMeta): @abstractmethod def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, estimator_params=tuple(), learning_rate=1., @@ -397,8 +398,9 @@ class AdaBoostClassifier(ClassifierMixin, BaseWeightBoosting): >>> clf.score(X, y) 0.983... """ + @_deprecate_positional_args def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, learning_rate=1., algorithm='SAMME.R', @@ -959,8 +961,9 @@ class AdaBoostRegressor(RegressorMixin, BaseWeightBoosting): .. [2] H. Drucker, "Improving Regressors using Boosting Techniques", 1997. 
""" + @_deprecate_positional_args def __init__(self, - base_estimator=None, + base_estimator=None, *, n_estimators=50, learning_rate=1., loss='linear', diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 0b504ee43a7ba..8144a095cec3a 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -15,6 +15,7 @@ import itertools from itertools import combinations from itertools import product +from typing import Dict, Any import numpy as np from scipy.sparse import csr_matrix @@ -100,12 +101,12 @@ "RandomTreesEmbedding": RandomTreesEmbedding, } -FOREST_ESTIMATORS = dict() +FOREST_ESTIMATORS: Dict[str, Any] = dict() FOREST_ESTIMATORS.update(FOREST_CLASSIFIERS) FOREST_ESTIMATORS.update(FOREST_REGRESSORS) FOREST_ESTIMATORS.update(FOREST_TRANSFORMERS) -FOREST_CLASSIFIERS_REGRESSORS = FOREST_CLASSIFIERS.copy() +FOREST_CLASSIFIERS_REGRESSORS: Dict[str, Any] = FOREST_CLASSIFIERS.copy() FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) @@ -1259,7 +1260,8 @@ def test_min_impurity_decrease(): assert tree.min_impurity_decrease == 0.1 -class MyBackend(DEFAULT_JOBLIB_BACKEND): +# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type +class MyBackend(DEFAULT_JOBLIB_BACKEND): # type: ignore def __init__(self, *args, **kwargs): self.count = 0 super().__init__(*args, **kwargs) diff --git a/sklearn/experimental/enable_hist_gradient_boosting.py b/sklearn/experimental/enable_hist_gradient_boosting.py index 6b0a6ad8a28bb..7de4f2e434de0 100644 --- a/sklearn/experimental/enable_hist_gradient_boosting.py +++ b/sklearn/experimental/enable_hist_gradient_boosting.py @@ -26,7 +26,11 @@ from .. import ensemble -ensemble.HistGradientBoostingClassifier = HistGradientBoostingClassifier -ensemble.HistGradientBoostingRegressor = HistGradientBoostingRegressor +# use settattr to avoid mypy errors when monkeypatching +setattr(ensemble, "HistGradientBoostingClassifier", + HistGradientBoostingClassifier) +setattr(ensemble, "HistGradientBoostingRegressor", + HistGradientBoostingRegressor) + ensemble.__all__ += ['HistGradientBoostingClassifier', 'HistGradientBoostingRegressor'] diff --git a/sklearn/experimental/enable_iterative_imputer.py b/sklearn/experimental/enable_iterative_imputer.py index 99d18a289aa99..eebe816980b0f 100644 --- a/sklearn/experimental/enable_iterative_imputer.py +++ b/sklearn/experimental/enable_iterative_imputer.py @@ -15,5 +15,6 @@ from ..impute._iterative import IterativeImputer from .. 
import impute -impute.IterativeImputer = IterativeImputer +# use settattr to avoid mypy errors when monkeypatching +setattr(impute, 'IterativeImputer', IterativeImputer) impute.__all__ += ['IterativeImputer'] diff --git a/sklearn/externals/_arff.py b/sklearn/externals/_arff.py index bf3cbfc9a9b98..63a39c3c15d4a 100644 --- a/sklearn/externals/_arff.py +++ b/sklearn/externals/_arff.py @@ -148,6 +148,8 @@ 'joel.nothman@gmail.com') __version__ = '2.4.0' +from typing import Optional + import re import sys import csv @@ -318,7 +320,7 @@ def _parse_values(s): # EXCEPTIONS ================================================================== class ArffException(Exception): - message = None + message : Optional[str] = None def __init__(self): self.line = -1 diff --git a/sklearn/externals/joblib/__init__.py b/sklearn/externals/joblib/__init__.py deleted file mode 100644 index 4fcf030c28853..0000000000000 --- a/sklearn/externals/joblib/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Import necessary to preserve backward compatibility of pickles -import sys -import warnings - -from joblib import * - - -msg = ("sklearn.externals.joblib is deprecated in 0.21 and will be removed " - "in 0.23. Please import this functionality directly from joblib, " - "which can be installed with: pip install joblib. If this warning is " - "raised when loading pickled models, you may need to re-serialize " - "those models with scikit-learn 0.21+.") - -if not hasattr(sys, "_is_pytest_session"): - warnings.warn(msg, category=FutureWarning) diff --git a/sklearn/externals/joblib/numpy_pickle.py b/sklearn/externals/joblib/numpy_pickle.py deleted file mode 100644 index e79a0e1c5c056..0000000000000 --- a/sklearn/externals/joblib/numpy_pickle.py +++ /dev/null @@ -1,3 +0,0 @@ -# Import necessary to preserve backward compatibility of pickles - -from joblib.numpy_pickle import * diff --git a/sklearn/externals/setup.py b/sklearn/externals/setup.py deleted file mode 100644 index 936f0327226d6..0000000000000 --- a/sklearn/externals/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -# -*- coding: utf-8 -*- - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('externals', parent_package, top_path) - config.add_subpackage('joblib') - - return config diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 28b2eb99ee93c..ebc584b6271a9 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1165,7 +1165,7 @@ def fit(self, raw_documents, y=None): return self def fit_transform(self, raw_documents, y=None): - """Learn the vocabulary dictionary and return term-document matrix. + """Learn the vocabulary dictionary and return document-term matrix. This is equivalent to fit followed by transform, but more efficiently implemented. @@ -1816,7 +1816,7 @@ def fit(self, raw_documents, y=None): return self def fit_transform(self, raw_documents, y=None): - """Learn vocabulary and idf, return term-document matrix. + """Learn vocabulary and idf, return document-term matrix. This is equivalent to fit followed by transform, but more efficiently implemented. 
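The wording fix above matches the actual orientation of the returned matrix: one row per document, one column per term. A quick illustrative check, not part of the patch:

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["the cat sat", "the cat sat on the mat"]
    X = CountVectorizer().fit_transform(docs)
    print(X.shape)   # (2, 5): 2 documents (rows) x 5 terms (columns)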
diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index dd72bddc58eb5..76326a8617da5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -10,6 +10,7 @@ from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method +from ..utils.validation import _deprecate_positional_args def _get_feature_importances(estimator, norm_order=1): @@ -116,9 +117,8 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): estimator is of dimension 2. max_features : int or None, optional - The maximum number of features selected scoring above ``threshold``. - To disable ``threshold`` and only select based on ``max_features``, - set ``threshold=-np.inf``. + The maximum number of features to select. + To only select based on ``max_features``, set ``threshold=-np.inf``. .. versionadded:: 0.20 @@ -158,7 +158,8 @@ class SelectFromModel(MetaEstimatorMixin, SelectorMixin, BaseEstimator): [-0.48], [ 1.48]]) """ - def __init__(self, estimator, threshold=None, prefit=False, + @_deprecate_positional_args + def __init__(self, estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None): self.estimator = estimator self.threshold = threshold diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index ac4dbdf58d174..7e7aada0d70b3 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -13,6 +13,7 @@ from ..utils.metaestimators import if_delegate_has_method from ..utils.metaestimators import _safe_split from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator from ..base import MetaEstimatorMixin from ..base import clone @@ -95,7 +96,7 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): >>> from sklearn.svm import SVR >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0) >>> estimator = SVR(kernel="linear") - >>> selector = RFE(estimator, 5, step=1) + >>> selector = RFE(estimator, n_features_to_select=5, step=1) >>> selector = selector.fit(X, y) >>> selector.support_ array([ True, True, True, True, True, False, False, False, False, @@ -119,7 +120,8 @@ class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. """ - def __init__(self, estimator, n_features_to_select=None, step=1, + @_deprecate_positional_args + def __init__(self, estimator, *, n_features_to_select=None, step=1, verbose=0): self.estimator = estimator self.n_features_to_select = n_features_to_select @@ -464,7 +466,8 @@ class RFECV(RFE): for cancer classification using support vector machines", Mach. Learn., 46(1-3), 389--422, 2002. 
""" - def __init__(self, estimator, step=1, min_features_to_select=1, cv=None, + @_deprecate_positional_args + def __init__(self, estimator, *, step=1, min_features_to_select=1, cv=None, scoring=None, verbose=0, n_jobs=None): self.estimator = estimator self.step = step diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 221e46f2a505e..7ca0ce4a36715 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -17,6 +17,7 @@ safe_mask) from ..utils.extmath import safe_sparse_dot, row_norms from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._base import SelectorMixin @@ -419,9 +420,9 @@ class SelectPercentile(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, percentile=10): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, percentile=10): + super().__init__(score_func=score_func) self.percentile = percentile def _check_params(self, X, y): @@ -503,9 +504,9 @@ class SelectKBest(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, k=10): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, k=10): + super().__init__(score_func=score_func) self.k = k def _check_params(self, X, y): @@ -582,9 +583,9 @@ class SelectFpr(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -648,9 +649,9 @@ class SelectFdr(_BaseFilter): SelectFwe: Select features based on family-wise error rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. """ - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -711,9 +712,9 @@ class SelectFwe(_BaseFilter): SelectFdr: Select features based on an estimated false discovery rate. GenericUnivariateSelect: Univariate feature selector with configurable mode. 
""" - - def __init__(self, score_func=f_classif, alpha=5e-2): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, alpha=5e-2): + super().__init__(score_func=score_func) self.alpha = alpha def _get_support_mask(self): @@ -761,7 +762,7 @@ class GenericUnivariateSelect(_BaseFilter): >>> X, y = load_breast_cancer(return_X_y=True) >>> X.shape (569, 30) - >>> transformer = GenericUnivariateSelect(chi2, 'k_best', param=20) + >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20) >>> X_new = transformer.fit_transform(X, y) >>> X_new.shape (569, 20) @@ -786,8 +787,9 @@ class GenericUnivariateSelect(_BaseFilter): 'fdr': SelectFdr, 'fwe': SelectFwe} - def __init__(self, score_func=f_classif, mode='percentile', param=1e-5): - super().__init__(score_func) + @_deprecate_positional_args + def __init__(self, score_func=f_classif, *, mode='percentile', param=1e-5): + super().__init__(score_func=score_func) self.mode = mode self.param = param diff --git a/sklearn/feature_selection/tests/test_feature_select.py b/sklearn/feature_selection/tests/test_feature_select.py index abb11fdc7b8da..27938c5e27819 100644 --- a/sklearn/feature_selection/tests/test_feature_select.py +++ b/sklearn/feature_selection/tests/test_feature_select.py @@ -552,7 +552,7 @@ def test_nans(): X = [[0, 1, 0], [0, -1, -1], [0, .5, .5]] y = [1, 0, 1] - for select in (SelectKBest(f_classif, 2), + for select in (SelectKBest(f_classif, k=2), SelectPercentile(f_classif, percentile=67)): ignore_warnings(select.fit)(X, y) assert_array_equal(select.get_support(indices=True), np.array([1, 2])) diff --git a/sklearn/impute/__init__.py b/sklearn/impute/__init__.py index 4e435d44fbdbf..940035ae58589 100644 --- a/sklearn/impute/__init__.py +++ b/sklearn/impute/__init__.py @@ -1,8 +1,14 @@ """Transformers for missing value imputation""" +import typing from ._base import MissingIndicator, SimpleImputer from ._knn import KNNImputer +if typing.TYPE_CHECKING: + # Avoid errors in type checkers (e.g. mypy) for experimental estimators. + # TODO: remove this check once the estimator is no longer experimental. + from ._iterative import IterativeImputer # noqa + __all__ = [ 'MissingIndicator', 'SimpleImputer', diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index ff4d9d6738977..8efafd8a7eef4 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -100,6 +100,21 @@ def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, ---------- .. [BRE] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001. https://doi.org/10.1023/A:1010933404324 + + Examples + -------- + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.inspection import permutation_importance + >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9], + ... [0, 9, 9],[0, 9, 9],[0, 9, 9]] + >>> y = [1, 1, 1, 0, 0, 0] + >>> clf = LogisticRegression().fit(X, y) + >>> result = permutation_importance(clf, X, y, n_repeats=10, + ... random_state=0) + >>> result.importances_mean + array([0.4666..., 0. , 0. ]) + >>> result.importances_std + array([0.2211..., 0. , 0. 
]) """ if not hasattr(X, "iloc"): X = check_array(X, force_all_finite='allow-nan', dtype=None) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 6ae62ce245a56..47b7813ae8aaa 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -40,10 +40,10 @@ class RBFSampler(TransformerMixin, BaseEstimator): Equals the dimensionality of the computed feature space. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + weights and random offset when fitting the training data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- @@ -154,10 +154,10 @@ class SkewedChi2Sampler(TransformerMixin, BaseEstimator): Equals the dimensionality of the computed feature space. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the generation of the random + weights and random offset when fitting the training data. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Examples -------- @@ -482,10 +482,11 @@ class Nystroem(TransformerMixin, BaseEstimator): How many data points will be used to construct the mapping. random_state : int, RandomState instance or None, optional (default=None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Pseudo-random number generator to control the uniform sampling without + replacement of n_components of the training data to construct the basis + kernel. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. Attributes ---------- diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index ca1e8ccb48ca4..46e924abbc1d0 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -25,7 +25,8 @@ from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d -from . import _cd_fast as cd_fast +# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' +from . 
import _cd_fast as cd_fast # type: ignore def _set_order(X, y, order='C'): diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 81068bb40c725..a3781cf981710 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,8 @@ from ._base import LinearModel from ..base import RegressorMixin, MultiOutputMixin -from ..utils import arrayfuncs, as_float_array +# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' +from ..utils import arrayfuncs, as_float_array # type: ignore from ..model_selection import check_cv from ..exceptions import ConvergenceWarning diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 8b89458525590..874dc743f4cc2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -729,10 +729,10 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, w0 = w0.ravel() target = Y_multi if solver == 'lbfgs': - func = lambda x, *args: _multinomial_loss_grad(x, *args)[0:2] + def func(x, *args): return _multinomial_loss_grad(x, *args)[0:2] elif solver == 'newton-cg': - func = lambda x, *args: _multinomial_loss(x, *args)[0] - grad = lambda x, *args: _multinomial_loss_grad(x, *args)[1] + def func(x, *args): return _multinomial_loss(x, *args)[0] + def grad(x, *args): return _multinomial_loss_grad(x, *args)[1] hess = _multinomial_grad_hess warm_start_sag = {'coef': w0.T} else: @@ -741,7 +741,7 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True, func = _logistic_loss_and_grad elif solver == 'newton-cg': func = _logistic_loss - grad = lambda x, *args: _logistic_loss_and_grad(x, *args)[1] + def grad(x, *args): return _logistic_loss_and_grad(x, *args)[1] hess = _logistic_grad_hess warm_start_sag = {'coef': np.expand_dims(w0, axis=1)} @@ -1306,8 +1306,8 @@ def fit(self, X, y, sample_weight=None): if self.penalty == 'elasticnet': if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be between 0 and 1;" - " got (l1_ratio=%r)" % self.l1_ratio) + raise ValueError("l1_ratio must be between 0 and 1;" + " got (l1_ratio=%r)" % self.l1_ratio) elif self.l1_ratio is not None: warnings.warn("l1_ratio parameter is only used when penalty is " "'elasticnet'. Got " diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index dd305ebc55f4f..bf1e77e3e355b 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -287,25 +287,31 @@ def _make_validation_score_cb(self, validation_mask, X, y, sample_weight, self, X[validation_mask], y[validation_mask], sample_weight[validation_mask], classes=classes) - @deprecated("Attribute standard_coef_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute standard_coef_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def standard_coef_(self): return self._standard_coef - @deprecated("Attribute standard_intercept_ was deprecated " - "in version 0.23 and will be removed in 0.25.") + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute standard_intercept_ was deprecated " + "in version 0.23 and will be removed in 0.25." 
+ ) @property def standard_intercept_(self): return self._standard_intercept - @deprecated("Attribute average_coef_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute average_coef_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def average_coef_(self): return self._average_coef - @deprecated("Attribute average_intercept_ was deprecated " + # mypy error: Decorated property not supported + @deprecated("Attribute average_intercept_ was deprecated " # type: ignore "in version 0.23 and will be removed in 0.25.") @property def average_intercept_(self): diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py index c962edccc953a..0d30c4dd13022 100644 --- a/sklearn/linear_model/tests/test_base.py +++ b/sklearn/linear_model/tests/test_base.py @@ -212,16 +212,30 @@ def test_linear_regression_pd_sparse_dataframe_warning(): # restrict the pd versions < '0.24.0' as they have a bug in is_sparse func if LooseVersion(pd.__version__) < '0.24.0': pytest.skip("pandas 0.24+ required.") - df = pd.DataFrame() - for col in range(4): + + # Warning is raised only when some of the columns is sparse + df = pd.DataFrame({'0': np.random.randn(10)}) + for col in range(1, 4): arr = np.random.randn(10) arr[:8] = 0 - df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0) + # all columns but the first column is sparse + if col != 0: + arr = pd.arrays.SparseArray(arr, fill_value=0) + df[str(col)] = arr + msg = "pandas.DataFrame with sparse columns found." with pytest.warns(UserWarning, match=msg): reg = LinearRegression() reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + # does not warn when the whole dataframe is sparse + df['0'] = pd.arrays.SparseArray(df['0'], fill_value=0) + assert hasattr(df, "sparse") + + with pytest.warns(None) as record: + reg.fit(df.iloc[:, 0:2], df.iloc[:, 3]) + assert not record + def test_preprocess_data(): n_samples = 200 diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 8a7fc3f85f425..f26db5cc2028d 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -167,8 +167,11 @@ def _fit_transform(self, X): self.embedding_ = self.kernel_pca_.fit_transform(G) - @deprecated("Attribute `training_data_` was deprecated in version 0.22 and" - " will be removed in 0.24.") + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute `training_data_` was deprecated in version 0.22 and" + " will be removed in 0.24." + ) @property def training_data_(self): check_is_fitted(self) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 3fd4d1b364b05..53558f6051283 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -22,7 +22,8 @@ from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils -from . import _barnes_hut_tsne +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from . 
import _barnes_hut_tsne # type: ignore MACHINE_EPSILON = np.finfo(np.double).eps diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 9486bbd4a96f5..5e38e1afff592 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -21,7 +21,8 @@ from sklearn.manifold._t_sne import _gradient_descent from sklearn.manifold._t_sne import trustworthiness from sklearn.manifold import TSNE -from sklearn.manifold import _barnes_hut_tsne +# mypy error: Module 'sklearn.manifold' has no attribute '_barnes_hut_tsne' +from sklearn.manifold import _barnes_hut_tsne # type: ignore from sklearn.manifold._utils import _binary_search_perplexity from sklearn.datasets import make_blobs from scipy.optimize import check_grad diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 96d99adfe7386..8916b523fc273 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -137,7 +137,8 @@ def plot_confusion_matrix(estimator, X, y_true, labels=None, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index a83fbe5acc60a..bfec9276f83be 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -106,7 +106,8 @@ def plot_precision_recall_curve(estimator, X, y, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index fb76691ff37d1..d786ac6659d41 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -115,7 +115,8 @@ def plot_roc_curve(estimator, X, y, sample_weight=None, Parameters ---------- estimator : estimator instance - Trained classifier. + Fitted classifier or a fitted :class:`~sklearn.pipeline.Pipeline` + in which the last estimator is a classifier. X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. 
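The clarified docstrings above mean the plotting helpers can be called directly on a fitted pipeline whose final step is a classifier. A minimal sketch of that usage (requires matplotlib; shown for illustration only):

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import plot_roc_curve

    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X_train, y_train)
    # accepted because the pipeline's last step is a classifier
    plot_roc_curve(clf, X_test, y_test)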
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 4542b8e2a2964..a66ff9525c28c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -737,8 +737,9 @@ def _test_precision_recall_curve(y_true, probas_pred): assert_array_almost_equal(precision_recall_auc, 0.859, 3) assert_array_almost_equal(precision_recall_auc, average_precision_score(y_true, probas_pred)) + # `_average_precision` is not very precise in case of 0.5 ties: be tolerant assert_almost_equal(_average_precision(y_true, probas_pred), - precision_recall_auc, decimal=3) + precision_recall_auc, decimal=2) assert p.size == r.size assert p.size == thresholds.size + 1 # Smoke test in the case of proba having only one value diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index bbd8b3f33dfb7..3e5b85ed73a02 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -385,9 +385,10 @@ def _check_param_grid(param_grid): if (isinstance(v, str) or not isinstance(v, (np.ndarray, Sequence))): - raise ValueError("Parameter values for parameter ({0}) need " - "to be a sequence(but not a string) or" - " np.ndarray.".format(name)) + raise ValueError("Parameter grid for parameter ({0}) needs to" + " be a list or numpy array, but got ({1})." + " Single values need to be wrapped in a list" + " with one element.".format(name, type(v))) if len(v) == 0: raise ValueError("Parameter values for parameter ({0}) need " diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 90ffb3989349a..e728533c3b5cf 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2144,7 +2144,8 @@ def train_test_split(*arrays, **options): # Tell nose that train_test_split is not a test. # (Needed for external libraries that may use nose.) -train_test_split.__test__ = False +# Use setattr to avoid mypy errors when monkeypatching. +setattr(train_test_split, '__test__', False) def _build_repr(self): diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 49d4b156e0686..1673040f96bc6 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -428,12 +428,14 @@ def test_grid_search_when_param_grid_includes_range(): def test_grid_search_bad_param_grid(): - param_dict = {"C": 1.0} + param_dict = {"C": 1} clf = SVC(gamma='auto') assert_raise_message( ValueError, - "Parameter values for parameter (C) need to be a sequence" - "(but not a string) or np.ndarray.", + "Parameter grid for parameter (C) needs to" + " be a list or numpy array, but got ()." + " Single values need to be wrapped in a list" + " with one element.", GridSearchCV, clf, param_dict) param_dict = {"C": []} @@ -447,8 +449,10 @@ def test_grid_search_bad_param_grid(): clf = SVC(gamma='auto') assert_raise_message( ValueError, - "Parameter values for parameter (C) need to be a sequence" - "(but not a string) or np.ndarray.", + "Parameter grid for parameter (C) needs to" + " be a list or numpy array, but got ()." 
+ " Single values need to be wrapped in a list" + " with one element.", GridSearchCV, clf, param_dict) param_dict = {"C": np.ones((3, 2))} diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 3404a9768f36a..91a97e2810baa 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -8,7 +8,7 @@ from scipy.special import gammainc from ..base import BaseEstimator from ..utils import check_array, check_random_state -from ..utils.validation import _check_sample_weight +from ..utils.validation import _check_sample_weight, check_is_fitted from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -184,6 +184,7 @@ def score_samples(self, X): probability densities, so values will be low for high-dimensional data. """ + check_is_fitted(self) # The returned density is normalized to the number of points. # For it to be a probability, we must scale it. For this reason # we'll also scale atol. @@ -241,6 +242,7 @@ def sample(self, n_samples=1, random_state=None): X : array_like, shape (n_samples, n_features) List of samples. """ + check_is_fitted(self) # TODO: implement sampling for other valid kernel shapes if self.kernel not in ['gaussian', 'tophat']: raise NotImplementedError() diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 6687cfa475ce8..e17e8e575f728 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -9,6 +9,7 @@ from sklearn.datasets import make_blobs from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler +from sklearn.exceptions import NotFittedError import joblib @@ -235,3 +236,15 @@ def test_pickling(tmpdir, sample_weight): scores_pickled = kde.score_samples(X) assert_allclose(scores, scores_pickled) + + +@pytest.mark.parametrize('method', ['score_samples', 'sample']) +def test_check_is_fitted(method): + # Check that predict raises an exception in an unfitted estimator. + # Unfitted estimators should raise a NotFittedError. + rng = np.random.RandomState(0) + X = rng.randn(10, 2) + kde = KernelDensity() + + with pytest.raises(NotFittedError): + getattr(kde, method)(X) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 64d2de70df531..c1bbdbd629ff8 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -84,47 +84,21 @@ class Pipeline(_BaseComposition): Examples -------- - >>> from sklearn import svm + >>> from sklearn.svm import SVC + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_classification - >>> from sklearn.feature_selection import SelectKBest - >>> from sklearn.feature_selection import f_regression + >>> from sklearn.model_selection import train_test_split >>> from sklearn.pipeline import Pipeline - >>> # generate some data to play with - >>> X, y = make_classification( - ... 
n_informative=5, n_redundant=0, random_state=42) - >>> # ANOVA SVM-C - >>> anova_filter = SelectKBest(f_regression, k=5) - >>> clf = svm.SVC(kernel='linear') - >>> anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)]) - >>> # You can set the parameters using the names issued - >>> # For instance, fit using a k of 10 in the SelectKBest - >>> # and a parameter 'C' of the svm - >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y) - Pipeline(steps=[('anova', SelectKBest(...)), ('svc', SVC(...))]) - >>> prediction = anova_svm.predict(X) - >>> anova_svm.score(X, y) - 0.83 - >>> # getting the selected features chosen by anova_filter - >>> anova_svm['anova'].get_support() - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, - False, False]) - >>> # Another way to get selected features chosen by anova_filter - >>> anova_svm.named_steps.anova.get_support() - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, - False, False]) - >>> # Indexing can also be used to extract a sub-pipeline. - >>> sub_pipeline = anova_svm[:1] - >>> sub_pipeline - Pipeline(steps=[('anova', SelectKBest(...))]) - >>> coef = anova_svm[-1].coef_ - >>> anova_svm['svc'] is anova_svm[-1] - True - >>> coef.shape - (1, 10) - >>> sub_pipeline.inverse_transform(coef).shape - (1, 20) + >>> X, y = make_classification(random_state=0) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=0) + >>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())]) + >>> # The pipeline can be used as any other estimator + >>> # and avoids leaking the test set into the train set + >>> pipe.fit(X_train, y_train) + Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())]) + >>> pipe.score(X_test, y_test) + 0.88 """ # BaseEstimator interface @@ -258,17 +232,7 @@ def _log_message(self, step_idx): len(self.steps), name) - # Estimator interface - - def _fit(self, X, y=None, **fit_params): - # shallow copy of steps - this should really be steps_ - self.steps = list(self.steps) - self._validate_steps() - # Setup the memory - memory = check_memory(self.memory) - - fit_transform_one_cached = memory.cache(_fit_transform_one) - + def _check_fit_params(self, **fit_params): fit_params_steps = {name: {} for name, step in self.steps if step is not None} for pname, pval in fit_params.items(): @@ -281,6 +245,19 @@ def _fit(self, X, y=None, **fit_params): "=sample_weight)`.".format(pname)) step, param = pname.split('__', 1) fit_params_steps[step][param] = pval + return fit_params_steps + + # Estimator interface + + def _fit(self, X, y=None, **fit_params_steps): + # shallow copy of steps - this should really be steps_ + self.steps = list(self.steps) + self._validate_steps() + # Setup the memory + memory = check_memory(self.memory) + + fit_transform_one_cached = memory.cache(_fit_transform_one) + for (step_idx, name, transformer) in self._iter(with_final=False, @@ -318,9 +295,7 @@ def _fit(self, X, y=None, **fit_params): # transformer. This is necessary when loading the transformer # from the cache. 
self.steps[step_idx] = (name, fitted_transformer) - if self._final_estimator == 'passthrough': - return X, {} - return X, fit_params_steps[self.steps[-1][0]] + return X def fit(self, X, y=None, **fit_params): """Fit the model @@ -348,11 +323,14 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator """ - Xt, fit_params = self._fit(X, y, **fit_params) + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if self._final_estimator != 'passthrough': - self._final_estimator.fit(Xt, y, **fit_params) + fit_params_last_step = fit_params_steps[self.steps[-1][0]] + self._final_estimator.fit(Xt, y, **fit_params_last_step) + return self def fit_transform(self, X, y=None, **fit_params): @@ -382,16 +360,20 @@ def fit_transform(self, X, y=None, **fit_params): Xt : array-like of shape (n_samples, n_transformed_features) Transformed samples """ + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) + last_step = self._final_estimator - Xt, fit_params = self._fit(X, y, **fit_params) with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): if last_step == 'passthrough': return Xt + fit_params_last_step = fit_params_steps[self.steps[-1][0]] if hasattr(last_step, 'fit_transform'): - return last_step.fit_transform(Xt, y, **fit_params) + return last_step.fit_transform(Xt, y, **fit_params_last_step) else: - return last_step.fit(Xt, y, **fit_params).transform(Xt) + return last_step.fit(Xt, y, + **fit_params_last_step).transform(Xt) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): @@ -447,10 +429,14 @@ def fit_predict(self, X, y=None, **fit_params): ------- y_pred : array-like """ - Xt, fit_params = self._fit(X, y, **fit_params) + fit_params_steps = self._check_fit_params(**fit_params) + Xt = self._fit(X, y, **fit_params_steps) + + fit_params_last_step = fit_params_steps[self.steps[-1][0]] with _print_elapsed_time('Pipeline', self._log_message(len(self.steps) - 1)): - y_pred = self.steps[-1][-1].fit_predict(Xt, y, **fit_params) + y_pred = self.steps[-1][-1].fit_predict(Xt, y, + **fit_params_last_step) return y_pred @if_delegate_has_method(delegate='_final_estimator') diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 33e2bac562489..c95351db9d985 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -210,11 +210,6 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): where min, max = feature_range. - The transformation is calculated as:: - - X_scaled = scale * X + min - X.min(axis=0) * scale - where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) - This transformation is often used as an alternative to zero mean, unit variance scaling. 
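For context, the `_check_fit_params` refactor above only reorganizes how `<step>__<param>` fit parameters are split per step before `_fit` runs; the public routing convention is unchanged. A minimal usage sketch (illustrative variable names, not part of the patch):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    X, y = make_classification(random_state=0)
    sample_weight = np.ones(len(y))

    pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
    # The 'svc__' prefix is stripped and sample_weight is forwarded only to the
    # final SVC step's fit; the StandardScaler step receives no fit parameters.
    pipe.fit(X, y, svc__sample_weight=sample_weight)
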
diff --git a/sklearn/setup.py b/sklearn/setup.py index e759cdabc88ee..d90c198ac0d7b 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -55,13 +55,13 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('ensemble/_hist_gradient_boosting/tests') config.add_subpackage('_loss/') config.add_subpackage('_loss/tests') + config.add_subpackage('externals') # submodules which have their own setup.py config.add_subpackage('cluster') config.add_subpackage('datasets') config.add_subpackage('decomposition') config.add_subpackage('ensemble') - config.add_subpackage('externals') config.add_subpackage('feature_extraction') config.add_subpackage('manifold') config.add_subpackage('metrics') diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index d935de697fcf7..6cecefb693ec8 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -3,9 +3,11 @@ import warnings from abc import ABCMeta, abstractmethod -from . import _libsvm as libsvm -from .import _liblinear as liblinear -from . import _libsvm_sparse as libsvm_sparse +# mypy error: error: Module 'sklearn.svm' has no attribute '_libsvm' +# (and same for other imports) +from . import _libsvm as libsvm # type: ignore +from .import _liblinear as liblinear # type: ignore +from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function @@ -110,7 +112,8 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) \ + or (n_samples, n_samples) Training vectors, where n_samples is the number of samples and n_features is the number of features. For kernel="precomputed", the expected shape of X is @@ -144,6 +147,13 @@ def fit(self, X, y, sample_weight=None): raise TypeError("Sparse precomputed kernels are not supported.") self._sparse = sparse and not callable(self.kernel) + if hasattr(self, 'decision_function_shape'): + if self.decision_function_shape not in ('ovr', 'ovo'): + raise ValueError( + f"decision_function_shape must be either 'ovr' or 'ovo', " + f"got {self.decision_function_shape}." + ) + if callable(self.kernel): check_consistent_length(X, y) else: diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index da5cfa437e476..46086729af35c 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -159,15 +159,21 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, Examples -------- >>> from sklearn.svm import LinearSVC + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_classification >>> X, y = make_classification(n_features=4, random_state=0) - >>> clf = LinearSVC(random_state=0, tol=1e-5) + >>> clf = make_pipeline(StandardScaler(), + ... LinearSVC(random_state=0, tol=1e-5)) >>> clf.fit(X, y) - LinearSVC(random_state=0, tol=1e-05) - >>> print(clf.coef_) - [[0.085... 0.394... 0.498... 0.375...]] - >>> print(clf.intercept_) - [0.284...] + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvc', LinearSVC(random_state=0, tol=1e-05))]) + + >>> print(clf.named_steps['linearsvc'].coef_) + [[0.141... 0.526... 0.679... 0.493...]] + + >>> print(clf.named_steps['linearsvc'].intercept_) + [0.1693...] 
>>> print(clf.predict([[0, 0, 0, 0]])) [1] """ @@ -322,17 +328,23 @@ class LinearSVR(RegressorMixin, LinearModel): Examples -------- >>> from sklearn.svm import LinearSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.datasets import make_regression >>> X, y = make_regression(n_features=4, random_state=0) - >>> regr = LinearSVR(random_state=0, tol=1e-5) + >>> regr = make_pipeline(StandardScaler(), + ... LinearSVR(random_state=0, tol=1e-5)) >>> regr.fit(X, y) - LinearSVR(random_state=0, tol=1e-05) - >>> print(regr.coef_) - [16.35... 26.91... 42.30... 60.47...] - >>> print(regr.intercept_) - [-4.29...] + Pipeline(steps=[('standardscaler', StandardScaler()), + ('linearsvr', LinearSVR(random_state=0, tol=1e-05))]) + + >>> print(regr.named_steps['linearsvr'].coef_) + [18.582... 27.023... 44.357... 64.522...] + >>> print(regr.named_steps['linearsvr'].intercept_) + [-4...] >>> print(regr.predict([[0, 0, 0, 0]])) - [-4.29...] + [-2.384...] + See also -------- @@ -461,6 +473,7 @@ class SVC(BaseSVC): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. probability : bool, default=False Whether to enable probability estimates. This must be enabled prior @@ -495,7 +508,8 @@ class SVC(BaseSVC): (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one - ('ovo') is always used as multi-class strategy. + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. @@ -517,7 +531,7 @@ class SVC(BaseSVC): random_state : int or RandomState instance, default=None Controls the pseudo random number generation for shuffling the data for - probability estimates. + probability estimates. Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -533,11 +547,13 @@ class SVC(BaseSVC): Number of support vectors for each class. dual_coef_ : ndarray of shape (n_class-1, n_SV) - Coefficients of the support vector in the decision function. + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. For multiclass, coefficient for all 1-vs-1 classifiers. The layout of the coefficients in the multiclass case is somewhat - non-trivial. See the section about multi-class classification in the - SVM section of the User Guide for details. + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. 
coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal @@ -576,12 +592,16 @@ class SVC(BaseSVC): Examples -------- >>> import numpy as np + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> y = np.array([1, 1, 2, 2]) >>> from sklearn.svm import SVC - >>> clf = SVC(gamma='auto') + >>> clf = make_pipeline(StandardScaler(), SVC(gamma='auto')) >>> clf.fit(X, y) - SVC(gamma='auto') + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svc', SVC(gamma='auto'))]) + >>> print(clf.predict([[-0.8, -1]])) [1] @@ -637,9 +657,9 @@ class NuSVC(BaseSVC): Parameters ---------- nu : float, default=0.5 - An upper bound on the fraction of training errors and a lower - bound of the fraction of support vectors. Should be in the - interval (0, 1]. + An upper bound on the fraction of margin errors (see :ref:`User Guide + `) and a lower bound of the fraction of support vectors. + Should be in the interval (0, 1]. kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. @@ -668,6 +688,7 @@ class NuSVC(BaseSVC): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. probability : bool, default=False Whether to enable probability estimates. This must be enabled prior @@ -700,7 +721,9 @@ class NuSVC(BaseSVC): Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape - (n_samples, n_classes * (n_classes - 1) / 2). + (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one + ('ovo') is always used as multi-class strategy. The parameter is + ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. @@ -722,7 +745,7 @@ class NuSVC(BaseSVC): random_state : int or RandomState instance, default=None Controls the pseudo random number generation for shuffling the data for - probability estimates. + probability estimates. Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. @@ -738,11 +761,13 @@ class NuSVC(BaseSVC): Number of support vectors for each class. dual_coef_ : ndarray of shape (n_class-1, n_SV) - Coefficients of the support vector in the decision function. + Dual coefficients of the support vector in the decision + function (see :ref:`sgd_mathematical_formulation`), multiplied by + their targets. For multiclass, coefficient for all 1-vs-1 classifiers. The layout of the coefficients in the multiclass case is somewhat - non-trivial. See the section about multi-class classification in - the SVM section of the User Guide for details. + non-trivial. See the :ref:`multi-class section of the User Guide + ` for details. 
coef_ : ndarray of shape (n_class * (n_class-1) / 2, n_features) Weights assigned to the features (coefficients in the primal @@ -783,10 +808,12 @@ class NuSVC(BaseSVC): >>> import numpy as np >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) >>> y = np.array([1, 1, 2, 2]) + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> from sklearn.svm import NuSVC - >>> clf = NuSVC() + >>> clf = make_pipeline(StandardScaler(), NuSVC()) >>> clf.fit(X, y) - NuSVC() + Pipeline(steps=[('standardscaler', StandardScaler()), ('nusvc', NuSVC())]) >>> print(clf.predict([[-0.8, -1]])) [1] @@ -893,6 +920,7 @@ class SVR(RegressorMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. cache_size : float, default=200 Specify the size of the kernel cache (in MB). @@ -932,14 +960,18 @@ class SVR(RegressorMixin, BaseLibSVM): Examples -------- >>> from sklearn.svm import SVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) - >>> regr = SVR(C=1.0, epsilon=0.2) + >>> regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2)) >>> regr.fit(X, y) - SVR(epsilon=0.2) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('svr', SVR(epsilon=0.2))]) + See also -------- @@ -970,14 +1002,16 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property def probA_(self): return self._probA - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property @@ -1033,6 +1067,7 @@ class NuSVR(RegressorMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. tol : float, default=1e-3 Tolerance for stopping criterion. @@ -1072,14 +1107,17 @@ class NuSVR(RegressorMixin, BaseLibSVM): Examples -------- >>> from sklearn.svm import NuSVR + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import StandardScaler >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> np.random.seed(0) >>> y = np.random.randn(n_samples) >>> X = np.random.randn(n_samples, n_features) - >>> regr = NuSVR(C=1.0, nu=0.1) + >>> regr = make_pipeline(StandardScaler(), NuSVR(C=1.0, nu=0.1)) >>> regr.fit(X, y) - NuSVR(nu=0.1) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('nusvr', NuSVR(nu=0.1))]) See also -------- @@ -1157,6 +1195,7 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): shrinking : bool, default=True Whether to use the shrinking heuristic. + See the :ref:`User Guide `. cache_size : float, default=200 Specify the size of the kernel cache (in MB). 
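For reference, the `decision_function_shape` check added in `svm/_base.py` above now fails fast at fit time instead of deferring to libsvm. A minimal sketch of the resulting behaviour (illustrative only, mirroring the new test in `test_svm.py`):

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    X, y = make_classification(random_state=0)
    try:
        SVC(decision_function_shape='bad').fit(X, y)
    except ValueError as exc:
        # prints: decision_function_shape must be either 'ovr' or 'ovo', got bad.
        print(exc)
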
@@ -1304,14 +1343,16 @@ def predict(self, X): y = super().predict(X) return np.asarray(y, dtype=np.intp) - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probA_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property def probA_(self): return self._probA - @deprecated( + # mypy error: Decorated property not supported + @deprecated( # type: ignore "The probB_ attribute is deprecated in version 0.23 and will be " "removed in version 0.25.") @property diff --git a/sklearn/svm/setup.py b/sklearn/svm/setup.py index 3ab495d7441cd..989e6c7d6a316 100644 --- a/sklearn/svm/setup.py +++ b/sklearn/svm/setup.py @@ -16,22 +16,27 @@ def configuration(parent_package='', top_path=None): config.add_library('libsvm-skl', sources=[join('src', 'libsvm', 'libsvm_template.cpp')], depends=[join('src', 'libsvm', 'svm.cpp'), - join('src', 'libsvm', 'svm.h')], + join('src', 'libsvm', 'svm.h'), + join('src', 'newrand', 'newrand.h')], # Force C++ linking in case gcc is picked up instead # of g++ under windows with some versions of MinGW extra_link_args=['-lstdc++'], + # Use C++11 to use the random number generator fix + extra_compiler_args=['-std=c++11'], ) libsvm_sources = ['_libsvm.pyx'] libsvm_depends = [join('src', 'libsvm', 'libsvm_helper.c'), join('src', 'libsvm', 'libsvm_template.cpp'), join('src', 'libsvm', 'svm.cpp'), - join('src', 'libsvm', 'svm.h')] + join('src', 'libsvm', 'svm.h'), + join('src', 'newrand', 'newrand.h')] config.add_extension('_libsvm', sources=libsvm_sources, include_dirs=[numpy.get_include(), - join('src', 'libsvm')], + join('src', 'libsvm'), + join('src', 'newrand')], libraries=['libsvm-skl'], depends=libsvm_depends, ) @@ -41,16 +46,30 @@ def configuration(parent_package='', top_path=None): if os.name == 'posix': libraries.append('m') - liblinear_sources = ['_liblinear.pyx', - join('src', 'liblinear', '*.cpp')] + # precompile liblinear to use C++11 flag + config.add_library('liblinear-skl', + sources=[join('src', 'liblinear', 'linear.cpp'), + join('src', 'liblinear', 'tron.cpp')], + depends=[join('src', 'liblinear', 'linear.h'), + join('src', 'liblinear', 'tron.h'), + join('src', 'newrand', 'newrand.h')], + # Force C++ linking in case gcc is picked up instead + # of g++ under windows with some versions of MinGW + extra_link_args=['-lstdc++'], + # Use C++11 to use the random number generator fix + extra_compiler_args=['-std=c++11'], + ) + liblinear_sources = ['_liblinear.pyx'] liblinear_depends = [join('src', 'liblinear', '*.h'), + join('src', 'newrand', 'newrand.h'), join('src', 'liblinear', 'liblinear_helper.c')] config.add_extension('_liblinear', sources=liblinear_sources, - libraries=libraries, + libraries=['liblinear-skl'] + libraries, include_dirs=[join('.', 'src', 'liblinear'), + join('.', 'src', 'newrand'), join('..', 'utils'), numpy.get_include()], depends=liblinear_depends, @@ -64,8 +83,10 @@ def configuration(parent_package='', top_path=None): config.add_extension('_libsvm_sparse', libraries=['libsvm-skl'], sources=libsvm_sparse_sources, include_dirs=[numpy.get_include(), - join("src", "libsvm")], + join("src", "libsvm"), + join("src", "newrand")], depends=[join("src", "libsvm", "svm.h"), + join('src', 'newrand', 'newrand.h'), join("src", "libsvm", "libsvm_sparse_helper.c")]) diff --git a/sklearn/svm/src/liblinear/liblinear_helper.c b/sklearn/svm/src/liblinear/liblinear_helper.c index 86d88e7da9273..7433a0086f682 100644 --- a/sklearn/svm/src/liblinear/liblinear_helper.c +++ 
b/sklearn/svm/src/liblinear/liblinear_helper.c @@ -182,7 +182,7 @@ struct parameter *set_parameter(int solver_type, double eps, double C, if (param == NULL) return NULL; - srand(seed); + set_seed(seed); param->solver_type = solver_type; param->eps = eps; param->C = C; diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index d9bdfb69c413d..cc603b435f655 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -1,8 +1,8 @@ -/* +/* Modified 2011: - Make labels sorted in group_classes, Dan Yamins. - + Modified 2012: - Changes roles of +1 and -1 to match scikit API, Andreas Mueller @@ -22,6 +22,13 @@ Modified 2015: - Patched liblinear for sample_weights - Manoj Kumar See https://github.com/scikit-learn/scikit-learn/pull/5274 + + Modified 2020: + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie + See + */ #include @@ -32,6 +39,10 @@ #include #include "linear.h" #include "tron.h" +#include +#include +#include "../newrand/newrand.h" + typedef signed char schar; template static inline void swap(T& x, T& y) { T t=x; x=y; y=t; } #ifndef min @@ -456,19 +467,19 @@ void l2r_l2_svr_fun::grad(double *w, double *g) g[i] = w[i] + 2*g[i]; } -// A coordinate descent algorithm for +// A coordinate descent algorithm for // multi-class support vector machines by Crammer and Singer // // min_{\alpha} 0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i // s.t. \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i -// +// // where e^m_i = 0 if y_i = m, // e^m_i = 1 if y_i != m, -// C^m_i = C if m = y_i, -// C^m_i = 0 if m != y_i, -// and w_m(\alpha) = \sum_i \alpha^m_i x_i +// C^m_i = C if m = y_i, +// C^m_i = 0 if m != y_i, +// and w_m(\alpha) = \sum_i \alpha^m_i x_i // -// Given: +// Given: // x, y, C // eps is the stopping tolerance // @@ -579,7 +590,7 @@ int Solver_MCSVM_CS::Solve(double *w) double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking bool start_from_all = true; - // Initial alpha can be set here. Note that + // Initial alpha can be set here. Note that // sum_m alpha[i*nr_class+m] = 0, for all i=1,...,l-1 // alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m // alpha[i*nr_class+m] <= 0 if prob->y[i] != m @@ -615,7 +626,7 @@ int Solver_MCSVM_CS::Solve(double *w) double stopping = -INF; for(i=0;iy[i]; while(this_label != label[j]) { - j++; + j++; } data_label[i] = j; @@ -2594,7 +2605,7 @@ void cross_validation(const problem *prob, const parameter *param, int nr_fold, for(i=0;i The changes made with respect to upstream are detailed in the heading of svm.cpp diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index 9321340acaaed..c9a5df10c4924 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -48,6 +48,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Make labels sorted in svm_group_classes, Fabian Pedregosa. + Modified 2020: + + - Improved random number generator by using a mersenne twister + tweaked + lemire postprocessor. This fixed a convergence issue on windows targets. + Sylvain Marie, + see + */ #include @@ -57,7 +64,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include +#include #include "svm.h" +#include "../newrand/newrand.h" #ifndef _LIBSVM_CPP typedef float Qfloat; @@ -2093,7 +2103,7 @@ static void svm_binary_svc_probability( for(i=0;il;i++) perm[i]=i; for(i=0;il;i++) { - int j = i+rand()%(prob->l-i); + int j = i+bounded_rand_int(prob->l-i); swap(perm[i],perm[j]); } for(i=0;irandom_seed >= 0) { - srand(param->random_seed); + set_seed(param->random_seed); } if(param->svm_type == ONE_CLASS || @@ -2628,7 +2638,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter * int nr_class; if(param->random_seed >= 0) { - srand(param->random_seed); + set_seed(param->random_seed); } // stratified cv may not give leave-one-out rate @@ -2650,7 +2660,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter * for (c=0; c + + */ +#ifndef _NEWRAND_H +#define _NEWRAND_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Scikit-Learn-specific random number generator replacing `rand()` originally +// used in LibSVM / LibLinear, to ensure the same behaviour on windows-linux, +// with increased speed +// - (1) Init a `mt_rand` object +#if INT_MAX == 0x7FFFFFFF +std::mt19937 mt_rand(std::mt19937::default_seed); +#elif INT_MAX == 0x7FFFFFFFFFFFFFFF +std::mt19937_64 mt_rand(std::mt19937::default_seed); +#else +info("Random number generator is not fixed for this system. Please report issue. INT_MAX=%d\n", INT_MAX); +exit(1); +#endif + +// - (2) public `set_seed()` function that should be used instead of `srand()` to set a new seed. +void set_seed(unsigned custom_seed) { + mt_rand.seed(custom_seed); +} + +// - (3) New internal `bounded_rand_int` function, used instead of rand() everywhere. +inline int bounded_rand_int(int orig_range) { + // "LibSVM / LibLinear Original way" - make a 31bit or 63bit positive + // random number and use modulo to make it fit in the range + // return abs( (int)mt_rand()) % orig_range; + + // "Better way": tweaked Lemire post-processor + // from http://www.pcg-random.org/posts/bounded-rands.html + // TODO how could we make this casting safer, raising an error if lost information? 
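+ // Editorial note (not in the upstream patch): the block below is Lemire's debiased multiply-shift; the high 32 bits of the 64-bit product x * range give a value in [0, range), and draws whose low 32 bits fall below the threshold t = (2^32 - range) % range are rejected and resampled to remove modulo bias.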
+ uint32_t range = uint32_t(orig_range); + uint32_t x = mt_rand(); + uint64_t m = uint64_t(x) * uint64_t(range); + uint32_t l = uint32_t(m); + if (l < range) { + uint32_t t = -range; + if (t >= range) { + t -= range; + if (t >= range) + t %= range; + } + while (l < t) { + x = mt_rand(); + m = uint64_t(x) * uint64_t(range); + l = uint32_t(m); + } + } + return m >> 32; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _NEWRAND_H */ diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index fb811940c2971..e6342a2846e3e 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -28,7 +28,8 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.exceptions import NotFittedError, UndefinedMetricWarning from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import _libsvm +# mypy error: Module 'sklearn.svm' has no attribute '_libsvm' +from sklearn.svm import _libsvm # type: ignore # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -361,12 +362,13 @@ def test_decision_function(): assert_array_almost_equal(dec.ravel(), clf.decision_function(X)) -def test_decision_function_shape(): - # check that decision_function_shape='ovr' gives +@pytest.mark.parametrize('SVM', (svm.SVC, svm.NuSVC)) +def test_decision_function_shape(SVM): + # check that decision_function_shape='ovr' or 'ovo' gives # correct shape and is consistent with predict - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovr').fit(iris.data, iris.target) + clf = SVM(kernel='linear', + decision_function_shape='ovr').fit(iris.data, iris.target) dec = clf.decision_function(iris.data) assert dec.shape == (len(iris.data), 3) assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1)) @@ -375,18 +377,21 @@ def test_decision_function_shape(): X, y = make_blobs(n_samples=80, centers=5, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovr').fit(X_train, y_train) + clf = SVM(kernel='linear', + decision_function_shape='ovr').fit(X_train, y_train) dec = clf.decision_function(X_test) assert dec.shape == (len(X_test), 5) assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1)) # check shape of ovo_decition_function=True - clf = svm.SVC(kernel='linear', C=0.1, - decision_function_shape='ovo').fit(X_train, y_train) + clf = SVM(kernel='linear', + decision_function_shape='ovo').fit(X_train, y_train) dec = clf.decision_function(X_train) assert dec.shape == (len(X_train), 10) + with pytest.raises(ValueError, match="must be either 'ovr' or 'ovo'"): + SVM(decision_function_shape='bad').fit(X_train, y_train) + def test_svr_predict(): # Test SVR's decision_function diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 55af69ca6c10e..ca2549f2ea4c1 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -32,8 +32,10 @@ with warnings.catch_warnings(): warnings.simplefilter('ignore', FutureWarning) PUBLIC_MODULES = set([ - pckg[1] for pckg in walk_packages(prefix='sklearn.', - path=sklearn.__path__) + pckg[1] for pckg in walk_packages( + prefix='sklearn.', + # mypy error: Module has no attribute "__path__" + path=sklearn.__path__) # type: ignore # mypy issue #1422 if not ("._" in pckg[1] or ".tests." 
in pckg[1]) ]) diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 740480d643f76..033bb84279d54 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -1,5 +1,6 @@ import functools +from typing import List, Any import numpy as np import scipy.sparse as sp @@ -23,12 +24,12 @@ from sklearn.utils._testing import assert_warns from sklearn.exceptions import DataDimensionalityWarning -all_sparse_random_matrix = [_sparse_random_matrix] -all_dense_random_matrix = [_gaussian_random_matrix] +all_sparse_random_matrix: List[Any] = [_sparse_random_matrix] +all_dense_random_matrix: List[Any] = [_gaussian_random_matrix] all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix -all_SparseRandomProjection = [SparseRandomProjection] -all_DenseRandomProjection = [GaussianRandomProjection] +all_SparseRandomProjection: List[Any] = [SparseRandomProjection] +all_DenseRandomProjection: List[Any] = [GaussianRandomProjection] all_RandomProjection = set(all_SparseRandomProjection + all_DenseRandomProjection) diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py deleted file mode 100644 index 07125e9562408..0000000000000 --- a/sklearn/tests/test_site_joblib.py +++ /dev/null @@ -1,16 +0,0 @@ - - -def test_old_pickle(tmpdir): - import joblib - - # Check that a pickle that references sklearn.external.joblib can load - f = tmpdir.join('foo.pkl') - f.write(b'\x80\x02csklearn.externals.joblib.numpy_pickle\nNumpyArrayWrappe' - b'r\nq\x00)\x81q\x01}q\x02(U\x05dtypeq\x03cnumpy\ndtype\nq\x04U' - b'\x02i8q\x05K\x00K\x01\x87q\x06Rq\x07(K\x03U\x01= (1, 3): # Preserves earlier default choice of pinvh cutoff `cond` value. # Can be removed once issue #14055 is fully addressed. from ..externals._scipy_linalg import pinvh else: - from scipy.linalg import pinvh # noqa + # mypy error: Name 'pinvh' already defined (possibly by an import) + from scipy.linalg import pinvh # type: ignore # noqa def _object_dtype_isnan(X): diff --git a/sklearn/utils/metaestimators.py b/sklearn/utils/metaestimators.py index e091bd0f7cbf8..877d576592726 100644 --- a/sklearn/utils/metaestimators.py +++ b/sklearn/utils/metaestimators.py @@ -2,6 +2,7 @@ # Author: Joel Nothman # Andreas Mueller # License: BSD +from typing import List, Any from abc import ABCMeta, abstractmethod from operator import attrgetter @@ -17,6 +18,8 @@ class _BaseComposition(BaseEstimator, metaclass=ABCMeta): """Handles parameter management for classifiers composed of named estimators. 
""" + steps: List[Any] + @abstractmethod def __init__(self): pass diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 6748dbcad9951..5f6df9685a25c 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1153,3 +1153,22 @@ def test_check_fit_params(indices): result['sparse-col'], _safe_indexing(fit_params['sparse-col'], indices_) ) + + +@pytest.mark.parametrize('sp_format', [True, 'csr', 'csc', 'coo', 'bsr']) +def test_check_sparse_pandas_sp_format(sp_format): + # check_array converts pandas dataframe with only sparse arrays into + # sparse matrix + pd = pytest.importorskip("pandas") + sp_mat = _sparse_random_matrix(10, 3) + + sdf = pd.DataFrame.sparse.from_spmatrix(sp_mat) + result = check_array(sdf, accept_sparse=sp_format) + + if sp_format is True: + # by default pandas converts to coo when accept_sparse is True + sp_format = 'coo' + + assert sp.issparse(result) + assert result.format == sp_format + assert_allclose_dense_sparse(sp_mat, result) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 08952d6cbcd16..4bb50c3deb5e7 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -451,10 +451,12 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, # DataFrame), and store them. If not, store None. dtypes_orig = None if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): - # throw warning if pandas dataframe is sparse + # throw warning if columns are sparse. If all columns are sparse, then + # array.sparse exists and sparsity will be perserved (later). with suppress(ImportError): from pandas.api.types import is_sparse - if array.dtypes.apply(is_sparse).any(): + if (not hasattr(array, 'sparse') and + array.dtypes.apply(is_sparse).any()): warnings.warn( "pandas.DataFrame with sparse columns found." "It will be converted to a dense numpy array." @@ -498,6 +500,11 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True, estimator_name = "Estimator" context = " by %s" % estimator_name if estimator is not None else "" + # When all dataframe columns are sparse, convert to a sparse array + if hasattr(array, 'sparse') and array.ndim > 1: + # DataFrame.sparse only supports `to_coo` + array = array.sparse.to_coo() + if sp.issparse(array): _ensure_no_complex_data(array) array = _ensure_sparse_format(array, accept_sparse=accept_sparse,