From cadafb79bad52dc552ec4bdd76286b5a2ace42c4 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 11 Apr 2020 19:10:21 +0200 Subject: [PATCH 001/125] DOC Fix grammar and clarify VotingRegressor (#16896) --- sklearn/ensemble/_voting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index cab321702c85d..8d2bbbe8c2b8a 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -350,8 +350,8 @@ class VotingRegressor(RegressorMixin, _BaseVoting): .. versionadded:: 0.21 - A voting regressor is an ensemble meta-estimator that fits base - regressors each on the whole dataset. It, then, averages the individual + A voting regressor is an ensemble meta-estimator that fits several base + regressors, each on the whole dataset. Then it averages the individual predictions to form a final prediction. Read more in the :ref:`User Guide `. From 8122e77bee8414c787f4bcd730673d2c0e137d06 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 11 Apr 2020 19:15:23 +0200 Subject: [PATCH 002/125] DOC Fix typos, wording in plot_gradient_boosting_regression.py (#16894) --- .../plot_gradient_boosting_regression.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/examples/ensemble/plot_gradient_boosting_regression.py b/examples/ensemble/plot_gradient_boosting_regression.py index 3dbe7dbaac296..860bb14687534 100644 --- a/examples/ensemble/plot_gradient_boosting_regression.py +++ b/examples/ensemble/plot_gradient_boosting_regression.py @@ -11,7 +11,7 @@ and 500 regression trees of depth 4. Note: For larger datasets (n_samples >= 10000), please refer to -:class:`sklearn.ensemble.HistGradientBoostingRegressor` +:class:`sklearn.ensemble.HistGradientBoostingRegressor`. """ print(__doc__) @@ -32,8 +32,7 @@ # Load the data # ------------------------------------- # -# First we need to load the data. We set random state to be consistent with the -# result. +# First we need to load the data. diabetes = datasets.load_diabetes() X, y = diabetes.data, diabetes.target @@ -43,13 +42,11 @@ # ------------------------------------- # # Next, we will split our dataset to use 90% for training and leave the rest -# for testing. We will also prepare the parameters we want to use to fit our -# regression model. You can play with those parameters to see how the -# results change: +# for testing. We will also set the regression model parameters. You can play +# with these parameters to see how the results change. # -# n_estimators : the number of boosting stages which will be performed. -# Later, we will plot and see how the deviance changes with those boosting -# operations. +# n_estimators : the number of boosting stages that will be performed. +# Later, we will plot deviance against boosting iterations. # # max_depth : limits the number of nodes in the tree. # The best value depends on the interaction of the input variables. @@ -57,12 +54,11 @@ # min_samples_split : the minimum number of samples required to split an # internal node. # -# learning_rate : how much the contribution of each tree will shrink +# learning_rate : how much the contribution of each tree will shrink. # -# loss : here, we decided to use least squeares as a loss function. -# However there are many other options (check -# :class:`~sklearn.ensemble.GradientBoostingRegressor` to see what are -# other possibilities) +# loss : loss function to optimize. 
The least squares function is used in this +# case however, there are many other options (see +# :class:`~sklearn.ensemble.GradientBoostingRegressor` ). X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.1, random_state=13) @@ -80,10 +76,10 @@ # Now we will initiate the gradient boosting regressors and fit it with our # training data. Let's also look and the mean squared error on the test data. -clf = ensemble.GradientBoostingRegressor(**params) -clf.fit(X_train, y_train) +reg = ensemble.GradientBoostingRegressor(**params) +reg.fit(X_train, y_train) -mse = mean_squared_error(y_test, clf.predict(X_test)) +mse = mean_squared_error(y_test, reg.predict(X_test)) print("The mean squared error (MSE) on test set: {:.4f}".format(mse)) ############################################################################## @@ -91,16 +87,16 @@ # ------------------------------------- # # Finally, we will visualize the results. To do that we will first compute the -# test set deviance and then plot it. +# test set deviance and then plot it against boosting iterations. test_score = np.zeros((params['n_estimators'],), dtype=np.float64) -for i, y_pred in enumerate(clf.staged_predict(X_test)): - test_score[i] = clf.loss_(y_test, y_pred) +for i, y_pred in enumerate(reg.staged_predict(X_test)): + test_score[i] = reg.loss_(y_test, y_pred) fig = plt.figure(figsize=(6, 6)) plt.subplot(1, 1, 1) plt.title('Deviance') -plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-', +plt.plot(np.arange(params['n_estimators']) + 1, reg.train_score_, 'b-', label='Training Set Deviance') plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-', label='Test Set Deviance') @@ -116,16 +112,16 @@ # # Careful, impurity-based feature importances can be misleading for # high cardinality features (many unique values). As an alternative, -# the permutation importances of ``clf`` are computed on a +# the permutation importances of ``reg`` can be computed on a # held out test set. See :ref:`permutation_importance` for more details. # -# In this case, the two methods agree to identify the same top 2 features -# as strongly predictive features but not in the same order. The third most +# For this example, the impurity-based and permutation methods identify the +# same 2 strongly predictive features but not in the same order. The third most # predictive feature, "bp", is also the same for the 2 methods. The remaining # features are less predictive and the error bars of the permutation plot # show that they overlap with 0. 
-feature_importance = clf.feature_importances_ +feature_importance = reg.feature_importances_ sorted_idx = np.argsort(feature_importance) pos = np.arange(sorted_idx.shape[0]) + .5 fig = plt.figure(figsize=(12, 6)) @@ -134,7 +130,7 @@ plt.yticks(pos, np.array(diabetes.feature_names)[sorted_idx]) plt.title('Feature Importance (MDI)') -result = permutation_importance(clf, X_test, y_test, n_repeats=10, +result = permutation_importance(reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) sorted_idx = result.importances_mean.argsort() plt.subplot(1, 2, 2) From c2b31ac21b8780498e11a42744212231b3fefaa6 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 10:42:15 +0200 Subject: [PATCH 003/125] DOC replace Boston in _classes.py (#16892) * replace boston * fix score --- sklearn/tree/_classes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index fe77610a20601..f252ba0acbb1c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1152,16 +1152,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): Examples -------- - >>> from sklearn.datasets import load_boston + >>> from sklearn.datasets import load_diabetes >>> from sklearn.model_selection import cross_val_score >>> from sklearn.tree import DecisionTreeRegressor - >>> X, y = load_boston(return_X_y=True) + >>> X, y = load_diabetes(return_X_y=True) >>> regressor = DecisionTreeRegressor(random_state=0) >>> cross_val_score(regressor, X, y, cv=10) ... # doctest: +SKIP ... - array([ 0.61..., 0.57..., -0.34..., 0.41..., 0.75..., - 0.07..., 0.29..., 0.33..., -1.42..., -1.77...]) + array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., + 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ def __init__(self, criterion="mse", @@ -1697,18 +1697,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): Examples -------- - >>> from sklearn.datasets import load_boston + >>> from sklearn.datasets import load_diabetes >>> from sklearn.model_selection import train_test_split >>> from sklearn.ensemble import BaggingRegressor >>> from sklearn.tree import ExtraTreeRegressor - >>> X, y = load_boston(return_X_y=True) + >>> X, y = load_diabetes(return_X_y=True) >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, random_state=0) >>> extra_tree = ExtraTreeRegressor(random_state=0) >>> reg = BaggingRegressor(extra_tree, random_state=0).fit( ... X_train, y_train) >>> reg.score(X_test, y_test) - 0.7447... + 0.33... """ def __init__(self, criterion="mse", From a2d361bf40dbe7eee77a8c27aba779358ff96d0e Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 11:27:00 +0200 Subject: [PATCH 004/125] DOC Fix wording, typo in plot_voting_regressor.py (#16895) * wording, typo * add suggestions * add n est * remove n est --- examples/ensemble/plot_voting_regressor.py | 31 +++++++++++----------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/ensemble/plot_voting_regressor.py b/examples/ensemble/plot_voting_regressor.py index 6fd629bb9c083..2587dee4352e9 100644 --- a/examples/ensemble/plot_voting_regressor.py +++ b/examples/ensemble/plot_voting_regressor.py @@ -5,28 +5,28 @@ .. currentmodule:: sklearn -A voting regressor is an ensemble meta-estimator that fits base regressors each -on the whole dataset. It, then, averages the individual predictions to form a -final prediction. +A voting regressor is an ensemble meta-estimator that fits several base +regressors, each on the whole dataset. 
Then it averages the individual +predictions to form a final prediction. We will use three different regressors to predict the data: :class:`~ensemble.GradientBoostingRegressor`, :class:`~ensemble.RandomForestRegressor`, and :class:`~linear_model.LinearRegression`). -Then, using them we will make voting regressor +Then the above 3 regressors will be used for the :class:`~ensemble.VotingRegressor`. -Finally, we will plot all of them for comparison. +Finally, we will plot the predictions made by all models for comparison. -We will work with the diabetes dataset which consists of the 10 features -collected from a cohort of diabetes patients. The target is the disease -progression after one year from the baseline. +We will work with the diabetes dataset which consists of 10 features +collected from a cohort of diabetes patients. The target is a quantitative +measure of disease progression one year after baseline. """ print(__doc__) import matplotlib.pyplot as plt -from sklearn import datasets +from sklearn.datasets import load_diabetes from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression @@ -36,11 +36,11 @@ # Training classifiers # -------------------------------- # -# First, we are going to load diabetes dataset and initiate gradient boosting -# regressor, random forest regressor and linear regression. Next, we are going -# to use each of them to build the voting regressor: +# First, we will load the diabetes dataset and initiate a gradient boosting +# regressor, a random forest regressor and a linear regression. Next, we will +# use the 3 regressors to build the voting regressor: -X, y = datasets.load_diabetes(return_X_y=True) +X, y = load_diabetes(return_X_y=True) # Train classifiers reg1 = GradientBoostingRegressor(random_state=1) @@ -58,8 +58,7 @@ # Making predictions # -------------------------------- # -# Now we will use each of the regressors to make 20 first predictions about the -# diabetes dataset. +# Now we will use each of the regressors to make the 20 first predictions. xt = X[:20] @@ -73,7 +72,7 @@ # -------------------------------- # # Finally, we will visualize the 20 predictions. The red stars show the average -# prediction +# prediction made by :class:`~ensemble.VotingRegressor`. plt.figure() plt.plot(pred1, 'gd', label='GradientBoostingRegressor') From 9cc55587067fa57688a28f9d819f0e0a1881882c Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 14 Apr 2020 12:20:47 +0200 Subject: [PATCH 005/125] DOC remove boston from tutorial.rst (#16889) --- doc/tutorial/basic/tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial/basic/tutorial.rst b/doc/tutorial/basic/tutorial.rst index 082c5ffa3aa79..28e965bd925a5 100644 --- a/doc/tutorial/basic/tutorial.rst +++ b/doc/tutorial/basic/tutorial.rst @@ -77,8 +77,8 @@ Loading an example dataset `scikit-learn` comes with a few standard datasets, for instance the `iris `_ and `digits `_ -datasets for classification and the `boston house prices dataset -`_ for regression. +datasets for classification and the `diabetes dataset +`_ for regression. In the following, we start a Python interpreter from our shell and then load the ``iris`` and ``digits`` datasets. 
Our notational convention is that From 9901d8df131e06d8f6ba1677e10330cabfdeb245 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 14 Apr 2020 09:38:32 -0400 Subject: [PATCH 006/125] Fix _deprecate_positional_args for kwonly args w/o default (#16850) --- sklearn/utils/tests/test_validation.py | 9 +++++++++ sklearn/utils/validation.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 5f6df9685a25c..b178ccc148d9d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -1097,6 +1097,15 @@ def f2(a=1, *, b=1, c=1, d=1): match=r"Pass b=2 as keyword args"): f2(1, 2) + # The * is place before a keyword only argument without a default value + @_deprecate_positional_args + def f3(a, *, b, c=1, d=1): + pass + + with pytest.warns(FutureWarning, + match=r"Pass b=2 as keyword args"): + f3(1, 2) + def test_deprecate_positional_args_warns_for_class(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 4bb50c3deb5e7..953584fff0f8a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1301,7 +1301,7 @@ def inner_f(*args, **kwargs): "passing these as positional arguments will " "result in an error".format(", ".join(args_msg)), FutureWarning) - kwargs.update({k: arg for k, arg in zip(all_args, args)}) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) return f(**kwargs) return inner_f From bd9fd0f1a9a222c58bbf8aba45025d42c598a31e Mon Sep 17 00:00:00 2001 From: Kevin Markham Date: Tue, 14 Apr 2020 09:40:15 -0400 Subject: [PATCH 007/125] DOC Minor updates to the Decision Tree User Guide (#16905) --- doc/modules/tree.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index ecd037d0631ac..af6fc4e1edfe9 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -56,9 +56,9 @@ The disadvantages of decision trees include: - Decision-tree learners can create over-complex trees that do not generalise the data well. This is called overfitting. Mechanisms - such as pruning (not currently supported), setting the minimum - number of samples required at a leaf node or setting the maximum - depth of the tree are necessary to avoid this problem. + such as pruning, setting the minimum number of samples required + at a leaf node or setting the maximum depth of the tree are + necessary to avoid this problem. - Decision trees can be unstable because small variations in the data might result in a completely different tree being generated. @@ -124,10 +124,10 @@ Using the Iris dataset, we can construct a tree as follows:: >>> clf = tree.DecisionTreeClassifier() >>> clf = clf.fit(X, y) -Once trained, you can plot the tree with the plot_tree function:: +Once trained, you can plot the tree with the :func:`plot_tree` function:: - >>> tree.plot_tree(clf.fit(iris.data, iris.target)) # doctest: +SKIP + >>> tree.plot_tree(clf) # doctest: +SKIP .. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png :target: ../auto_examples/tree/plot_iris_dtc.html @@ -137,10 +137,7 @@ Once trained, you can plot the tree with the plot_tree function:: We can also export the tree in `Graphviz `_ format using the :func:`export_graphviz` exporter. 
If you use the `conda `_ package manager, the graphviz binaries - -and the python package can be installed with - - conda install python-graphviz +and the python package can be installed with `conda install python-graphviz`. Alternatively binaries for graphviz can be downloaded from the graphviz project homepage, and the Python wrapper installed from pypi with `pip install graphviz`. @@ -188,7 +185,7 @@ of external libraries and is more compact: >>> from sklearn.datasets import load_iris >>> from sklearn.tree import DecisionTreeClassifier - >>> from sklearn.tree.export import export_text + >>> from sklearn.tree import export_text >>> iris = load_iris() >>> decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) >>> decision_tree = decision_tree.fit(iris.data, iris.target) From 5e2d74bc5f61b382758c0403c577253539e77156 Mon Sep 17 00:00:00 2001 From: Noa Tamir <6564007+noatamir@users.noreply.github.com> Date: Wed, 15 Apr 2020 10:36:51 +0200 Subject: [PATCH 008/125] DOC add versionadded versionchanged v0.19 (#16233) * added v0.19.1 and wip v0.19 * finished adding vchanged strings for v0.19 Towards #15426 @adrinjalali #wimlds #scikitlearnsprint * fixing linter issues * caught line issues with flake8 * caught the last line issue * added lines and cleaned gtiignore * Update sklearn/multiclass.py Co-Authored-By: Thomas J Fan * Update sklearn/multiclass.py Co-Authored-By: Thomas J Fan Co-authored-by: Thomas J Fan --- sklearn/decomposition/_lda.py | 3 +++ sklearn/feature_extraction/_hash.py | 4 ++++ sklearn/model_selection/_search.py | 10 ++++++++++ sklearn/model_selection/_validation.py | 5 +++++ sklearn/multiclass.py | 8 ++++++++ sklearn/multioutput.py | 5 +++++ sklearn/neighbors/_nearest_centroid.py | 3 +++ 7 files changed, 38 insertions(+) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 641e68cd7fc8b..a6e253aab1e6e 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -143,6 +143,9 @@ class LatentDirichletAllocation(TransformerMixin, BaseEstimator): n_components : int, optional (default=10) Number of topics. + .. versionchanged:: 0.19 + ``n_topics `` was renamed to ``n_components`` + doc_topic_prior : float, optional (default=None) Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index f52e6f296169b..d5cfa913991b6 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -69,6 +69,10 @@ class FeatureHasher(TransformerMixin, BaseEstimator): approximately conserve the inner product in the hashed space even for small n_features. This approach is similar to sparse random projection. + .. versionchanged:: 0.19 + ``alternate_sign`` replaces the now deprecated ``non_negative`` + parameter. + Examples -------- >>> from sklearn.feature_extraction import FeatureHasher diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 3e5b85ed73a02..d283dc2f0b483 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -1002,6 +1002,11 @@ class GridSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. 
versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Examples -------- @@ -1338,6 +1343,11 @@ class RandomizedSearchCV(BaseSearchCV): expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + Attributes ---------- cv_results_ : dict of numpy (masked) ndarrays diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ae6151a88727b..180c48fc99762 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -134,6 +134,11 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, expensive and is not strictly required to select the parameters that yield the best generalization performance. + .. versionadded:: 0.19 + + .. versionchanged:: 0.21 + Default value was changed from ``True`` to ``False`` + return_estimator : bool, default=False Whether to return the estimators fitted on each split. diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 9eeb4248f83fd..ae17d998882ea 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -390,6 +390,10 @@ def decision_function(self, X): Returns ------- T : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) if len(self.estimators_) == 1: @@ -643,6 +647,10 @@ def decision_function(self, X): Returns ------- Y : array-like of shape (n_samples, n_classes) + + .. versionchanged:: 0.19 + output shape changed to ``(n_samples,)`` to conform to + scikit-learn conventions for binary classification. """ check_is_fitted(self) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 2f8976a86c8b8..8f94a0ae634da 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -362,6 +362,11 @@ def predict_proba(self): such arrays if n_outputs > 1. The class probabilities of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. + + .. versionchanged:: 0.19 + This function now returns a list of arrays where the length of + the list is ``n_outputs``, and each array is (``n_samples``, + ``n_classes``) for that particular output. """ check_is_fitted(self) if not all([hasattr(estimator, "predict_proba") diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 0fdcd597353f5..bf00d8b8f88d2 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -41,6 +41,9 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): If the "manhattan" metric is provided, this centroid is the median and for all other metrics, the centroid is now set to be the mean. + .. versionchanged:: 0.19 + ``metric='precomputed'`` was deprecated and now raises an error + shrink_threshold : float, default=None Threshold for shrinking centroids to remove features. 
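
For reference, the version directives added throughout the patch above follow
the Sphinx convention used in scikit-learn docstrings: ``.. versionadded::``
marks a newly introduced parameter or behaviour and ``.. versionchanged::``
records a change to an existing one, each indented under the entry it
documents. A minimal sketch (hypothetical estimator, for illustration only)::

    class ExampleEstimator:
        """Hypothetical estimator showing where the directives are placed.

        Parameters
        ----------
        alpha : float, default=1.0
            Regularization strength.

            .. versionadded:: 0.19

        solver : str, default='auto'
            Solver to use.

            .. versionchanged:: 0.19
               Default value was changed from ``'lbfgs'`` to ``'auto'``.
        """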
From 6cd77c2c50792127d71cccc8a1296cb8ee178960 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Wed, 15 Apr 2020 18:11:26 +0200 Subject: [PATCH 009/125] API make feature_extraction's constructors' params kwonly (#16866) --- doc/modules/feature_extraction.rst | 2 +- sklearn/feature_extraction/_dict_vectorizer.py | 5 +++-- sklearn/feature_extraction/_hash.py | 5 +++-- sklearn/feature_extraction/image.py | 6 ++++-- .../tests/test_feature_hasher.py | 2 +- sklearn/feature_extraction/text.py | 17 +++++++++-------- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst index 084e110f5c702..cedc43c23c16c 100644 --- a/doc/modules/feature_extraction.rst +++ b/doc/modules/feature_extraction.rst @@ -1019,7 +1019,7 @@ The :class:`PatchExtractor` class works in the same way as implemented as an estimator, so it can be used in pipelines. See:: >>> five_images = np.arange(5 * 4 * 4 * 3).reshape(5, 4, 4, 3) - >>> patches = image.PatchExtractor((2, 2)).transform(five_images) + >>> patches = image.PatchExtractor(patch_size=(2, 2)).transform(five_images) >>> patches.shape (45, 2, 2, 3) diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b527b0d72e6be..303e34d6f0ab9 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array, tosequence +from ..utils.validation import _deprecate_positional_args def _tosequence(X): @@ -89,8 +90,8 @@ class DictVectorizer(TransformerMixin, BaseEstimator): sklearn.preprocessing.OrdinalEncoder : handles nominal/categorical features encoded as columns of arbitrary data types. """ - - def __init__(self, dtype=np.float64, separator="=", sparse=True, + @_deprecate_positional_args + def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True): self.dtype = dtype self.separator = separator diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index d5cfa913991b6..b9c2abaa25a72 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -7,6 +7,7 @@ import scipy.sparse as sp from ..utils import IS_PYPY +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator, TransformerMixin if not IS_PYPY: @@ -88,8 +89,8 @@ class FeatureHasher(TransformerMixin, BaseEstimator): DictVectorizer : vectorizes string-valued features using a hash table. sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features. 
""" - - def __init__(self, n_features=(2 ** 20), input_type="dict", + @_deprecate_positional_args + def __init__(self, n_features=(2 ** 20), *, input_type="dict", dtype=np.float64, alternate_sign=True): self._validate_params(n_features, input_type) diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 588abf3fcf896..737f555bbccda 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..utils import check_array, check_random_state, deprecated +from ..utils.validation import _deprecate_positional_args from ..base import BaseEstimator __all__ = ['PatchExtractor', @@ -519,8 +520,9 @@ class PatchExtractor(BaseEstimator): >>> print('Patches shape: {}'.format(pe_trans.shape)) Patches shape: (545706, 2, 2) """ - - def __init__(self, patch_size=None, max_patches=None, random_state=None): + @_deprecate_positional_args + def __init__(self, *, patch_size=None, max_patches=None, + random_state=None): self.patch_size = patch_size self.max_patches = max_patches self.random_state = random_state diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index 9fa7a191ca279..c0cd50cef6e09 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -32,7 +32,7 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher(n_features, input_type="string", + h = FeatureHasher(n_features=n_features, input_type="string", alternate_sign=False) X = h.transform(it) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ebc584b6271a9..27c5eb437805b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -33,6 +33,7 @@ from ..utils import _IS_32BIT, deprecated from ..utils.fixes import _astype_copy_false from ..exceptions import NotFittedError +from ..utils.validation import _deprecate_positional_args __all__ = ['HashingVectorizer', @@ -677,8 +678,8 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -999,8 +1000,8 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern=r"(?u)\b\w\w+\b", @@ -1409,8 +1410,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): Introduction to Information Retrieval. Cambridge University Press, pp. 118-120. 
""" - - def __init__(self, norm='l2', use_idf=True, smooth_idf=True, + @_deprecate_positional_args + def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False): self.norm = norm self.use_idf = use_idf @@ -1715,8 +1716,8 @@ class TfidfVectorizer(CountVectorizer): >>> print(X.shape) (4, 9) """ - - def __init__(self, input='content', encoding='utf-8', + @_deprecate_positional_args + def __init__(self, *, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern=r"(?u)\b\w\w+\b", From cb9ddbb91fad3c663ae0770d07d5e96601a66875 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Wed, 15 Apr 2020 19:30:28 +0200 Subject: [PATCH 010/125] TST Replace boston in ensemble test_bagging (#16921) --- sklearn/ensemble/tests/test_bagging.py | 48 +++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py index 883f0067f5e78..3e8401332aeef 100644 --- a/sklearn/ensemble/tests/test_bagging.py +++ b/sklearn/ensemble/tests/test_bagging.py @@ -29,7 +29,7 @@ from sklearn.pipeline import make_pipeline from sklearn.feature_selection import SelectKBest from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris, make_hastie_10_2 +from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2 from sklearn.utils import check_random_state from sklearn.preprocessing import FunctionTransformer @@ -44,12 +44,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] # TODO: Remove in 0.24 when DummyClassifier's `strategy` default updates @@ -140,8 +140,8 @@ def fit(self, X, y): def test_regression(): # Check regression for various parameter settings. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "max_features": [0.5, 1.0], @@ -162,8 +162,8 @@ def test_regression(): def test_sparse_regression(): # Check regression for various parameter settings on sparse input. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) class CustomSVR(SVR): @@ -229,8 +229,8 @@ def fit(self, X, y): def test_bootstrap_samples(): # Test that bootstrapping samples generate non-perfect base estimators. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) base_estimator = DecisionTreeRegressor().fit(X_train, y_train) @@ -268,8 +268,8 @@ def test_bootstrap_samples(): def test_bootstrap_features(): # Test that bootstrapping features may generate duplicate features. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -278,7 +278,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] == np.unique(features).shape[0] + assert diabetes.data.shape[1] == np.unique(features).shape[0] ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(), max_features=1.0, @@ -286,7 +286,7 @@ def test_bootstrap_features(): random_state=rng).fit(X_train, y_train) for features in ensemble.estimators_features_: - assert boston.data.shape[1] > np.unique(features).shape[0] + assert diabetes.data.shape[1] > np.unique(features).shape[0] def test_probability(): @@ -355,8 +355,8 @@ def test_oob_score_regression(): # Check that oob prediction is a good estimation of the generalization # error. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(), @@ -383,8 +383,8 @@ def test_oob_score_regression(): def test_single_estimator(): # Check singleton ensembles. rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(), @@ -488,8 +488,8 @@ def test_parallel_regression(): # Check parallel regression. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(DecisionTreeRegressor(), @@ -553,8 +553,8 @@ def test_base_estimator(): assert isinstance(ensemble.base_estimator_, Perceptron) # Regression - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = BaggingRegressor(None, From 9d366a4f745abc57f3d6c3ffe148cd88d5d415fc Mon Sep 17 00:00:00 2001 From: Geoffrey Bolmier Date: Wed, 15 Apr 2020 13:38:22 -0400 Subject: [PATCH 011/125] ENH Add custom loss support for HistGradientBoosting (#16908) --- .../gradient_boosting.py | 10 +++++++-- .../tests/test_gradient_boosting.py | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 796f4f060dda5..6087adb0b6575 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -23,6 +23,7 @@ from .binning import _BinMapper from .grower import TreeGrower from .loss import _LOSSES +from .loss import BaseLoss class BaseHistGradientBoosting(BaseEstimator, ABC): @@ -58,7 +59,8 @@ def _validate_parameters(self): The parameters that are directly passed to the grower are checked in TreeGrower.""" - if self.loss not in self._VALID_LOSSES: + if (self.loss not in self._VALID_LOSSES and + not isinstance(self.loss, BaseLoss)): raise ValueError( "Loss {} is not supported for {}. Accepted losses: " "{}.".format(self.loss, self.__class__.__name__, @@ -150,7 +152,11 @@ def fit(self, X, y, sample_weight=None): # data. 
self._in_fit = True - self.loss_ = self._get_loss(sample_weight=sample_weight) + if isinstance(self.loss, str): + self.loss_ = self._get_loss(sample_weight=sample_weight) + elif isinstance(self.loss, BaseLoss): + self.loss_ = self.loss + if self.early_stopping == 'auto': self.do_early_stopping_ = n_samples > 10000 else: diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 1b61e65793422..6fc412942d180 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -12,6 +12,8 @@ from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES +from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares +from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.utils import shuffle @@ -681,3 +683,22 @@ def test_single_node_trees(Est): for predictor in est._predictors) # Still gives correct predictions thanks to the baseline prediction assert_allclose(est.predict(X), y) + + +@pytest.mark.parametrize('Est, loss, X, y', [ + ( + HistGradientBoostingClassifier, + BinaryCrossEntropy(sample_weight=None), + X_classification, + y_classification + ), + ( + HistGradientBoostingRegressor, + LeastSquares(sample_weight=None), + X_regression, + y_regression + ) +]) +def test_custom_loss(Est, loss, X, y): + est = Est(loss=loss, max_iter=20) + est.fit(X, y) From 9358a6ee8f93511fd615d3264fa7ee9de0f21b93 Mon Sep 17 00:00:00 2001 From: Pierre Delanoue Date: Wed, 15 Apr 2020 20:51:04 +0200 Subject: [PATCH 012/125] DOC Update random_state description for Multiclass (#16839) --- sklearn/multiclass.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index ae17d998882ea..96ec40743fe2c 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -707,10 +707,9 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): than one-vs-the-rest. random_state : int, RandomState instance or None, optional, default: None - The generator used to initialize the codebook. If int, random_state is - the seed used by the random number generator; If RandomState instance, - random_state is the random number generator; If None, the random number - generator is the RandomState instance used by `np.random`. + The generator used to initialize the codebook. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. n_jobs : int or None, optional (default=None) The number of jobs to use for the computation. 
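
The reproducibility behaviour described by the reworded ``random_state``
entry above can be exercised directly; a minimal sketch (illustrative only,
not part of the patch)::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.multiclass import OutputCodeClassifier

    X, y = load_iris(return_X_y=True)
    # fixing random_state fixes the randomly generated codebook, so repeated
    # calls give the same fitted model
    clf = OutputCodeClassifier(LogisticRegression(max_iter=1000),
                               code_size=2, random_state=0)
    print(clf.fit(X, y).score(X, y))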
From abfb6fd11e97cefd1947078646399cecec5bbe9c Mon Sep 17 00:00:00 2001 From: Angela Ambroz Date: Fri, 17 Apr 2020 11:14:21 -0400 Subject: [PATCH 013/125] [MRG] Add jitter to LassoLars (#15179) * Adding jitter to LassoLars fit * CircleCI fail * MR comments * Jitter becomes default, added test based on issue description * flake8 fixes * Removing unexpected cython files * Better coverage * PR comments * PR comments * PR comments * PR comments * PR comments * Linting * Apply suggestions from code review * addressed comments * added whatnew entry * test both estimators * update whatsnew * removed random_state for lassolarsIC Co-authored-by: Nicolas Hug --- doc/whats_new/v0.23.rst | 5 +++ sklearn/linear_model/_least_angle.py | 36 +++++++++++++++++-- .../linear_model/tests/test_least_angle.py | 24 +++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4c489c1887815..c4fa3818aa1bc 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -298,6 +298,11 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. :pr:`16266` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index a3781cf981710..bc71d7a1fccbd 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -21,6 +21,7 @@ from ..base import RegressorMixin, MultiOutputMixin # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore +from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning @@ -800,6 +801,16 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): setting ``fit_path`` to ``False`` will lead to a speedup, especially with a small alpha. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. 
+ Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -846,7 +857,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): def __init__(self, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True): + eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, + jitter=None, random_state=None): self.fit_intercept = fit_intercept self.verbose = verbose self.normalize = normalize @@ -855,6 +867,8 @@ def __init__(self, fit_intercept=True, verbose=False, normalize=True, self.eps = eps self.copy_X = copy_X self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state @staticmethod def _get_gram(precompute, X, y): @@ -954,6 +968,12 @@ def fit(self, X, y, Xy=None): else: max_iter = self.max_iter + if self.jitter is not None: + rng = check_random_state(self.random_state) + + noise = rng.uniform(high=self.jitter, size=len(y)) + y = y + noise + self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path, Xy=Xy) @@ -1031,6 +1051,16 @@ class LassoLars(Lars): algorithm are typically in congruence with the solution of the coordinate descent Lasso estimator. + jitter : float, default=None + Upper bound on a uniform noise parameter to be added to the + `y` values, to satisfy the model's assumption of + one-at-a-time computations. Might help with stability. + + random_state : int, RandomState instance or None (default) + Determines random number generation for jittering. Pass an int + for reproducible output across multiple function calls. + See :term:`Glossary `. Ignored if `jitter` is None. + Attributes ---------- alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \ @@ -1083,7 +1113,7 @@ class LassoLars(Lars): def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - positive=False): + positive=False, jitter=None, random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter @@ -1094,6 +1124,8 @@ def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, self.copy_X = copy_X self.eps = eps self.fit_path = fit_path + self.jitter = jitter + self.random_state = random_state ############################################################################### diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 6e7c1fb37096a..e198dfb15e323 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -6,6 +6,7 @@ import pytest from scipy import linalg +from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal @@ -17,6 +18,7 @@ from sklearn import linear_model, datasets from sklearn.linear_model._least_angle import _lars_path_residues from sklearn.linear_model import LassoLarsIC, lars_path +from sklearn.linear_model import Lars, LassoLars # TODO: use another dataset that has multiple drops diabetes = datasets.load_diabetes() @@ -733,6 +735,28 @@ def test_lasso_lars_fit_copyX_behaviour(copy_X): assert copy_X == np.array_equal(X, X_copy) +@pytest.mark.parametrize('est', (LassoLars(alpha=1e-3), Lars())) +def test_lars_with_jitter(est): + # Test that a small amount of jitter helps stability, + # using example provided in 
issue #2746 + + X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], + [0.0, -1.0, 0.0, 0.0, 0.0]]) + y = [-2.5, -2.5] + expected_coef = [0, 2.5, 0, 2.5, 0] + + # set to fit_intercept to False since target is constant and we want check + # the value of coef. coef would be all zeros otherwise. + est.set_params(fit_intercept=False) + est_jitter = clone(est).set_params(jitter=10e-8, random_state=0) + + est.fit(X, y) + est_jitter.fit(X, y) + + assert np.mean((est.coef_ - est_jitter.coef_)**2) > .1 + np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3) + + def test_X_none_gram_not_none(): with pytest.raises(ValueError, match="X cannot be None if Gram is not None"): From 269afa3a77972e883aa1d64081b8f25d1819d5ac Mon Sep 17 00:00:00 2001 From: Mariana Meireles Date: Fri, 17 Apr 2020 17:32:15 +0200 Subject: [PATCH 014/125] DOC Fixed Plot Mnist Example (#16200) --- examples/neural_networks/plot_mnist_filters.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 57314a218f6ee..33f421a226c33 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -18,10 +18,16 @@ To make the example run faster, we use very few hidden units, and train only for a very short time. Training longer would result in weights with a much -smoother spatial appearance. +smoother spatial appearance. The example will throw a warning because it +doesn't converge, in this case this is what we want because of CI's time +constraints. """ + +import warnings + import matplotlib.pyplot as plt from sklearn.datasets import fetch_openml +from sklearn.exceptions import ConvergenceWarning from sklearn.neural_network import MLPClassifier print(__doc__) @@ -38,7 +44,13 @@ solver='sgd', verbose=10, random_state=1, learning_rate_init=.1) -mlp.fit(X_train, y_train) +# this example won't converge because of CI's time constraints, so we catch the +# warning and are ignore it here +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning, + module="sklearn") + mlp.fit(X_train, y_train) + print("Training set score: %f" % mlp.score(X_train, y_train)) print("Test set score: %f" % mlp.score(X_test, y_test)) From 7a77214c3cf2240b43873c62f11a36a85ebe36a6 Mon Sep 17 00:00:00 2001 From: Hao Chun Chang Date: Fri, 17 Apr 2020 23:33:24 +0800 Subject: [PATCH 015/125] DOC Improve neighbors documentation (#16923) --- sklearn/neighbors/_binary_tree.pxi | 48 ++++++++++++++++++++++++++--- sklearn/neighbors/_dist_metrics.pyx | 4 ++- sklearn/neighbors/_lof.py | 17 +++++----- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index ef6a2a2d5d330..599a4e9cc6426 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -239,9 +239,10 @@ cdef NodeData_t[::1] get_memview_NodeData_1D( # Define doc strings, substituting the appropriate class name using # the DOC_DICT variable defined in the pyx files. CLASS_DOC = \ -"""{BinaryTree} for fast generalized N-point problems +""" +{BinaryTree}(X, leaf_size=40, metric='minkowski', **kwargs) -{BinaryTree}(X, leaf_size=40, metric='minkowski', \\**kwargs) +{BinaryTree} for fast generalized N-point problems Parameters ---------- @@ -1159,15 +1160,50 @@ cdef class BinaryTree: self._update_memviews() def get_tree_stats(self): + """ + get_tree_stats(self) + + Get tree status. 
+ + Returns + ------- + tree_stats: tuple of int + (number of trims, number of leaves, number of splits) + """ return (self.n_trims, self.n_leaves, self.n_splits) def reset_n_calls(self): + """ + reset_n_calls(self) + + Reset number of calls to 0. + """ self.n_calls = 0 def get_n_calls(self): + """ + get_n_calls(self) + + Get number of calls. + + Returns + ------- + n_calls: int + number of distance computation calls + """ return self.n_calls def get_arrays(self): + """ + get_arrays(self) + + Get data and node arrays. + + Returns + ------- + arrays: tuple of array + Arrays for storing tree data, index, node data and node bounds. + """ return (self.data_arr, self.idx_array_arr, self.node_data_arr, self.node_bounds_arr) @@ -1362,7 +1398,8 @@ cdef class BinaryTree: def query_radius(self, X, r, int return_distance=False, int count_only=False, int sort_results=False): """ - query_radius(self, X, r, count_only = False): + query_radius(X, r, return_distance=False, + count_only=False, sort_results=False) query the tree for neighbors within a radius r @@ -1694,7 +1731,10 @@ cdef class BinaryTree: return np.exp(log_density_arr) def two_point_correlation(self, X, r, dualtree=False): - """Compute the two-point correlation function + """ + two_point_correlation(X, r, dualtree=False) + + Compute the two-point correlation function Parameters ---------- diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/neighbors/_dist_metrics.pyx index 94c67f8ee9fa3..0c24efdd214e6 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/neighbors/_dist_metrics.pyx @@ -110,8 +110,10 @@ cdef class DistanceMetric: This class provides a uniform interface to fast distance metric functions. The various metrics can be accessed via the :meth:`get_metric` class method and the metric string identifier (see below). - For example, to use the Euclidean distance: + Examples + -------- + >>> from sklearn.neighbors import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index fc27b7ed69420..e03c4d9cb1e0e 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -176,8 +176,9 @@ def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, @property def fit_predict(self): - """"Fits the model to the training set X and returns the labels. + """Fits the model to the training set X and returns the labels. + **Only available for novelty detection (when novelty is set to True).** Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -207,7 +208,7 @@ def fit_predict(self): return self._fit_predict def _fit_predict(self, X, y=None): - """"Fits the model to the training set X and returns the labels. + """Fits the model to the training set X and returns the labels. Label is 1 for an inlier and -1 for an outlier according to the LOF score and the contamination parameter. @@ -286,9 +287,9 @@ def fit(self, X, y=None): def predict(self): """Predict the labels (1 inlier, -1 outlier) of X according to LOF. + **Only available for novelty detection (when novelty is set to True).** This method allows to generalize prediction to *new observations* (not - in the training set). Only available for novelty detection (when - novelty is set to True). + in the training set). Parameters ---------- @@ -345,8 +346,8 @@ def decision_function(self): Bigger is better, i.e. large values correspond to inliers. 
+ **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -381,8 +382,8 @@ def _decision_function(self, X): Bigger is better, i.e. large values correspond to inliers. + **Only available for novelty detection (when novelty is set to True).** The shift offset allows a zero threshold for being an outlier. - Only available for novelty detection (when novelty is set to True). The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -411,7 +412,7 @@ def score_samples(self): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any @@ -447,7 +448,7 @@ def _score_samples(self, X): It is the opposite as bigger is better, i.e. large values correspond to inliers. - Only available for novelty detection (when novelty is set to True). + **Only available for novelty detection (when novelty is set to True).** The argument X is supposed to contain *new data*: if X contains a point from training, it considers the later in its own neighborhood. Also, the samples in X are not considered in the neighborhood of any From 2d03d781a9f6333f1e3e1be452e37c3340396881 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 17 Apr 2020 13:44:09 -0400 Subject: [PATCH 016/125] MNT simplify xfail check marking logic (#16949) Co-Authored-By: Roman Yurchak --- doc/developers/develop.rst | 29 ++------------ sklearn/base.py | 2 +- sklearn/decomposition/_sparse_pca.py | 2 +- sklearn/dummy.py | 2 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/svm/_classes.py | 2 +- sklearn/utils/estimator_checks.py | 56 ++++++++++++++++++---------- 7 files changed, 46 insertions(+), 49 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index 96aa942fb9238..d8ae6dd224840 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -246,7 +246,9 @@ whether it is just for you or for contributing it to scikit-learn, there are several internals of scikit-learn that you should be aware of in addition to the scikit-learn API outlined above. You can check whether your estimator adheres to the scikit-learn interface and standards by running -:func:`utils.estimator_checks.check_estimator` on the class:: +:func:`utils.estimator_checks.check_estimator` on the class or using +:func:`~sklearn.utils.parametrize_with_checks` pytest decorator (see its +docstring for details and possible interactions with `pytest`):: >>> from sklearn.utils.estimator_checks import check_estimator >>> from sklearn.svm import LinearSVC @@ -257,29 +259,6 @@ interface might be that you want to use it together with model evaluation and selection tools such as :class:`model_selection.GridSearchCV` and :class:`pipeline.Pipeline`. 
-Setting `generate_only=True` returns a generator that yields (estimator, check) -tuples where the check can be called independently from each other, i.e. -`check(estimator)`. This allows all checks to be run independently and report -the checks that are failing. scikit-learn provides a pytest specific decorator, -:func:`~sklearn.utils.parametrize_with_checks`, making it easier to test -multiple estimators:: - - from sklearn.utils.estimator_checks import parametrize_with_checks - from sklearn.linear_model import LogisticRegression - from sklearn.tree import DecisionTreeRegressor - - @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) - def test_sklearn_compatible_estimator(estimator, check): - check(estimator) - -This decorator sets the `id` keyword in `pytest.mark.parameterize` exposing -the name of the underlying estimator and check in the test name. This allows -`pytest -k` to be used to specify which tests to run. - -.. code-block: bash - - pytest test_check_estimators.py -k check_estimators_fit_returns_self - Before detailing the required interface below, we describe two ways to achieve the correct interface more easily. @@ -538,7 +517,7 @@ _skip_test (default=False) whether to skip common tests entirely. Don't use this unless you have a *very good* reason. -_xfail_test (default=False) +_xfail_checks (default=False) dictionary ``{check_name : reason}`` of common checks to mark as a known failure, with the associated reason. Don't use this unless you have a *very good* reason. diff --git a/sklearn/base.py b/sklearn/base.py index 70dec8c030418..8a6041cc17982 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -33,7 +33,7 @@ 'stateless': False, 'multilabel': False, '_skip_test': False, - '_xfail_test': False, + '_xfail_checks': False, 'multioutput_only': False, 'binary_only': False, 'requires_fit': True} diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 888d5d79e1e4b..cf1f5a2608e1c 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -234,7 +234,7 @@ def transform(self, X): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { "check_methods_subset_invariance": "fails for the transform method" } diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 634943231860f..37e9145f7536c 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -358,7 +358,7 @@ def predict_log_proba(self, X): def _more_tags(self): return { 'poor_score': True, 'no_validation': True, - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the predict method' } diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 06e7cc71bad3c..03b69c656b4a3 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -375,7 +375,7 @@ def fit(self, X, y=None): def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method' } diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 46086729af35c..10975a6f8e4a2 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -855,7 +855,7 @@ def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', def _more_tags(self): return { - '_xfail_test': { + '_xfail_checks': { 'check_methods_subset_invariance': 'fails for the decision_function method', 'check_class_weight_classifiers': 'class_weight is ignored.' 
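
For context (not shown in this diff), the renamed tag is exposed through the
estimator tags machinery that the checks below consume. A minimal sketch,
assuming the estimator touched in the hunk above is
:class:`~sklearn.svm.NuSVC`::

    from sklearn.svm import NuSVC

    # known check failures and their reasons now live under the
    # '_xfail_checks' key of the (private) estimator tags
    tags = NuSVC()._get_tags()
    print(tags['_xfail_checks'])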
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 34a0e25c7fcaa..eef9109fb56f5 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -359,38 +359,37 @@ def _generate_class_checks(Estimator): def _mark_xfail_checks(estimator, check, pytest): - """Mark estimator check pairs with xfail""" + """Mark (estimator, check) pairs with xfail according to the + _xfail_checks_ tag""" if isinstance(estimator, type): - # try to construct estimator to get tags, if it is unable to then - # return the estimator class + # try to construct estimator instance, if it is unable to then + # return the estimator class, ignoring the tag try: - xfail_checks = _safe_tags(_construct_instance(estimator), - '_xfail_test') + estimator = _construct_instance(estimator), except Exception: return estimator, check - else: - xfail_checks = _safe_tags(estimator, '_xfail_test') - - if not xfail_checks: - return estimator, check + xfail_checks = _safe_tags(estimator, '_xfail_checks') or {} check_name = _set_check_estimator_ids(check) - msg = xfail_checks.get(check_name, None) - if msg is None: + if check_name not in xfail_checks: + # check isn't part of the xfail_checks tags, just return it return estimator, check - - return pytest.param( - estimator, check, marks=pytest.mark.xfail(reason=msg)) + else: + # check is in the tag, mark it as xfail for pytest + reason = xfail_checks[check_name] + return pytest.param(estimator, check, + marks=pytest.mark.xfail(reason=reason)) def parametrize_with_checks(estimators): """Pytest specific decorator for parametrizing estimator checks. - The `id` of each test is set to be a pprint version of the estimator + The `id` of each check is set to be a pprint version of the estimator and the name of the check with its keyword arguments. + This allows to use `pytest -k` to specify which tests to run:: - Read more in the :ref:`User Guide`. + pytest test_check_estimators.py -k check_estimators_fit_returns_self Parameters ---------- @@ -400,6 +399,17 @@ def parametrize_with_checks(estimators): Returns ------- decorator : `pytest.mark.parametrize` + + Examples + -------- + >>> from sklearn.utils.estimator_checks import parametrize_with_checks + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.tree import DecisionTreeRegressor + + >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) + >>> def test_sklearn_compatible_estimator(estimator, check): + >>> check(estimator) + """ import pytest @@ -419,7 +429,8 @@ def check_estimator(Estimator, generate_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, - shapes, etc. + shapes, etc, making sure that the estimator complies with `scikit-leanrn` + conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin from sklearn.base. @@ -428,7 +439,14 @@ def check_estimator(Estimator, generate_only=False): Classes currently have some additional tests that related to construction, while passing instances allows the testing of multiple options. - Read more in :ref:`rolling_your_own_estimator`. + Setting `generate_only=True` returns a generator that yields (estimator, + check) tuples where the check can be called independently from each + other, i.e. `check(estimator)`. 
This allows all checks to be run + independently and report the checks that are failing. + + scikit-learn provides a pytest specific decorator, + :func:`~sklearn.utils.parametrize_with_checks`, making it easier to test + multiple estimators. Parameters ---------- From 522ecac61330887838722db9007c4be30ecd8744 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 17 Apr 2020 16:19:02 -0400 Subject: [PATCH 017/125] DOC Fix docstring issue in parametrize_with_checks (#16953) --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index eef9109fb56f5..e2a51e94653b7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -407,8 +407,8 @@ def parametrize_with_checks(estimators): >>> from sklearn.tree import DecisionTreeRegressor >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) - >>> def test_sklearn_compatible_estimator(estimator, check): - >>> check(estimator) + ... def test_sklearn_compatible_estimator(estimator, check): + ... check(estimator) """ import pytest From 5abd22f58f152a0a899f33bb22609cc085fbfdec Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Fri, 17 Apr 2020 20:03:15 -0400 Subject: [PATCH 018/125] FIX Bug in mark_xfail_checks (#16954) --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index e2a51e94653b7..1623902b202f3 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -365,7 +365,7 @@ def _mark_xfail_checks(estimator, check, pytest): # try to construct estimator instance, if it is unable to then # return the estimator class, ignoring the tag try: - estimator = _construct_instance(estimator), + estimator = _construct_instance(estimator) except Exception: return estimator, check From cb49ad475155ed482829bb1a0278a5e19b9ca17c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 18 Apr 2020 20:14:22 -0400 Subject: [PATCH 019/125] MNT removed _safe_tags utility (#16950) --- sklearn/tests/test_docstring_parameters.py | 5 +- sklearn/utils/estimator_checks.py | 91 +++++++++----------- sklearn/utils/tests/test_estimator_checks.py | 3 +- 3 files changed, 43 insertions(+), 56 deletions(-) diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index ca2549f2ea4c1..8ea0ec97f9fc2 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -17,7 +17,6 @@ from sklearn.utils._testing import _get_func_name from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import all_estimators -from sklearn.utils.estimator_checks import _safe_tags from sklearn.utils.estimator_checks import _enforce_estimator_tags_y from sklearn.utils.estimator_checks import _enforce_estimator_tags_x from sklearn.utils.deprecation import _is_deprecated @@ -206,9 +205,9 @@ def test_fit_docstring_attributes(name, Estimator): y = _enforce_estimator_tags_y(est, y) X = _enforce_estimator_tags_x(est, X) - if '1dlabels' in _safe_tags(est, 'X_types'): + if '1dlabels' in est._get_tags()['X_types']: est.fit(y) - elif '2dlabels' in _safe_tags(est, 'X_types'): + elif '2dlabels' in est._get_tags()['X_types']: est.fit(np.c_[y, y]) else: est.fit(X, y) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1623902b202f3..351f24b66283e 100644 --- 
a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from ..linear_model import Ridge from ..base import (clone, ClusterMixin, is_classifier, is_regressor, - _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) + RegressorMixin, is_outlier_detector) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection @@ -58,22 +58,9 @@ BOSTON = None CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'] -def _safe_tags(estimator, key=None): - # if estimator doesn't have _get_tags, use _DEFAULT_TAGS - # if estimator has tags but not key, use _DEFAULT_TAGS[key] - if hasattr(estimator, "_get_tags"): - if key is not None: - return estimator._get_tags().get(key, _DEFAULT_TAGS[key]) - tags = estimator._get_tags() - return {key: tags.get(key, _DEFAULT_TAGS[key]) - for key in _DEFAULT_TAGS.keys()} - if key is not None: - return _DEFAULT_TAGS[key] - return _DEFAULT_TAGS - def _yield_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() yield check_no_attributes_set_in_init yield check_estimators_dtypes yield check_fit_score_takes_y @@ -116,7 +103,7 @@ def _yield_checks(name, estimator): def _yield_classifier_checks(name, classifier): - tags = _safe_tags(classifier) + tags = classifier._get_tags() # test classifiers can handle non-array data and pandas objects yield check_classifier_data_not_an_array @@ -171,7 +158,7 @@ def check_supervised_y_no_nan(name, estimator_orig): def _yield_regressor_checks(name, regressor): - tags = _safe_tags(regressor) + tags = regressor._get_tags() # TODO: test with intercept # TODO: test with multiple responses # basic testing @@ -198,12 +185,12 @@ def _yield_regressor_checks(name, regressor): def _yield_transformer_checks(name, transformer): # All transformers should either deal with sparse data or raise an # exception with type TypeError and an intelligible error message - if not _safe_tags(transformer, "no_validation"): + if not transformer._get_tags()["no_validation"]: yield check_transformer_data_not_an_array # these don't actually fit the data, so don't raise errors yield check_transformer_general yield partial(check_transformer_general, readonly_memmap=True) - if not _safe_tags(transformer, "stateless"): + if not transformer._get_tags()["stateless"]: yield check_transformers_unfitted # Dependent on external solvers and hence accessing the iter # param is non-trivial. 
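As a quick illustration of the pattern used throughout this patch (not part of the diff): tags are now read straight from the estimator's own `_get_tags()`, which merges any `_more_tags()` overrides with the package defaults, so the `_safe_tags` fallback is no longer needed:

    from sklearn.linear_model import LogisticRegression

    tags = LogisticRegression()._get_tags()
    assert tags['requires_fit'] is True      # default tag, not overridden
    assert '2darray' in tags['X_types']      # default input type
    assert tags['_xfail_checks'] is False    # no known failing checks declared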
@@ -237,12 +224,12 @@ def _yield_outliers_checks(name, estimator): # test outlier detectors can handle non-array data yield check_classifier_data_not_an_array # test if NotFittedError is raised - if _safe_tags(estimator, "requires_fit"): + if estimator._get_tags()["requires_fit"]: yield check_estimators_unfitted def _yield_all_checks(name, estimator): - tags = _safe_tags(estimator) + tags = estimator._get_tags() if "2darray" not in tags["X_types"]: warnings.warn("Can't test estimator {} which requires input " " of type {}".format(name, tags["X_types"]), @@ -369,7 +356,7 @@ def _mark_xfail_checks(estimator, check, pytest): except Exception: return estimator, check - xfail_checks = _safe_tags(estimator, '_xfail_checks') or {} + xfail_checks = estimator._get_tags()['_xfail_checks'] or {} check_name = _set_check_estimator_ids(check) if check_name not in xfail_checks: @@ -701,7 +688,7 @@ def check_estimator_sparse_data(name, estimator_orig): X[X < .8] = 0 X = _pairwise_estimator_convert_X(X, estimator_orig) X_csr = sparse.csr_matrix(X) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (2 * rng.rand(40)).astype(np.int) else: @@ -767,7 +754,7 @@ def check_sample_weights_pandas_series(name, estimator_orig): X = pd.DataFrame(_pairwise_estimator_convert_X(X, estimator_orig)) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = pd.DataFrame(y) try: estimator.fit(X, y, sample_weight=weights) @@ -792,7 +779,7 @@ def check_sample_weights_not_an_array(name, estimator_orig): X = _NotAnArray(pairwise_estimator_convert_X(X, estimator_orig)) y = _NotAnArray([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = _NotAnArray([1] * 12) - if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: y = _NotAnArray(y.data.reshape(-1, 1)) estimator.fit(X, y, sample_weight=weights) @@ -806,8 +793,8 @@ def check_sample_weights_list(name, estimator_orig): rnd = np.random.RandomState(0) n_samples = 30 X = _pairwise_estimator_convert_X(rnd.uniform(size=(n_samples, 3)), - estimator_orig) - if _safe_tags(estimator, 'binary_only'): + estimator_orig) + if estimator._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -886,7 +873,7 @@ def check_dtype_object(name, estimator_orig): rng = np.random.RandomState(0) X = _pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) X = X.astype(object) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y = (X[:, 0] * 2).astype(np.int) else: @@ -990,7 +977,7 @@ def check_dont_overwrite_parameters(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if _safe_tags(estimator, 'binary_only'): + if estimator._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator, y) @@ -1041,7 +1028,7 @@ def check_fit2d_predict1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) @@ -1092,7 +1079,7 @@ def check_methods_subset_invariance(name, estimator_orig): X = 3 * rnd.uniform(size=(20, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) y = X[:, 0].astype(np.int) - if 
_safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 estimator = clone(estimator_orig) y = _enforce_estimator_tags_y(estimator, y) @@ -1193,7 +1180,7 @@ def check_fit1d(name, estimator_orig): X = 3 * rnd.uniform(size=(20)) y = X.astype(np.int) estimator = clone(estimator_orig) - tags = _safe_tags(estimator) + tags = estimator._get_tags() if tags["no_validation"]: # FIXME this is a bit loose return @@ -1285,7 +1272,7 @@ def _check_transformer(name, transformer_orig, X, y): X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) - if _safe_tags(transformer_orig, 'non_deterministic'): + if transformer_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): @@ -1316,7 +1303,7 @@ def _check_transformer(name, transformer_orig, X, y): # raises error on malformed input for transform if hasattr(X, 'shape') and \ - not _safe_tags(transformer, "stateless") and \ + not transformer._get_tags()["stateless"] and \ X.ndim == 2 and X.shape[1] > 1: # If it's not an array, it does not have a 'T' property @@ -1330,7 +1317,7 @@ def _check_transformer(name, transformer_orig, X, y): @ignore_warnings def check_pipeline_consistency(name, estimator_orig): - if _safe_tags(estimator_orig, 'non_deterministic'): + if estimator_orig._get_tags()['non_deterministic']: msg = name + ' is non deterministic' raise SkipTest(msg) @@ -1365,7 +1352,7 @@ def check_fit_score_takes_y(name, estimator_orig): n_samples = 30 X = rnd.uniform(size=(n_samples, 3)) X = _pairwise_estimator_convert_X(X, estimator_orig) - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y = np.arange(n_samples) % 2 else: y = np.arange(n_samples) % 3 @@ -1398,7 +1385,7 @@ def check_estimators_dtypes(name, estimator_orig): X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) y = X_train_int_64[:, 0] - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: y[y == 2] = 1 y = _enforce_estimator_tags_y(estimator_orig, y) @@ -1534,7 +1521,7 @@ def check_estimators_pickle(name, estimator_orig): X -= X.min() X = _pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() # include NaN values when the estimator should deal with them if tags['allow_nan']: # set randomly 10 elements to np.nan @@ -1599,7 +1586,7 @@ def check_estimators_partial_fit_n_features(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_classifier_multioutput(name, estimator): n_samples, n_labels, n_classes = 42, 5, 3 - tags = _safe_tags(estimator) + tags = estimator._get_tags() estimator = clone(estimator) X, y = make_multilabel_classification(random_state=42, n_samples=n_samples, @@ -1706,7 +1693,7 @@ def check_clustering(name, clusterer_orig, readonly_memmap=False): pred = clusterer.labels_ assert pred.shape == (n_samples,) assert adjusted_rand_score(pred, y) > 0.4 - if _safe_tags(clusterer, 'non_deterministic'): + if clusterer._get_tags()['non_deterministic']: return set_random_state(clusterer) with warnings.catch_warnings(record=True): @@ -1805,7 +1792,7 @@ def check_classifiers_train(name, classifier_orig, readonly_memmap=False, X_m, y_m, X_b, y_b = create_memmap_backed_data([X_m, y_m, X_b, y_b]) problems = [(X_b, y_b)] - tags = _safe_tags(classifier_orig) + tags = classifier_orig._get_tags() if not 
tags['binary_only']: problems.append((X_m, y_m)) @@ -2044,7 +2031,7 @@ def check_classifiers_multilabel_representation_invariance(name, def check_estimators_fit_returns_self(name, estimator_orig, readonly_memmap=False): """Check if self is returned when calling fit""" - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2081,7 +2068,7 @@ def check_estimators_unfitted(name, estimator_orig): @ignore_warnings(category=FutureWarning) def check_supervised_y_2d(name, estimator_orig): - tags = _safe_tags(estimator_orig) + tags = estimator_orig._get_tags() if tags['multioutput_only']: # These only work on 2d, so this test makes no sense return @@ -2197,7 +2184,7 @@ def check_classifiers_classes(name, classifier_orig): y_names_binary = np.take(labels_binary, y_binary) problems = [(X_binary, y_binary, y_names_binary)] - if not _safe_tags(classifier_orig, 'binary_only'): + if not classifier_orig._get_tags()['binary_only']: problems.append((X_multiclass, y_multiclass, y_names_multiclass)) for X, y, y_names in problems: @@ -2282,7 +2269,7 @@ def check_regressors_train(name, regressor_orig, readonly_memmap=False, # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped - if not _safe_tags(regressor, "poor_score"): + if not regressor._get_tags()["poor_score"]: assert regressor.score(X, y_) > 0.5 @@ -2315,7 +2302,7 @@ def check_regressors_no_decision_function(name, regressor_orig): @ignore_warnings(category=FutureWarning) def check_class_weight_classifiers(name, classifier_orig): - if _safe_tags(classifier_orig, 'binary_only'): + if classifier_orig._get_tags()['binary_only']: problems = [2] else: problems = [2, 3] @@ -2418,7 +2405,7 @@ def check_class_weight_balanced_linear_classifier(name, Classifier): @ignore_warnings(category=FutureWarning) def check_estimators_overwrite_params(name, estimator_orig): - if _safe_tags(estimator_orig, 'binary_only'): + if estimator_orig._get_tags()['binary_only']: n_centers = 2 else: n_centers = 3 @@ -2654,13 +2641,13 @@ def enforce_estimator_tags_y(estimator, y): def _enforce_estimator_tags_y(estimator, y): # Estimators with a `requires_positive_y` tag only accept strictly positive # data - if _safe_tags(estimator, "requires_positive_y"): + if estimator._get_tags()["requires_positive_y"]: # Create strictly positive y. The minimal increment above 0 is 1, as # y could be of integer dtype. y += 1 + abs(y.min()) # Estimators in mono_output_task_error raise ValueError if y is of 1-D # Convert into a 2-D y for those estimators. 
- if _safe_tags(estimator, "multioutput_only"): + if estimator._get_tags()["multioutput_only"]: return np.reshape(y, (-1, 1)) return y @@ -2672,11 +2659,11 @@ def _enforce_estimator_tags_x(estimator, X): X = X.dot(X.T) # Estimators with `1darray` in `X_types` tag only accept # X of shape (`n_samples`,) - if '1darray' in _safe_tags(estimator, 'X_types'): + if '1darray' in estimator._get_tags()['X_types']: X = X[:, 0] # Estimators with a `requires_positive_X` tag only accept # strictly positive data - if _safe_tags(estimator, 'requires_positive_X'): + if estimator._get_tags()['requires_positive_X']: X -= X.min() return X @@ -2814,7 +2801,7 @@ def check_classifiers_regression_target(name, estimator_orig): X, y = load_boston(return_X_y=True) e = clone(estimator_orig) msg = 'Unknown label type: ' - if not _safe_tags(e, "no_validation"): + if not e._get_tags()["no_validation"]: assert_raises_regex(ValueError, msg, e.fit, X, y) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a7f4911791467..a755daa842ef5 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -363,7 +363,8 @@ def test_check_estimator(): # check that we have a set_params and can clone msg = "it does not implement a 'get_params' method" assert_raises_regex(TypeError, msg, check_estimator, object) - assert_raises_regex(TypeError, msg, check_estimator, object()) + msg = "object has no attribute '_get_tags'" + assert_raises_regex(AttributeError, msg, check_estimator, object()) # check that values returned by get_params match set_params msg = "get_params result does not match what was passed to set_params" assert_raises_regex(AssertionError, msg, check_estimator, From 5dfca463ac7aef27aec9d3588e0a903e33693119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Gelder?= Date: Sun, 19 Apr 2020 16:21:19 +0200 Subject: [PATCH 020/125] =?UTF-8?q?DOC=20DataConversionWarning:=20Add=20ex?= =?UTF-8?q?ample=20to=20doc=20of=20DataCon=E2=80=A6=20(#16704)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sklearn/exceptions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index ea34365afa703..0083632418c8b 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -85,6 +85,24 @@ class DataConversionWarning(UserWarning): implementation's data-type expectations; - passes an input whose shape can be interpreted ambiguously. + Examples + -------- + >>> from sklearn.utils import validation + >>> Y = [[1],[2],[3]] + >>> import warnings + >>> from sklearn.exceptions import DataConversionWarning + >>> warnings.simplefilter('always', DataConversionWarning) + >>> with warnings.catch_warnings(record=True) as w: + ... try: + ... # will trigger warning as Y is a column-vector + ... Y = validation.column_or_1d(Y,warn=True) + ... except ValueError: + ... pass + ... print(repr(w[-1].message)) + DataConversionWarning('A column-vector y was passed when a + 1d array was expected. Please change the shape of y to + (n_samples, ), for example using ravel().') + .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
""" From a0e6b95540a8ddc7778f90fd60721f4d9fda85cb Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 20 Apr 2020 00:53:29 +1000 Subject: [PATCH 021/125] MNT add pip-wheel-metadata to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9b158da07a2ec..b8ee8d20322c3 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,7 @@ doc/samples *.prof .tox/ .coverage +pip-wheel-metadata lfw_preprocessed/ nips2010_pdf/ From 670b85c9e9cec05210e8596bc1fb9ca66787162f Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Sun, 19 Apr 2020 16:24:20 +0100 Subject: [PATCH 022/125] ENH ColumnTransformer.get_feature_names() handles passthrough (#14048) --- doc/whats_new/v0.23.rst | 5 ++ sklearn/compose/_column_transformer.py | 29 ++++--- .../compose/tests/test_column_transformer.py | 85 ++++++++++++++++--- 3 files changed, 97 insertions(+), 22 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index c4fa3818aa1bc..9343f1ee46da9 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -105,6 +105,11 @@ Changelog a column name that is not unique in the dataframe. :pr:`16431` by `Thomas Fan`_. +- |Enhancement| :class:`compose.ColumnTransformer` method ``get_feature_names`` + now supports `'passthrough'` columns, with the feature name being either + the column name for a dataframe, or `'xi'` for column index `i`. + :pr:`14048` by :user:`Lewis Ball `. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 903c63a00fd22..2ef8876b0c4e7 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -315,19 +315,18 @@ def _validate_remainder(self, X): self.remainder) # Make it possible to check for reordered named columns on transform - if (hasattr(X, 'columns') and - any(_determine_key_type(cols) == 'str' - for cols in self._columns)): + self._has_str_cols = any(_determine_key_type(cols) == 'str' + for cols in self._columns) + if hasattr(X, 'columns'): self._df_columns = X.columns self._n_features = X.shape[1] cols = [] for columns in self._columns: cols.extend(_get_column_indices(X, columns)) - remaining_idx = list(set(range(self._n_features)) - set(cols)) - remaining_idx = sorted(remaining_idx) or None - self._remainder = ('remainder', self.remainder, remaining_idx) + remaining_idx = sorted(set(range(self._n_features)) - set(cols)) + self._remainder = ('remainder', self.remainder, remaining_idx or None) @property def named_transformers_(self): @@ -356,11 +355,18 @@ def get_feature_names(self): if trans == 'drop' or ( hasattr(column, '__len__') and not len(column)): continue - elif trans == 'passthrough': - raise NotImplementedError( - "get_feature_names is not yet supported when using " - "a 'passthrough' transformer.") - elif not hasattr(trans, 'get_feature_names'): + if trans == 'passthrough': + if hasattr(self, '_df_columns'): + if ((not isinstance(column, slice)) + and all(isinstance(col, str) for col in column)): + feature_names.extend(column) + else: + feature_names.extend(self._df_columns[column]) + else: + indices = np.arange(self._n_features) + feature_names.extend(['x%d' % i for i in indices[column]]) + continue + if not hasattr(trans, 'get_feature_names'): raise AttributeError("Transformer %s (type %s) does not " "provide get_feature_names." % (str(name), type(trans).__name__)) @@ -582,6 +588,7 @@ def transform(self, X): # name order and count. 
See #14237 for details. if (self._remainder[2] is not None and hasattr(self, '_df_columns') and + self._has_str_cols and hasattr(X, 'columns')): n_cols_fit = len(self._df_columns) n_cols_transform = len(X.columns) diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index ca1c185c91e06..a9f1764eb97e4 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -668,25 +668,88 @@ def test_column_transformer_get_feature_names(): ct.fit(X) assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1__c'] - # passthrough transformers not supported + # drop transformer + ct = ColumnTransformer( + [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct.fit(X) + assert ct.get_feature_names() == ['col0__a', 'col0__b'] + + # passthrough transformer ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['x0', 'x1'] ct = ColumnTransformer([('trans', DictVectorizer(), 0)], remainder='passthrough') ct.fit(X) - assert_raise_message( - NotImplementedError, 'get_feature_names is not yet supported', - ct.get_feature_names) + assert ct.get_feature_names() == ['trans__a', 'trans__b', 'x1'] - # drop transformer - ct = ColumnTransformer( - [('col0', DictVectorizer(), 0), ('col1', 'drop', 1)]) + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') ct.fit(X) - assert ct.get_feature_names() == ['col0__a', 'col0__b'] + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', lambda x: [1])], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X) + assert ct.get_feature_names() == ['x1', 'x0'] + + +def test_column_transformer_get_feature_names_dataframe(): + # passthough transformer with a dataframe + pd = pytest.importorskip('pandas') + X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], + [{'c': 5}, {'c': 6}]], dtype=object).T + X_df = pd.DataFrame(X, columns=['col0', 'col1']) + + ct = ColumnTransformer([('trans', 'passthrough', ['col0', 'col1'])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', [0, 1])]) + ct.fit(X_df) + assert ct.get_feature_names() == ['col0', 'col1'] + + ct = ColumnTransformer([('col0', DictVectorizer(), 0)], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col0__a', 'col0__b', 'col1'] + + ct = ColumnTransformer([('trans', 'passthrough', ['col1'])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', + lambda x: x[['col1']].columns)], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', np.array([False, True]))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', slice(1, 2))], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 
'col0'] + + ct = ColumnTransformer([('trans', 'passthrough', [1])], + remainder='passthrough') + ct.fit(X_df) + assert ct.get_feature_names() == ['col1', 'col0'] def test_column_transformer_special_strings(): From 4d9478f433b8760018fcab16498f1bf4b83bf187 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 19 Apr 2020 16:55:50 -0400 Subject: [PATCH 023/125] DOC User Guide and docs for LDA and QDA (#16243) --- doc/modules/lda_qda.rst | 179 ++++++++++++++++++++----------- sklearn/discriminant_analysis.py | 141 +++++++++++++++--------- 2 files changed, 207 insertions(+), 113 deletions(-) diff --git a/doc/modules/lda_qda.rst b/doc/modules/lda_qda.rst index e1dfb0c03ea4b..c3ac94dedefa9 100644 --- a/doc/modules/lda_qda.rst +++ b/doc/modules/lda_qda.rst @@ -7,9 +7,9 @@ Linear and Quadratic Discriminant Analysis .. currentmodule:: sklearn Linear Discriminant Analysis -(:class:`discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic +(:class:`~discriminant_analysis.LinearDiscriminantAnalysis`) and Quadratic Discriminant Analysis -(:class:`discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic +(:class:`~discriminant_analysis.QuadraticDiscriminantAnalysis`) are two classic classifiers, with, as their names suggest, a linear and a quadratic decision surface, respectively. @@ -37,68 +37,59 @@ flexible. Dimensionality reduction using Linear Discriminant Analysis =========================================================== -:class:`discriminant_analysis.LinearDiscriminantAnalysis` can be used to +:class:`~discriminant_analysis.LinearDiscriminantAnalysis` can be used to perform supervised dimensionality reduction, by projecting the input data to a linear subspace consisting of the directions which maximize the separation between classes (in a precise sense discussed in the mathematics section below). The dimension of the output is necessarily less than the number of -classes, so this is, in general, a rather strong dimensionality reduction, and +classes, so this is in general a rather strong dimensionality reduction, and only makes sense in a multiclass setting. -This is implemented in -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform`. The desired -dimensionality can be set using the ``n_components`` constructor parameter. -This parameter has no influence on -:func:`discriminant_analysis.LinearDiscriminantAnalysis.fit` or -:func:`discriminant_analysis.LinearDiscriminantAnalysis.predict`. +This is implemented in the `transform` method. The desired dimensionality can +be set using the ``n_components`` parameter. This parameter has no influence +on the `fit` and `predict` methods. .. topic:: Examples: :ref:`sphx_glr_auto_examples_decomposition_plot_pca_vs_lda.py`: Comparison of LDA and PCA for dimensionality reduction of the Iris dataset +.. _lda_qda_math: + Mathematical formulation of the LDA and QDA classifiers ======================================================= Both LDA and QDA can be derived from simple probabilistic models which model the class conditional distribution of the data :math:`P(X|y=k)` for each class -:math:`k`. Predictions can then be obtained by using Bayes' rule: +:math:`k`. Predictions can then be obtained by using Bayes' rule, for each +training sample :math:`x \in \mathcal{R}^d`: .. 
math:: - P(y=k | X) = \frac{P(X | y=k) P(y=k)}{P(X)} = \frac{P(X | y=k) P(y = k)}{ \sum_{l} P(X | y=l) \cdot P(y=l)} + P(y=k | x) = \frac{P(x | y=k) P(y=k)}{P(x)} = \frac{P(x | y=k) P(y = k)}{ \sum_{l} P(x | y=l) \cdot P(y=l)} -and we select the class :math:`k` which maximizes this conditional probability. +and we select the class :math:`k` which maximizes this posterior probability. More specifically, for linear and quadratic discriminant analysis, -:math:`P(X|y)` is modeled as a multivariate Gaussian distribution with +:math:`P(x|y)` is modeled as a multivariate Gaussian distribution with density: -.. math:: P(X | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (X-\mu_k)^t \Sigma_k^{-1} (X-\mu_k)\right) +.. math:: P(x | y=k) = \frac{1}{(2\pi)^{d/2} |\Sigma_k|^{1/2}}\exp\left(-\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k)\right) where :math:`d` is the number of features. -To use this model as a classifier, we just need to estimate from the training -data the class priors :math:`P(y=k)` (by the proportion of instances of class -:math:`k`), the class means :math:`\mu_k` (by the empirical sample class means) -and the covariance matrices (either by the empirical sample class covariance -matrices, or by a regularized estimator: see the section on shrinkage below). +QDA +--- -In the case of LDA, the Gaussians for each class are assumed to share the same -covariance matrix: :math:`\Sigma_k = \Sigma` for all :math:`k`. This leads to -linear decision surfaces, which can be seen by comparing the -log-probability ratios :math:`\log[P(y=k | X) / P(y=l | X)]`: +According to the model above, the log of the posterior is: .. math:: - \log\left(\frac{P(y=k|X)}{P(y=l|X)}\right)= - \log\left(\frac{P(X|y=k)P(y=k)}{P(X|y=l)P(y=l)}\right)=0 \Leftrightarrow - (\mu_k-\mu_l)^t\Sigma^{-1} X = - \frac{1}{2} (\mu_k^t \Sigma^{-1} \mu_k - \mu_l^t \Sigma^{-1} \mu_l) - - \log\frac{P(y=k)}{P(y=l)} + \log P(y=k | x) &= \log P(x | y=k) + \log P(y = k) + Cst \\ + &= -\frac{1}{2} \log |\Sigma_k| -\frac{1}{2} (x-\mu_k)^t \Sigma_k^{-1} (x-\mu_k) + \log P(y = k) + Cst, -In the case of QDA, there are no assumptions on the covariance matrices -:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. See -[#1]_ for more details. +where the constant term :math:`Cst` corresponds to the denominator +:math:`P(x)`, in addition to other constant terms from the Gaussian. The +predicted class is the one that maximises this log-posterior. .. note:: **Relation with Gaussian Naive Bayes** @@ -107,22 +98,60 @@ In the case of QDA, there are no assumptions on the covariance matrices and the resulting classifier is equivalent to the Gaussian Naive Bayes classifier :class:`naive_bayes.GaussianNB`. -Mathematical formulation of LDA dimensionality reduction -======================================================== +LDA +--- + +LDA is a special case of QDA, where the Gaussians for each class are assumed +to share the same covariance matrix: :math:`\Sigma_k = \Sigma` for all +:math:`k`. This reduces the log posterior to: -To understand the use of LDA in dimensionality reduction, it is useful to start -with a geometric reformulation of the LDA classification rule explained above. -We write :math:`K` for the total number of target classes. Since in LDA we -assume that all classes have the same estimated covariance :math:`\Sigma`, we -can rescale the data so that this covariance is the identity: +.. math:: \log P(y=k | x) = -\frac{1}{2} (x-\mu_k)^t \Sigma^{-1} (x-\mu_k) + \log P(y = k) + Cst. 
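A small numeric sketch (not part of the patch, using the iris data purely as an example) of the rule just stated: for both estimators the decision values act as the per-class log-posteriors up to a constant, so the predicted class is simply the one with the largest decision value:

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                                QuadraticDiscriminantAnalysis)

    X, y = load_iris(return_X_y=True)
    for est in (LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()):
        est.fit(X, y)
        scores = est.decision_function(X)  # shape (n_samples, n_classes)
        # predict() returns the class maximising the (log-)posterior
        assert np.array_equal(est.classes_[scores.argmax(axis=1)],
                              est.predict(X))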
+ +The term :math:`(x-\mu_k)^t \Sigma^{-1} (x-\mu_k)` corresponds to the +`Mahalanobis Distance `_ +between the sample :math:`x` and the mean :math:`\mu_k`. The Mahalanobis +distance tells how close :math:`x` is from :math:`\mu_k`, while also +accounting for the variance of each feature. We can thus interpret LDA as +assigning :math:`x` to the class whose mean is the closest in terms of +Mahalanobis distance, while also accounting for the class prior +probabilities. + +The log-posterior of LDA can also be written [3]_ as: + +.. math:: -.. math:: X^* = D^{-1/2}U^t X\text{ with }\Sigma = UDU^t + \log P(y=k | x) = \omega_k^t x + \omega_{k0} + Cst. -Then one can show that to classify a data point after scaling is equivalent to -finding the estimated class mean :math:`\mu^*_k` which is closest to the data -point in the Euclidean distance. But this can be done just as well after -projecting on the :math:`K-1` affine subspace :math:`H_K` generated by all the -:math:`\mu^*_k` for all classes. This shows that, implicit in the LDA +where :math:`\omega_k = \Sigma^{-1} \mu_k` and :math:`\omega_{k0} = +-\frac{1}{2} \mu_k^t\Sigma^{-1}\mu_k + \log P (y = k)`. These quantities +correspond to the `coef_` and `intercept_` attributes, respectively. + +From the above formula, it is clear that LDA has a linear decision surface. +In the case of QDA, there are no assumptions on the covariance matrices +:math:`\Sigma_k` of the Gaussians, leading to quadratic decision surfaces. +See [1]_ for more details. + +Mathematical formulation of LDA dimensionality reduction +======================================================== + +First note that the K means :math:`\mu_k` are vectors in +:math:`\mathcal{R}^d`, and they lie in an affine subspace :math:`H` of +dimension at least :math:`K - 1` (2 points lie on a line, 3 points lie on a +plane, etc). + +As mentioned above, we can interpret LDA as assigning :math:`x` to the class +whose mean :math:`\mu_k` is the closest in terms of Mahalanobis distance, +while also accounting for the class prior probabilities. Alternatively, LDA +is equivalent to first *sphering* the data so that the covariance matrix is +the identity, and then assigning :math:`x` to the closest mean in terms of +Euclidean distance (still accounting for the class priors). + +Computing Euclidean distances in this d-dimensional space is equivalent to +first projecting the data points into :math:`H`, and computing the distances +there (since the other dimensions will contribute equally to each class in +terms of distance). In other words, if :math:`x` is closest to :math:`\mu_k` +in the original space, it will also be the case in :math:`H`. +This shows that, implicit in the LDA classifier, there is a dimensionality reduction by linear projection onto a :math:`K-1` dimensional space. @@ -131,19 +160,22 @@ onto the linear subspace :math:`H_L` which maximizes the variance of the :math:`\mu^*_k` after projection (in effect, we are doing a form of PCA for the transformed class means :math:`\mu^*_k`). This :math:`L` corresponds to the ``n_components`` parameter used in the -:func:`discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See -[#1]_ for more details. +:func:`~discriminant_analysis.LinearDiscriminantAnalysis.transform` method. See +[1]_ for more details. Shrinkage ========= -Shrinkage is a tool to improve estimation of covariance matrices in situations -where the number of training samples is small compared to the number of -features. 
In this scenario, the empirical sample covariance is a poor -estimator. Shrinkage LDA can be used by setting the ``shrinkage`` parameter of -the :class:`discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. +Shrinkage is a form of regularization used to improve the estimation of +covariance matrices in situations where the number of training samples is +small compared to the number of features. +In this scenario, the empirical sample covariance is a poor +estimator, and shrinkage helps improving the generalization performance of +the classifier. +Shrinkage LDA can be used by setting the ``shrinkage`` parameter of +the :class:`~discriminant_analysis.LinearDiscriminantAnalysis` class to 'auto'. This automatically determines the optimal shrinkage parameter in an analytic -way following the lemma introduced by Ledoit and Wolf [#2]_. Note that +way following the lemma introduced by Ledoit and Wolf [2]_. Note that currently shrinkage only works when setting the ``solver`` parameter to 'lsqr' or 'eigen'. @@ -165,13 +197,33 @@ matrix. Estimation algorithms ===================== -The default solver is 'svd'. It can perform both classification and transform, -and it does not rely on the calculation of the covariance matrix. This can be -an advantage in situations where the number of features is large. However, the -'svd' solver cannot be used with shrinkage. - -The 'lsqr' solver is an efficient algorithm that only works for classification. -It supports shrinkage. +Using LDA and QDA requires computing the log-posterior which depends on the +class priors :math:`P(y=k)`, the class means :math:`\mu_k`, and the +covariance matrices. + +The 'svd' solver is the default solver used for +:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`, and it is +the only available solver for +:class:`~sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis`. +It can perform both classification and transform (for LDA). +As it does not rely on the calculation of the covariance matrix, the 'svd' +solver may be preferable in situations where the number of features is large. +The 'svd' solver cannot be used with shrinkage. +For QDA, the use of the SVD solver relies on the fact that the covariance +matrix :math:`\Sigma_k` is, by definition, equal to :math:`\frac{1}{n - 1} +X_k^tX_k = V S^2 V^t` where :math:`V` comes from the SVD of the (centered) +matrix: :math:`X_k = U S V^t`. It turns out that we can compute the +log-posterior above without having to explictly compute :math:`\Sigma`: +computing :math:`S` and :math:`V` via the SVD of :math:`X` is enough. For +LDA, two SVDs are computed: the SVD of the centered input matrix :math:`X` +and the SVD of the class-wise mean vectors. + +The 'lsqr' solver is an efficient algorithm that only works for +classification. It needs to explicitly compute the covariance matrix +:math:`\Sigma`, and supports shrinkage. This solver computes the coefficients +:math:`\omega_k = \Sigma^{-1}\mu_k` by solving for :math:`\Sigma \omega = +\mu_k`, thus avoiding the explicit computation of the inverse +:math:`\Sigma^{-1}`. The 'eigen' solver is based on the optimization of the between class scatter to within class scatter ratio. It can be used for both classification and @@ -186,8 +238,11 @@ a high number of features. .. topic:: References: - .. [#1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., + .. [1] "The Elements of Statistical Learning", Hastie T., Tibshirani R., Friedman J., Section 4.3, p.106-119, 2008. - .. [#2] Ledoit O, Wolf M. 
Honey, I Shrunk the Sample Covariance Matrix. + .. [2] Ledoit O, Wolf M. Honey, I Shrunk the Sample Covariance Matrix. The Journal of Portfolio Management 30(4), 110-119, 2004. + + .. [3] R. O. Duda, P. E. Hart, D. G. Stork. Pattern Classification + (Second Edition), section 2.6.2. diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index 45e623904b9ea..b07570a3f0a75 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -94,7 +94,9 @@ def _class_means(X, y): def _class_cov(X, y, priors, shrinkage=None): - """Compute class covariance matrix. + """Compute weighted within-class covariance matrix. + + The per-class covariance are weighted by the class priors. Parameters ---------- @@ -116,7 +118,7 @@ def _class_cov(X, y, priors, shrinkage=None): Returns ------- cov : array-like of shape (n_features, n_features) - Class covariance matrix. + Weighted within-class covariance matrix """ classes = np.unique(y) cov = np.zeros(shape=(X.shape[1], X.shape[1])) @@ -137,7 +139,8 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, share the same covariance matrix. The fitted model can also be used to reduce the dimensionality of the input - by projecting it to the most discriminative directions. + by projecting it to the most discriminative directions, using the + `transform` method. .. versionadded:: 0.17 *LinearDiscriminantAnalysis*. @@ -163,21 +166,27 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Note that shrinkage works only with 'lsqr' and 'eigen' solvers. priors : array-like of shape (n_classes,), default=None - Class priors. + The class prior probabilities. By default, the class proportions are + inferred from the training data. n_components : int, default=None Number of components (<= min(n_classes - 1, n_features)) for dimensionality reduction. If None, will be set to - min(n_classes - 1, n_features). + min(n_classes - 1, n_features). This parameter only affects the + `transform` method. store_covariance : bool, default=False - Additionally compute class covariance matrix (default False), used - only in 'svd' solver. + If True, explicitely compute the weighted within-class covariance + matrix when solver is 'svd'. The matrix is always computed + and stored for the other solvers. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation in SVD solver. + Absolute threshold for a singular value of X to be considered + significant, used to estimate the rank of X. Dimensions whose + singular values are non-significant are discarded. Only used if + solver is 'svd'. .. versionadded:: 0.17 @@ -190,8 +199,11 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, Intercept term. covariance_ : array-like of shape (n_features, n_features) - Covariance matrix (shared by all classes). Only available - `store_covariance` is True. + Weighted within-class covariance matrix. It corresponds to + `sum_k prior_k * C_k` where `C_k` is the covariance matrix of the + samples in class `k`. The `C_k` are estimated using the (potentially + shrunk) biased estimator of covariance. If solver is 'svd', only + exists when `store_covariance` is True. explained_variance_ratio_ : ndarray of shape (n_components,) Percentage of variance explained by each of the selected components. @@ -200,16 +212,17 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, or svd solver is used. means_ : array-like of shape (n_classes, n_features) - Class means. 
+ Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). scalings_ : array-like of shape (rank, n_classes - 1) Scaling of the features in the space spanned by the class centroids. + Only available for 'svd' and 'eigen' solvers. xbar_ : array-like of shape (n_features,) - Overall mean. + Overall mean. Only present if solver is 'svd'. classes_ : array-like of shape (n_classes,) Unique class labels. @@ -219,22 +232,6 @@ class LinearDiscriminantAnalysis(BaseEstimator, LinearClassifierMixin, sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis: Quadratic Discriminant Analysis - Notes - ----- - The default solver is 'svd'. It can perform both classification and - transform, and it does not rely on the calculation of the covariance - matrix. This can be an advantage in situations where the number of features - is large. However, the 'svd' solver cannot be used with shrinkage. - - The 'lsqr' solver is an efficient algorithm that only works for - classification. It supports shrinkage. - - The 'eigen' solver is based on the optimization of the between class - scatter to within class scatter ratio. It can be used for both - classification and transform, and it supports shrinkage. However, the - 'eigen' solver needs to compute the covariance matrix, so it might not be - suitable for situations with a high number of features. - Examples -------- >>> import numpy as np @@ -542,6 +539,29 @@ def predict_log_proba(self, X): """ return np.log(self.predict_proba(X)) + def decision_function(self, X): + """Apply decision function to an array of samples. + + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Array of samples (test vectors). + + Returns + ------- + C : ndarray of shape (n_samples,) or (n_samples, n_classes) + Decision function values related to each class, per sample. + In the two-class case, the shape is (n_samples,), giving the + log likelihood ratio of the positive class. + """ + # Only override for the doc + return super().decision_function(X) + class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): """Quadratic Discriminant Analysis @@ -560,47 +580,60 @@ class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator): Parameters ---------- priors : ndarray of shape (n_classes,), default=None - Priors on classes + Class priors. By default, the class proportions are inferred from the + training data. reg_param : float, default=0.0 - Regularizes the covariance estimate as - ``(1-reg_param)*Sigma + reg_param*np.eye(n_features)`` + Regularizes the per-class covariance estimates by transforming S2 as + ``S2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features)``, + where S2 corresponds to the `scaling_` attribute of a given class. store_covariance : bool, default=False - If True the covariance matrices are computed and stored in the - `self.covariance_` attribute. + If True, the class covariance matrices are explicitely computed and + stored in the `self.covariance_` attribute. .. versionadded:: 0.17 tol : float, default=1.0e-4 - Threshold used for rank estimation. + Absolute threshold for a singular value to be considered significant, + used to estimate the rank of `Xk` where `Xk` is the centered matrix + of samples in class k. 
This parameter does not affect the + predictions. It only controls a warning that is raised when features + are considered to be colinear. .. versionadded:: 0.17 Attributes ---------- - covariance_ : list of array-like of shape (n_features, n_features) - Covariance matrices of each class. Only available + covariance_ : list of len n_classes of ndarray \ + of shape (n_features, n_features) + For each class, gives the covariance matrix estimated using the + samples of that class. The estimations are unbiased. Only present if `store_covariance` is True. means_ : array-like of shape (n_classes, n_features) - Class means. + Class-wise means. priors_ : array-like of shape (n_classes,) Class priors (sum to 1). - rotations_ : list of ndarrays - For each class k an array of shape (n_features, n_k), with + rotations_ : list of len n_classes of ndarray of shape (n_features, n_k) + For each class k an array of shape (n_features, n_k), where ``n_k = min(n_features, number of elements in class k)`` It is the rotation of the Gaussian distribution, i.e. its - principal axis. - - scalings_ : list of ndarrays - For each class k an array of shape (n_k,). It contains the scaling - of the Gaussian distributions along its principal axes, i.e. the - variance in the rotated coordinate system. - - classes_ : array-like of shape (n_classes,) + principal axis. It corresponds to `V`, the matrix of eigenvectors + coming from the SVD of `Xk = U S Vt` where `Xk` is the centered + matrix of samples from class k. + + scalings_ : list of len n_classes of ndarray of shape (n_k,) + For each class, contains the scaling of + the Gaussian distributions along its principal axes, i.e. the + variance in the rotated coordinate system. It corresponds to `S^2 / + (n_samples - 1)`, where `S` is the diagonal matrix of singular values + from the SVD of `Xk`, where `Xk` is the centered matrix of samples + from class k. + + classes_ : ndarray of shape (n_classes,) Unique class labels. Examples @@ -676,7 +709,7 @@ def fit(self, X, y): 'is ill defined.' % str(self.classes_[ind])) Xgc = Xg - meang # Xgc = U * S * V.T - U, S, Vt = np.linalg.svd(Xgc, full_matrices=False) + _, S, Vt = np.linalg.svd(Xgc, full_matrices=False) rank = np.sum(S > self.tol) if rank < n_features: warnings.warn("Variables are collinear") @@ -695,6 +728,7 @@ def fit(self, X, y): return self def _decision_function(self, X): + # return log posterior, see eq (4.12) p. 110 of the ESL. check_is_fitted(self) X = check_array(X) @@ -704,7 +738,7 @@ def _decision_function(self, X): S = self.scalings_[i] Xm = X - self.means_[i] X2 = np.dot(Xm, R * (S ** (-0.5))) - norm2.append(np.sum(X2 ** 2, 1)) + norm2.append(np.sum(X2 ** 2, axis=1)) norm2 = np.array(norm2).T # shape = [len(X), n_classes] u = np.asarray([np.sum(np.log(s)) for s in self.scalings_]) return (-0.5 * (norm2 + u) + np.log(self.priors_)) @@ -712,6 +746,11 @@ def _decision_function(self, X): def decision_function(self, X): """Apply decision function to an array of samples. + The decision function is equal (up to a constant factor) to the + log-posterior of the model, i.e. `log p(y = k | x)`. In a binary + classification setting this instead corresponds to the difference + `log p(y = 1 | x) - log p(y = 0 | x)`. See :ref:`lda_qda_math`. + Parameters ---------- X : array-like of shape (n_samples, n_features) @@ -721,7 +760,7 @@ def decision_function(self, X): ------- C : ndarray of shape (n_samples,) or (n_samples, n_classes) Decision function values related to each class, per sample. 
- In the two-class case, the shape is [n_samples,], giving the + In the two-class case, the shape is (n_samples,), giving the log likelihood ratio of the positive class. """ dec_func = self._decision_function(X) @@ -768,7 +807,7 @@ def predict_proba(self, X): return likelihood / likelihood.sum(axis=1)[:, np.newaxis] def predict_log_proba(self, X): - """Return posterior probabilities of classification. + """Return log of posterior probabilities of classification. Parameters ---------- From dc0cc6e9bec5a9fac6358384ba7a827adc62d87c Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Sun, 19 Apr 2020 22:58:47 +0200 Subject: [PATCH 024/125] [MRG] API kwonly args in impute, inspection, kernel_ridge (#16926) , and linear_model --- sklearn/impute/_base.py | 10 ++++--- sklearn/impute/_iterative.py | 3 +-- sklearn/impute/_knn.py | 5 ++-- sklearn/impute/tests/test_impute.py | 4 +-- sklearn/inspection/_partial_dependence.py | 4 ++- sklearn/inspection/_permutation_importance.py | 4 ++- .../inspection/_plot/partial_dependence.py | 15 ++++++++--- sklearn/kernel_ridge.py | 6 +++-- sklearn/linear_model/_base.py | 5 ++-- sklearn/linear_model/_bayes.py | 9 ++++--- sklearn/linear_model/_coordinate_descent.py | 27 +++++++++++++------ sklearn/linear_model/_huber.py | 5 ++-- sklearn/linear_model/_least_angle.py | 22 ++++++++++----- sklearn/linear_model/_logistic.py | 8 +++--- sklearn/linear_model/_omp.py | 18 ++++++++----- sklearn/linear_model/_passive_aggressive.py | 7 +++-- sklearn/linear_model/_perceptron.py | 5 ++-- sklearn/linear_model/_ransac.py | 5 ++-- sklearn/linear_model/_ridge.py | 26 ++++++++++-------- sklearn/linear_model/_sag.py | 2 ++ sklearn/linear_model/_stochastic_gradient.py | 19 ++++++++----- sklearn/linear_model/_theil_sen.py | 5 ++-- sklearn/linear_model/tests/test_omp.py | 17 +++++++----- sklearn/linear_model/tests/test_ransac.py | 4 ++- 24 files changed, 151 insertions(+), 84 deletions(-) diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 608f8f54ee162..5f1069708a20e 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -14,6 +14,7 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -67,7 +68,7 @@ class _BaseImputer(TransformerMixin, BaseEstimator): It adds automatically support for `add_indicator`. """ - def __init__(self, missing_values=np.nan, add_indicator=False): + def __init__(self, *, missing_values=np.nan, add_indicator=False): self.missing_values = missing_values self.add_indicator = add_indicator @@ -205,7 +206,8 @@ class SimpleImputer(_BaseImputer): upon :meth:`transform` if strategy is not "constant". 
""" - def __init__(self, missing_values=np.nan, strategy="mean", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True, add_indicator=False): super().__init__( missing_values=missing_values, @@ -525,8 +527,8 @@ class MissingIndicator(TransformerMixin, BaseEstimator): [False, False]]) """ - - def __init__(self, missing_values=np.nan, features="missing-only", + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, features="missing-only", sparse="auto", error_on_new=True): self.missing_values = missing_values self.features = features diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 88eff8503d510..17a3d05507205 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -206,9 +206,8 @@ class IterativeImputer(_BaseImputer): Journal of the Royal Statistical Society 22(2): 302-306. `_ """ - def __init__(self, - estimator=None, + estimator=None, *, missing_values=np.nan, sample_posterior=False, max_iter=10, diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index f782a46a6b40d..44fccf024247e 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -14,6 +14,7 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args class KNNImputer(_BaseImputer): @@ -94,8 +95,8 @@ class KNNImputer(_BaseImputer): [5.5, 6. , 5. ], [8. , 8. , 7. ]]) """ - - def __init__(self, missing_values=np.nan, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, *, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="nan_euclidean", copy=True, add_indicator=False): super().__init__( diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 50f60ff6e96ad..58c71660b401d 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -48,14 +48,14 @@ def _check_statistics(X, X_true, assert_ae = assert_array_almost_equal # Normal matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False)) assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix - imputer = SimpleImputer(missing_values, strategy=strategy) + imputer = SimpleImputer(missing_values=missing_values, strategy=strategy) imputer.fit(sparse.csc_matrix(X)) X_trans = imputer.transform(sparse.csc_matrix(X.copy())) diff --git a/sklearn/inspection/_partial_dependence.py b/sklearn/inspection/_partial_dependence.py index f0fbc23333266..f3bb10a1a3275 100644 --- a/sklearn/inspection/_partial_dependence.py +++ b/sklearn/inspection/_partial_dependence.py @@ -20,6 +20,7 @@ from ..utils import _determine_key_type from ..utils import _get_column_indices from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -181,7 +182,8 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions -def partial_dependence(estimator, X, features, response_method='auto', +@_deprecate_positional_args +def partial_dependence(estimator, X, features, *, response_method='auto', percentiles=(0.05, 0.95), 
grid_resolution=100, method='auto'): """Partial dependence of ``features``. diff --git a/sklearn/inspection/_permutation_importance.py b/sklearn/inspection/_permutation_importance.py index 8efafd8a7eef4..e8d77360a7ca0 100644 --- a/sklearn/inspection/_permutation_importance.py +++ b/sklearn/inspection/_permutation_importance.py @@ -7,6 +7,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, @@ -37,7 +38,8 @@ def _calculate_permutation_scores(estimator, X, y, col_idx, random_state, return scores -def permutation_importance(estimator, X, y, scoring=None, n_repeats=5, +@_deprecate_positional_args +def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None): """Permutation importance for feature evaluation [BRE]_. diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index f39c604cac77b..4a83ac057d91e 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -13,9 +13,11 @@ from ...utils import check_array from ...utils import check_matplotlib_support # noqa from ...utils import _safe_indexing +from ...utils.validation import _deprecate_positional_args -def plot_partial_dependence(estimator, X, features, feature_names=None, +@_deprecate_positional_args +def plot_partial_dependence(estimator, X, features, *, feature_names=None, target=None, response_method='auto', n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=None, verbose=0, fig=None, @@ -322,8 +324,12 @@ def convert_feature(fx): fig.clear() ax = fig.gca() - display = PartialDependenceDisplay(pd_results, features, feature_names, - target_idx, pdp_lim, deciles) + display = PartialDependenceDisplay(pd_results=pd_results, + features=features, + feature_names=feature_names, + target_idx=target_idx, + pdp_lim=pdp_lim, + deciles=deciles) return display.plot(ax=ax, n_cols=n_cols, line_kw=line_kw, contour_kw=contour_kw) @@ -406,7 +412,8 @@ class PartialDependenceDisplay: Figure containing partial dependence plots. 
""" - def __init__(self, pd_results, features, feature_names, target_idx, + @_deprecate_positional_args + def __init__(self, pd_results, *, features, feature_names, target_idx, pdp_lim, deciles): self.pd_results = pd_results self.features = features diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d08c706caefc4..f11501960b29c 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -10,6 +10,7 @@ from .metrics.pairwise import pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel from .utils.validation import check_is_fitted, _check_sample_weight +from .utils.validation import _deprecate_positional_args class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): @@ -113,8 +114,9 @@ class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): >>> clf.fit(X, y) KernelRidge(alpha=1.0) """ - def __init__(self, alpha=1, kernel="linear", gamma=None, degree=3, coef0=1, - kernel_params=None): + @_deprecate_positional_args + def __init__(self, alpha=1, *, kernel="linear", gamma=None, degree=3, + coef0=1, kernel_params=None): self.alpha = alpha self.kernel = kernel self.gamma = gamma diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 8e91767b9ff53..56e5e24761128 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ MultiOutputMixin) from ..utils import check_array from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..utils import check_random_state from ..utils.extmath import safe_sparse_dot from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale @@ -466,8 +467,8 @@ class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel): >>> reg.predict(np.array([[3, 5]])) array([16.]) """ - - def __init__(self, fit_intercept=True, normalize=False, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, normalize=False, copy_X=True, n_jobs=None): self.fit_intercept = fit_intercept self.normalize = normalize diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 397461e73d8be..eb12a271695cd 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -14,6 +14,7 @@ from ..utils.extmath import fast_logdet from ..utils.fixes import pinvh from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -145,8 +146,8 @@ class BayesianRidge(RegressorMixin, LinearModel): M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine, Journal of Machine Learning Research, Vol. 1, 2001. """ - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None, lambda_init=None, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False): @@ -489,8 +490,8 @@ class ARDRegression(RegressorMixin, LinearModel): which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are discarded. 
""" - - def __init__(self, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, + @_deprecate_positional_args + def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6, lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False, threshold_lambda=1.e+4, fit_intercept=True, normalize=False, copy_X=True, verbose=False): diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 46e924abbc1d0..5d932033dbd0d 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -24,6 +24,7 @@ from ..utils.fixes import _astype_copy_false, _joblib_parallel_args from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from . import _cd_fast as cd_fast # type: ignore @@ -690,7 +691,8 @@ class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1003,7 +1005,8 @@ class Lasso(ElasticNet): """ path = staticmethod(enet_path) - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='cyclic'): @@ -1466,7 +1469,9 @@ class LassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection='cyclic'): @@ -1662,7 +1667,8 @@ class ElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, precompute='auto', max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, @@ -1801,7 +1807,8 @@ class MultiTaskElasticNet(Lasso): To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. """ - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.l1_ratio = l1_ratio @@ -1983,7 +1990,8 @@ class MultiTaskLasso(MultiTaskElasticNet): To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. 
""" - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection='cyclic'): self.alpha = alpha @@ -2162,7 +2170,8 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): """ path = staticmethod(enet_path) - def __init__(self, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, + @_deprecate_positional_args + def __init__(self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, @@ -2333,7 +2342,9 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV): """ path = staticmethod(lasso_path) - def __init__(self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, eps=1e-3, n_alphas=100, alphas=None, + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection='cyclic'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index d9046d3a1ee9b..77e6ff944b78d 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -9,6 +9,7 @@ from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot from ..utils.optimize import _check_optimize_result @@ -222,8 +223,8 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator): .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression. 
https://statweb.stanford.edu/~owen/reports/hhu.pdf """ - - def __init__(self, epsilon=1.35, max_iter=100, alpha=0.0001, + @_deprecate_positional_args + def __init__(self, *, epsilon=1.35, max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, tol=1e-05): self.epsilon = epsilon self.max_iter = max_iter diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index bc71d7a1fccbd..255baacea9a59 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -24,11 +24,13 @@ from ..utils import check_random_state from ..model_selection import check_cv from ..exceptions import ConvergenceWarning +from ..utils.validation import _deprecate_positional_args SOLVE_TRIANGULAR_ARGS = {'check_finite': False} -def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path(X, y, Xy=None, *, Gram=None, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -157,7 +159,8 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alpha_min=0, return_n_iter=return_n_iter, positive=positive) -def lars_path_gram(Xy, Gram, n_samples, max_iter=500, alpha_min=0, +@_deprecate_positional_args +def lars_path_gram(Xy, Gram, *, n_samples, max_iter=500, alpha_min=0, method='lar', copy_X=True, eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0, return_path=True, return_n_iter=False, positive=False): @@ -855,7 +858,8 @@ class Lars(MultiOutputMixin, RegressorMixin, LinearModel): method = 'lar' positive = False - def __init__(self, fit_intercept=True, verbose=False, normalize=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', n_nonzero_coefs=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, jitter=None, random_state=None): @@ -1110,7 +1114,8 @@ class LassoLars(Lars): """ method = 'lasso' - def __init__(self, alpha=1.0, fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, positive=False, jitter=None, random_state=None): @@ -1367,7 +1372,8 @@ class LarsCV(Lars): method = 'lar' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True): @@ -1608,7 +1614,8 @@ class LassoLarsCV(LarsCV): method = 'lasso' - def __init__(self, fit_intercept=True, verbose=False, max_iter=500, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500, normalize=True, precompute='auto', cv=None, max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, copy_X=True, positive=False): @@ -1741,7 +1748,8 @@ class LassoLarsIC(LassoLars): -------- lars_path, LassoLars, LassoLarsCV """ - def __init__(self, criterion='aic', fit_intercept=True, verbose=False, + @_deprecate_positional_args + def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=np.finfo(np.float).eps, copy_X=True, positive=False): self.criterion = criterion diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 
874dc743f4cc2..9ef3a21e4a76d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -29,6 +29,7 @@ from ..utils.extmath import row_norms from ..utils.optimize import _newton_cg, _check_optimize_result from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..model_selection import check_cv @@ -1246,8 +1247,8 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - - def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, @@ -1737,7 +1738,8 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator, LogisticRegression """ - def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, + @_deprecate_positional_args + def __init__(self, *, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=1e-4, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1., multi_class='auto', diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 0d572dd17c6d7..44371e9fa76e7 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -16,6 +16,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin from ..utils import as_float_array, check_array +from ..utils.validation import _deprecate_positional_args from ..model_selection import check_cv premature = """ Orthogonal matching pursuit ended prematurely due to linear @@ -262,7 +263,8 @@ def _gram_omp(Gram, Xy, n_nonzero_coefs, tol_0=None, tol=None, return gamma, indices[:n_active], n_active -def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, +@_deprecate_positional_args +def orthogonal_mp(X, y, *, n_nonzero_coefs=None, tol=None, precompute=False, copy_X=True, return_path=False, return_n_iter=False): r"""Orthogonal Matching Pursuit (OMP) @@ -371,7 +373,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, norms_squared = np.sum((y ** 2), axis=0) else: norms_squared = None - return orthogonal_mp_gram(G, Xy, n_nonzero_coefs, tol, norms_squared, + return orthogonal_mp_gram(G, Xy, n_nonzero_coefs=n_nonzero_coefs, + tol=tol, norms_squared=norms_squared, copy_Gram=copy_X, copy_Xy=False, return_path=return_path) @@ -404,7 +407,8 @@ def orthogonal_mp(X, y, n_nonzero_coefs=None, tol=None, precompute=False, return np.squeeze(coef) -def orthogonal_mp_gram(Gram, Xy, n_nonzero_coefs=None, tol=None, +@_deprecate_positional_args +def orthogonal_mp_gram(Gram, Xy, *, n_nonzero_coefs=None, tol=None, norms_squared=None, copy_Gram=True, copy_Xy=True, return_path=False, return_n_iter=False): @@ -616,7 +620,8 @@ class OrthogonalMatchingPursuit(MultiOutputMixin, RegressorMixin, LinearModel): decomposition.sparse_encode OrthogonalMatchingPursuitCV """ - def __init__(self, n_nonzero_coefs=None, tol=None, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, n_nonzero_coefs=None, tol=None, fit_intercept=True, normalize=True, precompute='auto'): self.n_nonzero_coefs = n_nonzero_coefs self.tol = tol @@ -660,7 +665,7 @@ def 
fit(self, X, y): if Gram is False: coef_, self.n_iter_ = orthogonal_mp( - X, y, self.n_nonzero_coefs_, self.tol, + X, y, n_nonzero_coefs=self.n_nonzero_coefs_, tol=self.tol, precompute=False, copy_X=True, return_n_iter=True) else: @@ -853,7 +858,8 @@ class OrthogonalMatchingPursuitCV(RegressorMixin, LinearModel): decomposition.sparse_encode """ - def __init__(self, copy=True, fit_intercept=True, normalize=True, + @_deprecate_positional_args + def __init__(self, *, copy=True, fit_intercept=True, normalize=True, max_iter=None, cv=None, n_jobs=None, verbose=False): self.copy = copy self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 3b8354f5a7352..22c47fb1fcf07 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -1,6 +1,7 @@ # Authors: Rob Zinkov, Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON @@ -163,7 +164,8 @@ class PassiveAggressiveClassifier(BaseSGDClassifier): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="hinge", n_jobs=None, random_state=None, warm_start=False, @@ -390,7 +392,8 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006) """ - def __init__(self, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, C=1.0, fit_intercept=True, max_iter=1000, tol=1e-3, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, shuffle=True, verbose=0, loss="epsilon_insensitive", epsilon=DEFAULT_EPSILON, diff --git a/sklearn/linear_model/_perceptron.py b/sklearn/linear_model/_perceptron.py index ff50f6ebbc06e..54d7888109702 100644 --- a/sklearn/linear_model/_perceptron.py +++ b/sklearn/linear_model/_perceptron.py @@ -1,6 +1,7 @@ # Author: Mathieu Blondel # License: BSD 3 clause +from ..utils.validation import _deprecate_positional_args from ._stochastic_gradient import BaseSGDClassifier @@ -143,8 +144,8 @@ class Perceptron(BaseSGDClassifier): https://en.wikipedia.org/wiki/Perceptron and references therein. 
""" - - def __init__(self, penalty=None, alpha=0.0001, fit_intercept=True, + @_deprecate_positional_args + def __init__(self, *, penalty=None, alpha=0.0001, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index cd5e3db49842d..86ceb0d5e311f 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -12,6 +12,7 @@ from ..utils import check_random_state, check_array, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ._base import LinearRegression from ..utils.validation import has_fit_parameter from ..exceptions import ConvergenceWarning @@ -201,8 +202,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, .. [2] https://www.sri.com/sites/default/files/publications/ransac-publication.pdf .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf """ - - def __init__(self, base_estimator=None, min_samples=None, + @_deprecate_positional_args + def __init__(self, base_estimator=None, *, min_samples=None, residual_threshold=None, is_data_valid=None, is_model_valid=None, max_trials=100, max_skips=np.inf, stop_n_inliers=np.inf, stop_score=np.inf, diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 9c3f703ac478e..309137bed2b5d 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -27,6 +27,7 @@ from ..utils import compute_sample_weight from ..utils import column_or_1d from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..metrics import check_scoring @@ -234,7 +235,8 @@ def _get_valid_accept_sparse(is_X_sparse, solver): return ['csr', 'csc', 'coo'] -def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', +@_deprecate_positional_args +def ridge_regression(X, y, alpha, *, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False, check_input=True): @@ -518,7 +520,8 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto', class _BaseRidge(LinearModel, metaclass=ABCMeta): @abstractmethod - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): self.alpha = alpha @@ -727,8 +730,8 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge): >>> clf.fit(X, y) Ridge() """ - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): super().__init__( @@ -885,8 +888,8 @@ class RidgeClassifier(LinearClassifierMixin, _BaseRidge): >>> clf.score(X, y) 0.9595... 
""" - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + @_deprecate_positional_args + def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", random_state=None): super().__init__( @@ -1112,8 +1115,8 @@ class _RidgeGCV(LinearModel): http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False, @@ -1546,7 +1549,8 @@ def fit(self, X, y, sample_weight=None): class _BaseRidgeCV(LinearModel): - def __init__(self, alphas=(0.1, 1.0, 10.0), + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False): @@ -1854,8 +1858,8 @@ class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ - - def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, + @_deprecate_positional_args + def __init__(self, alphas=(0.1, 1.0, 10.0), *, fit_intercept=True, normalize=False, scoring=None, cv=None, class_weight=None, store_cv_values=False): super().__init__( diff --git a/sklearn/linear_model/_sag.py b/sklearn/linear_model/_sag.py index 9fe6f076f5145..caa9b2d133003 100644 --- a/sklearn/linear_model/_sag.py +++ b/sklearn/linear_model/_sag.py @@ -13,6 +13,7 @@ from ..exceptions import ConvergenceWarning from ..utils import check_array from ..utils.validation import _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms @@ -84,6 +85,7 @@ def get_auto_step_size(max_squared_sum, alpha_scaled, loss, fit_intercept, return step +@_deprecate_positional_args def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0., max_iter=1000, tol=0.001, verbose=0, random_state=None, check_input=True, max_squared_sum=None, diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index bf1e77e3e355b..3bedd8a26674b 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -19,6 +19,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import _check_partial_fit_first_call from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning from ..model_selection import StratifiedShuffleSplit, ShuffleSplit @@ -68,8 +69,8 @@ def __call__(self, coef, intercept): class BaseSGD(SparseCoefMixin, BaseEstimator, metaclass=ABCMeta): """Base class for SGD classification and regression.""" - - def __init__(self, loss, penalty='l2', alpha=0.0001, C=1.0, + @_deprecate_positional_args + def __init__(self, loss, *, penalty='l2', alpha=0.0001, C=1.0, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=0.1, random_state=None, learning_rate="optimal", eta0=0.0, power_t=0.5, @@ -461,7 +462,8 @@ class BaseSGDClassifier(LinearClassifierMixin, BaseSGD, metaclass=ABCMeta): } @abstractmethod - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, + @_deprecate_positional_args + def 
__init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -950,8 +952,9 @@ class SGDClassifier(BaseSGDClassifier): >>> print(clf.predict([[-0.8, -1]])) [1] """ - - def __init__(self, loss="hinge", penalty='l2', alpha=0.0001, l1_ratio=0.15, + @_deprecate_positional_args + def __init__(self, loss="hinge", *, penalty='l2', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, n_jobs=None, random_state=None, learning_rate="optimal", eta0=0.0, @@ -1097,7 +1100,8 @@ class BaseSGDRegressor(RegressorMixin, BaseSGD): } @abstractmethod - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, @@ -1543,7 +1547,8 @@ class SGDRegressor(BaseSGDRegressor): Ridge, ElasticNet, Lasso, sklearn.svm.SVR """ - def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, + @_deprecate_positional_args + def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=1e-3, shuffle=True, verbose=0, epsilon=DEFAULT_EPSILON, random_state=None, learning_rate="invscaling", eta0=0.01, diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 16f0adae12c9c..28d2dba3f8719 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -20,6 +20,7 @@ from ._base import LinearModel from ..base import RegressorMixin from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning _EPSILON = np.finfo(np.double).eps @@ -290,8 +291,8 @@ class TheilSenRegressor(RegressorMixin, LinearModel): Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang http://home.olemiss.edu/~xdang/papers/MTSE.pdf """ - - def __init__(self, fit_intercept=True, copy_X=True, + @_deprecate_positional_args + def __init__(self, *, fit_intercept=True, copy_X=True, max_subpopulation=1e4, n_subsamples=None, max_iter=300, tol=1.e-3, random_state=None, n_jobs=None, verbose=False): self.fit_intercept = fit_intercept diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index e742689bcde3d..791983ba62cc2 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -94,8 +94,8 @@ def test_bad_input(): def test_perfect_signal_recovery(): idx, = gamma[:, 0].nonzero() - gamma_rec = orthogonal_mp(X, y[:, 0], 5) - gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], 5) + gamma_rec = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5) + gamma_gram = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5) assert_array_equal(idx, np.flatnonzero(gamma_rec)) assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_rec, decimal=2) @@ -110,7 +110,8 @@ def test_orthogonal_mp_gram_readonly(): G_readonly.setflags(write=False) Xy_readonly = Xy.copy() Xy_readonly.setflags(write=False) - gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], 5, + gamma_gram = orthogonal_mp_gram(G_readonly, Xy_readonly[:, 0], + n_nonzero_coefs=5, copy_Gram=False, 
copy_Xy=False) assert_array_equal(idx, np.flatnonzero(gamma_gram)) assert_array_almost_equal(gamma[:, 0], gamma_gram, decimal=2) @@ -163,8 +164,8 @@ def test_swapped_regressors(): gamma[0] = 0.5 new_y = np.dot(X, gamma) new_Xy = np.dot(X.T, new_y) - gamma_hat = orthogonal_mp(X, new_y, 2) - gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, 2) + gamma_hat = orthogonal_mp(X, new_y, n_nonzero_coefs=2) + gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, n_nonzero_coefs=2) assert_array_equal(np.flatnonzero(gamma_hat), [0, 21]) assert_array_equal(np.flatnonzero(gamma_hat_gram), [0, 21]) @@ -172,8 +173,10 @@ def test_swapped_regressors(): def test_no_atoms(): y_empty = np.zeros_like(y) Xy_empty = np.dot(X.T, y_empty) - gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, 1) - gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, 1) + gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, + n_nonzero_coefs=1) + gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, + n_nonzero_coefs=1) assert np.all(gamma_empty == 0) assert np.all(gamma_empty_gram == 0) diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 1f7d3c2569bab..3710f3857a2a7 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -288,7 +288,9 @@ def test_ransac_none_estimator(): ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, random_state=0) - ransac_none_estimator = RANSACRegressor(None, 2, 5, random_state=0) + ransac_none_estimator = RANSACRegressor(None, min_samples=2, + residual_threshold=5, + random_state=0) ransac_estimator.fit(X, y) ransac_none_estimator.fit(X, y) From b4757f7fe5b8d238ebb4cb150aeba52306c12071 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 19 Apr 2020 17:28:07 -0400 Subject: [PATCH 025/125] ENH add vlines_ attribute to PDP Display to hide deciles (#15785) --- doc/whats_new/v0.23.rst | 7 +++ .../inspection/_plot/partial_dependence.py | 43 +++++++++++++------ .../tests/test_plot_partial_dependence.py | 19 ++++++++ 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 9343f1ee46da9..b672c1c156c97 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -373,6 +373,13 @@ Changelog :class:`neural_network.MLPClassifier` by clipping the probabilities. :pr:`16117` by `Thomas Fan`_. +:mod:`sklearn.inspection` +......................... + +- |Enhancement| :class:`inspection.PartialDependenceDisplay` now exposes the + deciles lines as attributes so they can be hidden or customized. :pr:`15785` + by `Nicolas Hug`_ + :mod:`sklearn.preprocessing` ............................ diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 4a83ac057d91e..812005f5ab2ae 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -391,21 +391,36 @@ class PartialDependenceDisplay: axes_ : ndarray of matplotlib Axes If `ax` is an axes or None, `axes_[i, j]` is the axes on the i-th row and j-th column. If `ax` is a list of axes, `axes_[i]` is the i-th item - in `ax`. Elements that are None corresponds to a nonexisting axes in + in `ax`. Elements that are None correspond to a nonexisting axes in that position. 
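With the new display attributes described below, the decile markers can be switched off or
restyled after plotting. A minimal sketch (the estimator, dataset and styling choice are
placeholders, not part of the patch)::

    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.inspection import plot_partial_dependence

    X, y = load_diabetes(return_X_y=True)
    est = GradientBoostingRegressor().fit(X, y)
    disp = plot_partial_dependence(est, X, [0, 2, (0, 2)])

    # hide the x-axis decile ticks on every panel
    for vlines in disp.deciles_vlines_.ravel():
        if vlines is not None:
            vlines.set_visible(False)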
lines_ : ndarray of matplotlib Artists - If `ax` is an axes or None, `line_[i, j]` is the partial dependence + If `ax` is an axes or None, `lines_[i, j]` is the partial dependence curve on the i-th row and j-th column. If `ax` is a list of axes, `lines_[i]` is the partial dependence curve corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a line plot. + deciles_vlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the x axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a PDP plot. + .. versionadded:: 0.23 + deciles_hlines_ : ndarray of matplotlib LineCollection + If `ax` is an axes or None, `vlines_[i, j]` is the line collection + representing the y axis deciles of the i-th row and j-th column. If + `ax` is a list of axes, `vlines_[i]` corresponds to the i-th item in + `ax`. Elements that are None correspond to a nonexisting axes or an + axes that does not include a 2-way plot. + .. versionadded:: 0.23 + contours_ : ndarray of matplotlib Artists If `ax` is an axes or None, `contours_[i, j]` is the partial dependence plot on the i-th row and j-th column. If `ax` is a list of axes, `contours_[i]` is the partial dependence plot corresponding to the i-th - item in `ax`. Elements that are None corresponds to a nonexisting axes + item in `ax`. Elements that are None correspond to a nonexisting axes or an axes that does not include a contour plot. figure_ : matplotlib Figure @@ -490,8 +505,6 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): n_rows = int(np.ceil(n_features / float(n_cols))) self.axes_ = np.empty((n_rows, n_cols), dtype=np.object) - self.lines_ = np.empty((n_rows, n_cols), dtype=np.object) - self.contours_ = np.empty((n_rows, n_cols), dtype=np.object) axes_ravel = self.axes_.ravel() @@ -514,14 +527,20 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): self.bounding_ax_ = None self.figure_ = ax.ravel()[0].figure self.axes_ = ax - self.lines_ = np.empty_like(ax, dtype=np.object) - self.contours_ = np.empty_like(ax, dtype=np.object) # create contour levels for two-way plots if 2 in self.pdp_lim: Z_level = np.linspace(*self.pdp_lim[2], num=8) + + self.lines_ = np.empty_like(self.axes_, dtype=np.object) + self.contours_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_vlines_ = np.empty_like(self.axes_, dtype=np.object) + self.deciles_hlines_ = np.empty_like(self.axes_, dtype=np.object) + # Create 1d views of these 2d arrays for easy indexing lines_ravel = self.lines_.ravel(order='C') contours_ravel = self.contours_.ravel(order='C') + vlines_ravel = self.deciles_vlines_.ravel(order='C') + hlines_ravel = self.deciles_hlines_.ravel(order='C') for i, axi, fx, (avg_preds, values) in zip(count(), self.axes_.ravel(), @@ -547,8 +566,8 @@ def plot(self, ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transData, axi.transAxes) ylim = axi.get_ylim() - axi.vlines(self.deciles[fx[0]], 0, 0.05, transform=trans, - color='k') + vlines_ravel[i] = axi.vlines(self.deciles[fx[0]], 0, 0.05, + transform=trans, color='k') axi.set_ylim(ylim) # Set xlabel if it is not already set @@ -566,8 +585,8 @@ def plot(self, 
ax=None, n_cols=3, line_kw=None, contour_kw=None): trans = transforms.blended_transform_factory(axi.transAxes, axi.transData) xlim = axi.get_xlim() - axi.hlines(self.deciles[fx[1]], 0, 0.05, transform=trans, - color='k') + hlines_ravel[i] = axi.hlines(self.deciles[fx[1]], 0, 0.05, + transform=trans, color='k') # hline erases xlim axi.set_ylabel(self.feature_names[fx[1]]) axi.set_xlim(xlim) diff --git a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py index abae91d4d2642..41da3f08c9094 100644 --- a/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn/inspection/_plot/tests/test_plot_partial_dependence.py @@ -51,11 +51,20 @@ def test_plot_partial_dependence(grid_resolution, pyplot, clf_boston, boston): assert disp.axes_.shape == (1, 3) assert disp.lines_.shape == (1, 3) assert disp.contours_.shape == (1, 3) + assert disp.deciles_vlines_.shape == (1, 3) + assert disp.deciles_hlines_.shape == (1, 3) assert disp.lines_[0, 2] is None assert disp.contours_[0, 0] is None assert disp.contours_[0, 1] is None + # deciles lines: always show on xaxis, only show on yaxis if 2-way PDP + for i in range(3): + assert disp.deciles_vlines_[0, i] is not None + assert disp.deciles_hlines_[0, 0] is None + assert disp.deciles_hlines_[0, 1] is None + assert disp.deciles_hlines_[0, 2] is not None + assert disp.features == [(0, ), (1, ), (0, 1)] assert np.all(disp.feature_names == feature_names) assert len(disp.deciles) == 2 @@ -132,9 +141,15 @@ def test_plot_partial_dependence_str_features(pyplot, clf_boston, boston, assert disp.axes_.shape == (2, 1) assert disp.lines_.shape == (2, 1) assert disp.contours_.shape == (2, 1) + assert disp.deciles_vlines_.shape == (2, 1) + assert disp.deciles_hlines_.shape == (2, 1) assert disp.lines_[0, 0] is None + assert disp.deciles_vlines_[0, 0] is not None + assert disp.deciles_hlines_[0, 0] is not None assert disp.contours_[1, 0] is None + assert disp.deciles_hlines_[1, 0] is None + assert disp.deciles_vlines_[1, 0] is not None # line ax = disp.axes_[1, 0] @@ -309,6 +324,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_target_0.axes_.shape == (1, 2) assert disp_target_0.lines_.shape == (1, 2) assert disp_target_0.contours_.shape == (1, 2) + assert disp_target_0.deciles_vlines_.shape == (1, 2) + assert disp_target_0.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_target_0.contours_.flat) assert disp_target_0.target_idx == 0 @@ -323,6 +340,8 @@ def test_plot_partial_dependence_multiclass(pyplot): assert disp_symbol.axes_.shape == (1, 2) assert disp_symbol.lines_.shape == (1, 2) assert disp_symbol.contours_.shape == (1, 2) + assert disp_symbol.deciles_vlines_.shape == (1, 2) + assert disp_symbol.deciles_hlines_.shape == (1, 2) assert all(c is None for c in disp_symbol.contours_.flat) assert disp_symbol.target_idx == 0 From 6973096a0f0a2d1eb6c5d3cadd55a89276368311 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 20 Apr 2020 11:31:18 -0400 Subject: [PATCH 026/125] DOC details on the use of xfail_checks (#16968) --- doc/developers/develop.rst | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index d8ae6dd224840..cc4e8f6678c01 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -518,9 +518,16 @@ _skip_test (default=False) *very good* reason. 
_xfail_checks (default=False) - dictionary ``{check_name : reason}`` of common checks to mark as a - known failure, with the associated reason. Don't use this unless you have a - *very good* reason. + dictionary ``{check_name: reason}`` of common checks that will be marked + as `XFAIL` for pytest, when using + :func:`~sklearn.utils.estimator_checks.parametrize_with_checks`. This tag + currently has no effect on + :func:`~sklearn.utils.estimator_checks.check_estimator`. + Don't use this unless there is a *very good* reason for your estimator + not to pass the check. + Also note that the usage of this tag is highly subject to change because + we are trying to make it more flexible: be prepared for breaking changes + in the future. stateless (default=False) whether the estimator needs access to data for fitting. Even though an From 7e15285d2e0587af976848fbc44c95c706640839 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 20 Apr 2020 19:48:36 +0200 Subject: [PATCH 027/125] API make gaussian_process __init__ params kwarg (#16870) --- sklearn/gaussian_process/_gpc.py | 17 ++++++++++++----- sklearn/gaussian_process/_gpr.py | 4 +++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index e70838c6d251a..2c9c0ef483d4f 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -19,6 +19,7 @@ from ..utils.optimize import _check_optimize_result from ..preprocessing import LabelEncoder from ..multiclass import OneVsRestClassifier, OneVsOneClassifier +from ..utils.validation import _deprecate_positional_args # Values required for approximating the logistic sigmoid by @@ -144,7 +145,8 @@ def optimizer(obj_func, initial_theta, bounds): The log-marginal-likelihood of ``self.kernel_.theta`` """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None): self.kernel = kernel @@ -586,7 +588,8 @@ def optimizer(obj_func, initial_theta, bounds): .. 
versionadded:: 0.18 """ - def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b", + @_deprecate_positional_args + def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, max_iter_predict=100, warm_start=False, copy_X_train=True, random_state=None, multi_class="one_vs_rest", n_jobs=None): @@ -623,9 +626,13 @@ def fit(self, X, y): ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( - self.kernel, self.optimizer, self.n_restarts_optimizer, - self.max_iter_predict, self.warm_start, self.copy_X_train, - self.random_state) + kernel=self.kernel, + optimizer=self.optimizer, + n_restarts_optimizer=self.n_restarts_optimizer, + max_iter_predict=self.max_iter_predict, + warm_start=self.warm_start, + copy_X_train=self.copy_X_train, + random_state=self.random_state) self.classes_ = np.unique(y) self.n_classes_ = self.classes_.size diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index caf94ce41c1b4..0ba594a7ffaac 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -17,6 +17,7 @@ from ..utils import check_random_state from ..utils.validation import check_array from ..utils.optimize import _check_optimize_result +from ..utils.validation import _deprecate_positional_args class GaussianProcessRegressor(MultiOutputMixin, @@ -149,7 +150,8 @@ def optimizer(obj_func, initial_theta, bounds): (array([653.0..., 592.1...]), array([316.6..., 316.6...])) """ - def __init__(self, kernel=None, alpha=1e-10, + @_deprecate_positional_args + def __init__(self, kernel=None, *, alpha=1e-10, optimizer="fmin_l_bfgs_b", n_restarts_optimizer=0, normalize_y=False, copy_X_train=True, random_state=None): self.kernel = kernel From c8e055883b5fcc3f3ac8850eefd7b3c5e7f7ff2d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Mon, 20 Apr 2020 21:44:38 +0200 Subject: [PATCH 028/125] DOC Add link, fix wording of KNeighborsRegressor (#16969) --- sklearn/neighbors/_regression.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 00d8f10c8880d..a00d83c98102b 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -77,10 +77,10 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
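The "precomputed" option mentioned above expects a square distance matrix at fit time and
test-to-train distances at predict time. A small sketch with synthetic data (the shapes
are the point here; none of it comes from the patch)::

    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.neighbors import KNeighborsRegressor

    rng = np.random.RandomState(0)
    X_train, y_train = rng.rand(20, 3), rng.rand(20)
    X_test = rng.rand(5, 3)

    D_train = pairwise_distances(X_train)          # (20, 20), square, used in fit
    D_test = pairwise_distances(X_test, X_train)   # (5, 20), queries vs. training set

    reg = KNeighborsRegressor(n_neighbors=3, metric="precomputed")
    reg.fit(D_train, y_train)
    pred = reg.predict(D_test)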
metric_params : dict, default=None From 7736062aaeb301b367500b6c387b1806dd7ff1eb Mon Sep 17 00:00:00 2001 From: smarie Date: Tue, 21 Apr 2020 14:06:07 +0200 Subject: [PATCH 029/125] DOC Libsvm liblinear rand fix - minor doc and header fixes (#16979) --- doc/whats_new/v0.23.rst | 6 +++--- sklearn/svm/src/liblinear/linear.cpp | 2 +- sklearn/svm/src/libsvm/svm.cpp | 2 +- sklearn/svm/src/newrand/newrand.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b672c1c156c97..b5cb4b7012182 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -407,14 +407,14 @@ Changelog generators used to randomly select coordinates in the coordinate descent algorithms. Platform-dependent C ``rand()`` was used, which is only able to generate numbers up to ``32767`` on windows platform (see this `blog - post `) and also has poor + post `_) and also has poor randomization power as suggested by `this presentation - `. + `_. It was replaced with C++11 ``mt19937``, a Mersenne Twister that correctly generates 31bits/63bits random numbers on all platforms. In addition, the crude "modulo" postprocessor used to get a random number in a bounded interval was replaced by the tweaked Lemire method as suggested by `this blog - post `. + post `_. Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, diff --git a/sklearn/svm/src/liblinear/linear.cpp b/sklearn/svm/src/liblinear/linear.cpp index cc603b435f655..29a5581c280dc 100644 --- a/sklearn/svm/src/liblinear/linear.cpp +++ b/sklearn/svm/src/liblinear/linear.cpp @@ -26,7 +26,7 @@ Modified 2020: - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie + Sylvain Marie, Schneider Electric See */ diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index c9a5df10c4924..a5f735d8111cf 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -52,7 +52,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Improved random number generator by using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets. - Sylvain Marie, + Sylvain Marie, Schneider Electric see */ diff --git a/sklearn/svm/src/newrand/newrand.h b/sklearn/svm/src/newrand/newrand.h index b46861b71e765..7cd7b4c9fbf2b 100644 --- a/sklearn/svm/src/newrand/newrand.h +++ b/sklearn/svm/src/newrand/newrand.h @@ -3,7 +3,7 @@ - New random number generator using a mersenne twister + tweaked lemire postprocessor. This fixed a convergence issue on windows targets for libsvm and liblinear. - Sylvain Marie + Sylvain Marie, Schneider Electric See */ From c531bd0d865702af7b033b872b6214aded1a710b Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 21 Apr 2020 10:21:48 -0400 Subject: [PATCH 030/125] API Adds missing keyword only argument to PCA (#16975) --- sklearn/decomposition/_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 87092d7ccd17e..14a993c56dce8 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -320,7 +320,7 @@ class PCA(_BasePCA): [6.30061...] 
""" @_deprecate_positional_args - def __init__(self, n_components=None, copy=True, whiten=False, + def __init__(self, n_components=None, *, copy=True, whiten=False, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None): self.n_components = n_components From f82a2cb33871a67b36150647ece1c7e56d3132bb Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Tue, 21 Apr 2020 19:48:00 +0200 Subject: [PATCH 031/125] replace boston (#16922) --- sklearn/ensemble/tests/test_voting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 4eb47bea0a514..f81b9e59a5f1b 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -33,7 +33,7 @@ iris = datasets.load_iris() X, y = iris.data[:, 1:3], iris.target -X_r, y_r = datasets.load_boston(return_X_y=True) +X_r, y_r = datasets.load_diabetes(return_X_y=True) @pytest.mark.parametrize( @@ -120,7 +120,7 @@ def test_weights_iris(): def test_weights_regressor(): - """Check weighted average regression prediction on boston dataset.""" + """Check weighted average regression prediction on diabetes dataset.""" reg1 = DummyRegressor(strategy='mean') reg2 = DummyRegressor(strategy='median') reg3 = DummyRegressor(strategy='quantile', quantile=.2) From 089c8a17a166b039691bfee45d14577e14292a41 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 22 Apr 2020 08:41:45 -0400 Subject: [PATCH 032/125] [MRG] MNT requires_y tag with y=None validation (#16622) --- doc/developers/develop.rst | 5 ++ doc/whats_new/v0.23.rst | 8 +- sklearn/base.py | 36 +++++++- sklearn/covariance/_empirical_covariance.py | 1 + sklearn/cross_decomposition/_pls.py | 3 +- sklearn/ensemble/_gb.py | 6 +- sklearn/feature_selection/_rfe.py | 4 +- .../_univariate_selection.py | 3 + sklearn/linear_model/_base.py | 3 + sklearn/linear_model/_coordinate_descent.py | 89 ++++++++++++------- sklearn/linear_model/_ransac.py | 10 ++- sklearn/neighbors/_base.py | 14 ++- sklearn/neighbors/_graph.py | 8 +- sklearn/neighbors/_lof.py | 4 +- sklearn/neighbors/_nca.py | 3 + sklearn/neighbors/_unsupervised.py | 4 +- sklearn/tree/_classes.py | 10 ++- sklearn/utils/estimator_checks.py | 34 +++++++ 18 files changed, 185 insertions(+), 60 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index cc4e8f6678c01..f17c58cee0d7f 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -510,6 +510,11 @@ requires_fit (default=True) requires_positive_X (default=False) whether the estimator requires positive X. +requires_y (default=False) + whether the estimator requires y to be passed to `fit`, `fit_predict` or + `fit_transform` methods. The tag is True for estimators inheriting from + `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`. + requires_positive_y (default=False) whether the estimator requires a positive y (only applicable for regression). diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b5cb4b7012182..3b8af7ccf416d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -476,4 +476,10 @@ Miscellaneous attribute is equal to the number of features passed to the `fit` method. See `SLEP010 `_ - for details. :pr:`16112` by `Nicolas Hug`_. + for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_. + +- |API| Estimators now have a `requires_y` tags which is False by default + except for estimators that inherit from `~sklearn.base.RegressorMixin` or + `~sklearn.base.ClassifierMixin`. 
This tag is used to ensure that a proper + error message is raised when y was expected but None was passed. + :pr:`16622` by `Nicolas Hug`_. diff --git a/sklearn/base.py b/sklearn/base.py index 8a6041cc17982..c0328e00d84d0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -36,7 +36,9 @@ '_xfail_checks': False, 'multioutput_only': False, 'binary_only': False, - 'requires_fit': True} + 'requires_fit': True, + 'requires_y': False, + } def clone(estimator, safe=True): @@ -374,7 +376,8 @@ def _check_n_features(self, X, reset): self.n_features_in_) ) - def _validate_data(self, X, y=None, reset=True, **check_params): + def _validate_data(self, X, y=None, reset=True, + validate_separately=False, **check_params): """Validate input data and set or check the `n_features_in_` attribute. Parameters @@ -389,9 +392,14 @@ def _validate_data(self, X, y=None, reset=True, **check_params): Whether to reset the `n_features_in_` attribute. If False, the input will be checked for consistency with data provided when reset was last True. + validate_separately : False or tuple of dicts, default=False + Only used if y is not None. + If False, call validate_X_y(). Else, it must be a tuple of kwargs + to be used for calling check_array() on X and y respectively. **check_params : kwargs Parameters passed to :func:`sklearn.utils.check_array` or - :func:`sklearn.utils.check_X_y`. + :func:`sklearn.utils.check_X_y`. Ignored if validate_separately + is not False. Returns ------- @@ -400,10 +408,24 @@ def _validate_data(self, X, y=None, reset=True, **check_params): """ if y is None: + if self._get_tags()['requires_y']: + raise ValueError( + f"This {self.__class__.__name__} estimator " + f"requires y to be passed, but the target y is None." + ) X = check_array(X, **check_params) out = X else: - X, y = check_X_y(X, y, **check_params) + if validate_separately: + # We need this because some estimators validate X and y + # separately, and in general, separately calling check_array() + # on X and y isn't equivalent to just calling check_X_y() + # :( + check_X_params, check_y_params = validate_separately + X = check_array(X, **check_X_params) + y = check_array(y, **check_y_params) + else: + X, y = check_X_y(X, y, **check_params) out = X, y if check_params.get('ensure_2d', True): @@ -444,6 +466,9 @@ def score(self, X, y, sample_weight=None): from .metrics import accuracy_score return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class RegressorMixin: """Mixin class for all regression estimators in scikit-learn.""" @@ -494,6 +519,9 @@ def score(self, X, y, sample_weight=None): y_pred = self.predict(X) return r2_score(y, y_pred, sample_weight=sample_weight) + def _more_tags(self): + return {'requires_y': True} + class ClusterMixin: """Mixin class for all cluster estimators in scikit-learn.""" diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index c83dbc89697e1..684a57fdeb296 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -79,6 +79,7 @@ def empirical_covariance(X, assume_centered=False): [0.25, 0.25, 0.25]]) """ X = np.asarray(X) + if X.ndim == 1: X = np.reshape(X, (1, -1)) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 508448c3ede39..b6912f81105a8 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -519,7 +519,8 @@ def fit_transform(self, X, y=None): return 
self.fit(X, y).transform(X, y) def _more_tags(self): - return {'poor_score': True} + return {'poor_score': True, + 'requires_y': False} class PLSRegression(_PLS): diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 32e534fdc8517..439500c1917d8 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -405,15 +405,15 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. - X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=DTYPE) + + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE, multi_output=True) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None sample_weight = _check_sample_weight(sample_weight, X) - y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) y = column_or_1d(y, warn=True) y = self._validate_y(y, sample_weight) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7e7aada0d70b3..6d9bb8c463df6 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -340,7 +340,9 @@ def predict_log_proba(self, X): def _more_tags(self): estimator_tags = self.estimator._get_tags() return {'poor_score': True, - 'allow_nan': estimator_tags.get('allow_nan', True)} + 'allow_nan': estimator_tags.get('allow_nan', True), + 'requires_y': True, + } class RFECV(RFE): diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7ca0ce4a36715..21a2ddc10a1eb 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -363,6 +363,9 @@ def fit(self, X, y): def _check_params(self, X, y): pass + def _more_tags(self): + return {'requires_y': True} + ###################################################################### # Specific filters diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 56e5e24761128..c1f6b8233bdac 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -246,6 +246,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale): else: self.intercept_ = 0. + def _more_tags(self): + return {'requires_y': True} + # XXX Should this derive from LinearModel? It should be a mixin, not an ABC. # Maybe the n_features checking can be moved to LinearModel. diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 5d932033dbd0d..f4430c5bcac55 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1157,8 +1157,52 @@ def fit(self, X, y): y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values """ - y = check_array(y, copy=False, dtype=[np.float64, np.float32], - ensure_2d=False) + # This makes sure that there is no duplication in memory. 
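Taken together, the new ``requires_y`` tag and the check in ``_validate_data``
give estimators an explicit error message when the target is missing. A
minimal sketch (the ``MeanRegressor`` estimator is made up for illustration
and relies on the private ``_validate_data`` / ``_more_tags`` API shown
above)::

    import numpy as np
    from sklearn.base import BaseEstimator, RegressorMixin

    class MeanRegressor(RegressorMixin, BaseEstimator):
        # RegressorMixin._more_tags() now sets 'requires_y': True, so
        # _validate_data raises a clear error if fit is called without y.
        def fit(self, X, y=None):
            X, y = self._validate_data(X, y)
            self.mean_ = np.mean(y)
            return self

        def predict(self, X):
            return np.full(shape=(len(X),), fill_value=self.mean_)

    X = np.arange(6, dtype=float).reshape(3, 2)
    MeanRegressor().fit(X, [1.0, 2.0, 3.0])   # works
    # MeanRegressor().fit(X)  # ValueError: requires y to be passed, but y is None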
+ # Dealing right with copy_X is important in the following: + # Multiple functions touch X and subsamples of X and can induce a + # lot of duplication of memory + copy_X = self.copy_X and self.fit_intercept + + check_y_params = dict(copy=False, dtype=[np.float64, np.float32], + ensure_2d=False) + if isinstance(X, np.ndarray) or sparse.isspmatrix(X): + # Keep a reference to X + reference_to_old_X = X + # Let us not impose fortran ordering so far: it is + # not useful for the cross-validation loop and will be done + # by the model fitting itself + + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) + if sparse.isspmatrix(X): + if (hasattr(reference_to_old_X, "data") and + not np.may_share_memory(reference_to_old_X.data, X.data)): + # X is a sparse matrix and has been copied + copy_X = False + elif not np.may_share_memory(reference_to_old_X, X): + # X has been copied + copy_X = False + del reference_to_old_X + else: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. We also want to allow y to be 64 or 32 but check_X_y only + # allows to convert for 64. + check_X_params = dict(accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) + copy_X = False + if y.shape[0] == 0: raise ValueError("y has 0 samples: %r" % y) @@ -1191,35 +1235,6 @@ def fit(self, X, y): if self.selection not in ["random", "cyclic"]: raise ValueError("selection should be either random or cyclic.") - # This makes sure that there is no duplication in memory. - # Dealing right with copy_X is important in the following: - # Multiple functions touch X and subsamples of X and can induce a - # lot of duplication of memory - copy_X = self.copy_X and self.fit_intercept - - if isinstance(X, np.ndarray) or sparse.isspmatrix(X): - # Keep a reference to X - reference_to_old_X = X - # Let us not impose fortran ordering so far: it is - # not useful for the cross-validation loop and will be done - # by the model fitting itself - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], copy=False) - if sparse.isspmatrix(X): - if (hasattr(reference_to_old_X, "data") and - not np.may_share_memory(reference_to_old_X.data, X.data)): - # X is a sparse matrix and has been copied - copy_X = False - elif not np.may_share_memory(reference_to_old_X, X): - # X has been copied - copy_X = False - del reference_to_old_X - else: - X = self._validate_data(X, accept_sparse='csc', - dtype=[np.float64, np.float32], order='F', - copy=copy_X) - copy_X = False - if X.shape[0] != y.shape[0]: raise ValueError("X and y have inconsistent dimensions (%d != %d)" % (X.shape[0], y.shape[0])) @@ -1842,9 +1857,15 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) - y = check_array(y, dtype=X.dtype.type, ensure_2d=False) + + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. 
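The ``copy_X`` bookkeeping above relies on detecting whether validation has
already copied ``X``. A small standalone illustration of that detection with
``numpy.may_share_memory`` (the arrays are illustrative only)::

    import numpy as np
    from sklearn.utils import check_array

    X = np.arange(6, dtype=np.float64).reshape(3, 2)
    X_same = check_array(X, dtype=np.float64, copy=False)  # no conversion needed
    X_cast = check_array(X, dtype=np.float32, copy=False)  # dtype cast copies

    np.may_share_memory(X, X_same)  # True  -> a copy may still be needed later
    np.may_share_memory(X, X_cast)  # False -> already copied, copy_X can be False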
+ check_X_params = dict(dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) + check_y_params = dict(ensure_2d=False) + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) + y = y.astype(X.dtype) if hasattr(self, 'l1_ratio'): model_str = 'ElasticNet' diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 86ceb0d5e311f..5eac651c76383 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,7 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin -from ..utils import check_random_state, check_array, check_consistent_length +from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight from ..utils.validation import _deprecate_positional_args @@ -247,8 +247,12 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = self._validate_data(X, accept_sparse='csr') - y = check_array(y, ensure_2d=False) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be csr. + check_X_params = dict(accept_sparse='csr') + check_y_params = dict(ensure_2d=False) + X, y = self._validate_data(X, y, validate_separately=(check_X_params, + check_y_params)) check_consistent_length(X, y) if self.base_estimator is not None: diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 945959ef10d9c..a1eebdcf78648 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,7 +23,7 @@ from ..base import BaseEstimator, MultiOutputMixin from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..utils import check_X_y, check_array, gen_even_slices +from ..utils import check_array, gen_even_slices from ..utils import _to_object_array from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted @@ -1104,10 +1104,14 @@ def fit(self, X, y): or [n_samples, n_outputs] """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) self._y = y return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class SupervisedIntegerMixin: def fit(self, X, y): @@ -1124,7 +1128,8 @@ def fit(self, X, y): """ if not isinstance(X, (KDTree, BallTree)): - X, y = check_X_y(X, y, "csr", multi_output=True) + X, y = self._validate_data(X, y, accept_sparse="csr", + multi_output=True) if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1: if y.ndim != 1: @@ -1151,6 +1156,9 @@ def fit(self, X, y): return self._fit(X) + def _more_tags(self): + return {'requires_y': True} + class UnsupervisedMixin: def fit(self, X, y=None): diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 9fc4a6e830cde..d217999196950 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -192,8 +192,8 @@ def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', return X.radius_neighbors_graph(query, radius, mode) -class KNeighborsTransformer(NeighborsBase, KNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of k nearest 
neighbors The transformed data is a sparse graph as returned by kneighbors_graph. @@ -335,8 +335,8 @@ def fit_transform(self, X, y=None): return self.fit(X).transform(X) -class RadiusNeighborsTransformer(NeighborsBase, RadiusNeighborsMixin, - UnsupervisedMixin, TransformerMixin): +class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, + TransformerMixin, NeighborsBase): """Transform X into a (weighted) graph of neighbors nearer than a radius The transformed data is a sparse graph as returned by diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index e03c4d9cb1e0e..f3b141bf499e5 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -16,8 +16,8 @@ __all__ = ["LocalOutlierFactor"] -class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, - OutlierMixin): +class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, + OutlierMixin, NeighborsBase): """Unsupervised Outlier Detection using Local Outlier Factor (LOF) The anomaly score of each sample is called Local Outlier Factor. diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index cd87d594281da..1017f5cf12606 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -520,3 +520,6 @@ def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0): sys.stdout.flush() return sign * loss, sign * gradient.ravel() + + def _more_tags(self): + return {'requires_y': True} diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 20be4f636c2a4..6faafeee9ffcd 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -5,8 +5,8 @@ from ._base import UnsupervisedMixin -class NearestNeighbors(NeighborsBase, KNeighborsMixin, - RadiusNeighborsMixin, UnsupervisedMixin): +class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, + UnsupervisedMixin, NeighborsBase): """Unsupervised learner for implementing neighbor searches. Read more in the :ref:`User Guide `. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f252ba0acbb1c..81cd7bd8e8989 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -146,8 +146,14 @@ def fit(self, X, y, sample_weight=None, check_input=True, raise ValueError("ccp_alpha must be greater than or equal to 0") if check_input: - X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc") - y = check_array(y, ensure_2d=False, dtype=None) + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, + validate_separately=(check_X_params, + check_y_params)) if issparse(X): X.sort_indices() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 351f24b66283e..efac2aca2a2df 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -270,6 +270,8 @@ def _yield_all_checks(name, estimator): yield check_fit_idempotent if not tags["no_validation"]: yield check_n_features_in + if tags["requires_y"]: + yield check_requires_y_none if tags["requires_positive_X"]: yield check_fit_non_negative @@ -2976,3 +2978,35 @@ def check_n_features_in(name, estimator_orig): "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa FutureWarning ) + + +def check_requires_y_none(name, estimator_orig): + # Make sure that an estimator with requires_y=True fails gracefully when + # given y=None + + rng = np.random.RandomState(0) + + estimator = clone(estimator_orig) + set_random_state(estimator) + + n_samples = 100 + X = rng.normal(loc=100, size=(n_samples, 2)) + X = _pairwise_estimator_convert_X(X, estimator) + + warning_msg = ("As of scikit-learn 0.23, estimators should have a " + "'requires_y' tag set to the appropriate value. " + "The default value of the tag is False. " + "An error will be raised from version 0.25 when calling " + "check_estimator() if the tag isn't properly set.") + + expected_err_msgs = ( + "requires y to be passed, but the target y is None", + "Expected array-like (array or non-string sequence), got None", + "y should be a 1d array" + ) + + try: + estimator.fit(X, None) + except ValueError as ve: + if not any(msg in str(ve) for msg in expected_err_msgs): + warnings.warn(warning_msg, FutureWarning) From 5f6dfcb4d18efb100e5f540d04e0b2287383f845 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 08:42:04 -0400 Subject: [PATCH 033/125] API Deprecate positional arguments in pipeline (#16997) --- sklearn/pipeline.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index c1bbdbd629ff8..477354107e133 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -21,6 +21,7 @@ from .utils.metaestimators import if_delegate_has_method from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory +from .utils.validation import _deprecate_positional_args from .utils.metaestimators import _BaseComposition @@ -104,7 +105,8 @@ class Pipeline(_BaseComposition): # BaseEstimator interface _required_parameters = ['steps'] - def __init__(self, steps, memory=None, verbose=False): + @_deprecate_positional_args + def __init__(self, steps, *, memory=None, verbose=False): self.steps = steps self.memory = memory self.verbose = verbose @@ -797,7 +799,8 @@ class FeatureUnion(TransformerMixin, _BaseComposition): """ _required_parameters = ["transformer_list"] - def __init__(self, transformer_list, n_jobs=None, + @_deprecate_positional_args + def __init__(self, transformer_list, *, n_jobs=None, transformer_weights=None, verbose=False): self.transformer_list = transformer_list self.n_jobs = n_jobs From 1523f395e952dd79d2427d8230056ed0cf47f7a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Wed, 22 Apr 2020 14:50:42 +0200 Subject: [PATCH 034/125] MNT fix memory leak in elkan KMeans 
(#17000) --- sklearn/cluster/_k_means_elkan.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx index 65c8871fbb456..70c4abb0c4ac7 100644 --- a/sklearn/cluster/_k_means_elkan.pyx +++ b/sklearn/cluster/_k_means_elkan.pyx @@ -322,6 +322,9 @@ def _elkan_iter_chunked_dense( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_dense(X, sample_weight, centers_old, centers_new, weight_in_clusters, labels) @@ -553,6 +556,9 @@ def _elkan_iter_chunked_sparse( for k in range(n_features): centers_new[j, k] += centers_new_chunk[j * n_features + k] + free(centers_new_chunk) + free(weight_in_clusters_chunk) + if update_centers: _relocate_empty_clusters_sparse( X_data, X_indices, X_indptr, sample_weight, From 388999b59d0070fafdcd76cf599af6758b25d987 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 09:20:57 -0400 Subject: [PATCH 035/125] API Deprecate positional arguments in tree module (#16966) --- sklearn/tree/_classes.py | 16 +++++++++++----- sklearn/tree/_export.py | 10 +++++++--- sklearn/tree/tests/test_export.py | 3 +-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 81cd7bd8e8989..3994613d92b6b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -36,6 +36,7 @@ from ..utils import compute_sample_weight from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._criterion import Criterion from ._splitter import Splitter @@ -82,7 +83,8 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion, splitter, max_depth, @@ -815,7 +817,8 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., 0.93..., 0.93..., 1. , 0.93..., 1. ]) """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="gini", splitter="best", max_depth=None, @@ -1169,7 +1172,8 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50..., 0.16..., 0.11..., -0.73..., -0.30..., -0.00...]) """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="mse", splitter="best", max_depth=None, @@ -1499,7 +1503,8 @@ class ExtraTreeClassifier(DecisionTreeClassifier): >>> cls.score(X_test, y_test) 0.8947... """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="gini", splitter="random", max_depth=None, @@ -1716,7 +1721,8 @@ class ExtraTreeRegressor(DecisionTreeRegressor): >>> reg.score(X_test, y_test) 0.33... """ - def __init__(self, + @_deprecate_positional_args + def __init__(self, *, criterion="mse", splitter="random", max_depth=None, diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 3197995818f81..d0f67326012e9 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,6 +17,7 @@ import numpy as np from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..base import is_classifier from . 
import _criterion @@ -77,7 +78,8 @@ def __repr__(self): SENTINEL = Sentinel() -def plot_tree(decision_tree, max_depth=None, feature_names=None, +@_deprecate_positional_args +def plot_tree(decision_tree, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, impurity=True, node_ids=False, proportion=False, rotate='deprecated', rounded=False, @@ -656,7 +658,8 @@ def recurse(self, node, tree, ax, scale_x, scale_y, height, depth=0): ax.annotate("\n (...) \n", xy_parent, xy, **kwargs) -def export_graphviz(decision_tree, out_file=None, max_depth=None, +@_deprecate_positional_args +def export_graphviz(decision_tree, out_file=None, *, max_depth=None, feature_names=None, class_names=None, label='all', filled=False, leaves_parallel=False, impurity=True, node_ids=False, proportion=False, rotate=False, @@ -807,7 +810,8 @@ def compute_depth_(current_node, current_depth, return max(depths) -def export_text(decision_tree, feature_names=None, max_depth=10, +@_deprecate_positional_args +def export_text(decision_tree, *, feature_names=None, max_depth=10, spacing=3, decimals=2, show_weights=False): """Build a text report showing the rules of a decision tree. diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index ad49f81fcf9ac..f12f1daeb57c1 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -465,6 +465,5 @@ def test_not_fitted_tree(pyplot): # Testing if not fitted tree throws the correct error clf = DecisionTreeRegressor() - out = StringIO() with pytest.raises(NotFittedError): - plot_tree(clf, out) + plot_tree(clf) From 79df4068585ec06e591bffe3330bcb783c0497dd Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 17:11:32 -0400 Subject: [PATCH 036/125] API Deprecate positional arguments in random_projection (#16995) * API Deprecate positional arguments in random_projection * CLN Address comments --- sklearn/random_projection.py | 9 ++++++--- sklearn/tests/test_random_projection.py | 14 +++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index d18f3bf846901..61eeeea5ef45e 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -39,6 +39,7 @@ from .utils.extmath import safe_sparse_dot from .utils.random import sample_without_replacement from .utils.validation import check_array, check_is_fitted +from .utils.validation import _deprecate_positional_args from .exceptions import DataDimensionalityWarning from .utils import deprecated @@ -310,7 +311,7 @@ class BaseRandomProjection(TransformerMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, n_components='auto', eps=0.1, dense_output=False, + def __init__(self, n_components='auto', *, eps=0.1, dense_output=False, random_state=None): self.n_components = n_components self.eps = eps @@ -489,7 +490,8 @@ class GaussianRandomProjection(BaseRandomProjection): SparseRandomProjection """ - def __init__(self, n_components='auto', eps=0.1, random_state=None): + @_deprecate_positional_args + def __init__(self, n_components='auto', *, eps=0.1, random_state=None): super().__init__( n_components=n_components, eps=eps, @@ -626,7 +628,8 @@ class SparseRandomProjection(BaseRandomProjection): https://users.soe.ucsc.edu/~optas/papers/jl.pdf """ - def __init__(self, n_components='auto', density='auto', eps=0.1, + @_deprecate_positional_args + def __init__(self, n_components='auto', *, density='auto', eps=0.1, dense_output=False, 
random_state=None): super().__init__( n_components=n_components, diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index 033bb84279d54..b8d69632105b0 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -62,21 +62,21 @@ def densify(matrix): # test on JL lemma ############################################################################### def test_invalid_jl_domain(): - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, 1.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, 0.0) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, -0.1) - assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, 0.5) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=1.1) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=0.0) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 100, eps=-0.1) + assert_raises(ValueError, johnson_lindenstrauss_min_dim, 0, eps=0.5) def test_input_size_jl_min_dim(): assert_raises(ValueError, johnson_lindenstrauss_min_dim, - 3 * [100], 2 * [0.9]) + 3 * [100], eps=2 * [0.9]) assert_raises(ValueError, johnson_lindenstrauss_min_dim, 3 * [100], - 2 * [0.9]) + eps=2 * [0.9]) johnson_lindenstrauss_min_dim(np.random.randint(1, 10, size=(10, 10)), - np.full((10, 10), 0.5)) + eps=np.full((10, 10), 0.5)) ############################################################################### From 6717c6afa0d1f7b1c7c9ed7b2abee903d7072c7e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 22 Apr 2020 17:14:06 -0400 Subject: [PATCH 037/125] API Deprecate positional arguments in semi_supervised module (#16974) * API Keyword only for semi_supervised * CLN Address comments --- sklearn/semi_supervised/_label_propagation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index d46dacbe754e4..ccc6b889f41f6 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -68,6 +68,7 @@ from ..utils.extmath import safe_sparse_dot from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted, check_array +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -105,7 +106,8 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): for more details. 
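The runtime effect of ``_deprecate_positional_args`` combined with the bare
``*`` in these signatures is sketched below (warning text paraphrased)::

    from sklearn.semi_supervised import LabelPropagation

    # Preferred: everything after `kernel` is passed by keyword.
    LabelPropagation(kernel='knn', n_neighbors=10)

    # Still accepted during the deprecation period, but now emits a
    # FutureWarning asking for keyword arguments; it is planned to become
    # an error in a later release.
    LabelPropagation('knn', 20, 10)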
""" - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=1, max_iter=30, tol=1e-3, n_jobs=None): self.max_iter = max_iter @@ -378,7 +380,8 @@ class LabelPropagation(BaseLabelPropagation): _variant = 'propagation' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, max_iter=1000, tol=1e-3, n_jobs=None): super().__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, max_iter=max_iter, @@ -491,7 +494,8 @@ class LabelSpreading(BaseLabelPropagation): _variant = 'spreading' - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, + @_deprecate_positional_args + def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): # this one has different base parameters From 8010cadf6ca9d319321ac72ff5e604b941f0d40c Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 04:03:51 -0400 Subject: [PATCH 038/125] API Deprecate positional arguments in svm module (#16973) * API Keyword only for svm * BUG Fix * CLN Address comments --- sklearn/svm/_bounds.py | 4 +++- sklearn/svm/_classes.py | 25 ++++++++++++++++--------- sklearn/svm/tests/test_bounds.py | 7 ++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/sklearn/svm/_bounds.py b/sklearn/svm/_bounds.py index 1e1ed8939ce5f..b35728041f6cf 100644 --- a/sklearn/svm/_bounds.py +++ b/sklearn/svm/_bounds.py @@ -6,10 +6,12 @@ from ..preprocessing import LabelBinarizer from ..utils.validation import check_consistent_length, check_array +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import safe_sparse_dot -def l1_min_c(X, y, loss='squared_hinge', fit_intercept=True, +@_deprecate_positional_args +def l1_min_c(X, y, *, loss='squared_hinge', fit_intercept=True, intercept_scaling=1.0): """ Return the lowest bound for C such that for C in (l1_min_C, infinity) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 10975a6f8e4a2..77110da119a02 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.deprecation import deprecated @@ -177,9 +178,9 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - - def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=1e-4, - C=1.0, multi_class='ovr', fit_intercept=True, + @_deprecate_positional_args + def __init__(self, penalty='l2', loss='squared_hinge', *, dual=True, + tol=1e-4, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000): self.dual = dual @@ -364,7 +365,8 @@ class LinearSVR(RegressorMixin, LinearModel): various loss functions and regularization regimes. 
""" - def __init__(self, epsilon=0.0, tol=1e-4, C=1.0, + @_deprecate_positional_args + def __init__(self, *, epsilon=0.0, tol=1e-4, C=1.0, loss='epsilon_insensitive', fit_intercept=True, intercept_scaling=1., dual=True, verbose=0, random_state=None, max_iter=1000): @@ -627,7 +629,8 @@ class SVC(BaseSVC): _impl = 'c_svc' - def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', @@ -838,7 +841,8 @@ class NuSVC(BaseSVC): _impl = 'nu_svc' - def __init__(self, nu=0.5, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, nu=0.5, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=1e-3, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, @@ -992,7 +996,8 @@ class SVR(RegressorMixin, BaseLibSVM): _impl = 'epsilon_svr' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1): @@ -1137,7 +1142,8 @@ class NuSVR(RegressorMixin, BaseLibSVM): _impl = 'nu_svr' - def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, + @_deprecate_positional_args + def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, tol=1e-3, cache_size=200, verbose=False, max_iter=-1): @@ -1251,7 +1257,8 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): _impl = 'one_class' - def __init__(self, kernel='rbf', degree=3, gamma='scale', + @_deprecate_positional_args + def __init__(self, *, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 664a39c790b9b..8454ebf64de1a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -37,11 +37,12 @@ def test_l1_min_c(loss, X_label, Y_label, intercept_label): def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", - l1_min_c, dense_X, Y1, "l2") + l1_min_c, dense_X, Y1, loss="l2") def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): - min_c = l1_min_c(X, y, loss, fit_intercept, intercept_scaling) + min_c = l1_min_c(X, y, loss=loss, fit_intercept=fit_intercept, + intercept_scaling=intercept_scaling) clf = { 'log': LogisticRegression(penalty='l1', solver='liblinear'), @@ -72,4 +73,4 @@ def test_ill_posed_min_c(): def test_unsupported_loss(): with pytest.raises(ValueError): - l1_min_c(dense_X, Y1, 'l1') + l1_min_c(dense_X, Y1, loss='l1') From fbae1edfd8a7c0dda20ed7822c9f88f4bbf4f266 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 04:20:52 -0400 Subject: [PATCH 039/125] API kwonly args in base (#17006) --- sklearn/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index c0328e00d84d0..bf5ee370aa8f1 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -20,6 +20,7 @@ from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils.validation import 
_deprecate_positional_args _DEFAULT_TAGS = { 'non_deterministic': False, @@ -41,7 +42,8 @@ } -def clone(estimator, safe=True): +@_deprecate_positional_args +def clone(estimator, *, safe=True): """Constructs a new estimator with the same parameters. Clone does a deep copy of the model in an estimator From bbedfa0216877e26376acf147c50458710a4fc03 Mon Sep 17 00:00:00 2001 From: mathurinm Date: Thu, 23 Apr 2020 11:43:07 +0200 Subject: [PATCH 040/125] use semilogx for a more readable xaxis (#17001) --- .../plot_lasso_model_selection.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 88e83d434a3c6..73fc94fb94600 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -80,14 +80,12 @@ def plot_ic_criterion(model, name, color): - alpha_ = model.alpha_ + EPSILON - alphas_ = model.alphas_ + EPSILON criterion_ = model.criterion_ - plt.plot(-np.log10(alphas_), criterion_, '--', color=color, - linewidth=3, label='%s criterion' % name) - plt.axvline(-np.log10(alpha_), color=color, linewidth=3, + plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color, + linewidth=3, label='%s criterion' % name) + plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3, label='alpha: %s estimate' % name) - plt.xlabel('-log(alpha)') + plt.xlabel(r'$\alpha$') plt.ylabel('criterion') @@ -108,19 +106,17 @@ def plot_ic_criterion(model, name, color): t_lasso_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.alphas_ + EPSILON) - plt.figure() ymin, ymax = 2300, 3800 -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', +plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':') +plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_ + EPSILON), linestyle='--', color='k', +plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k', label='alpha: CV estimate') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: coordinate descent ' '(train time: %.2fs)' % t_lasso_cv) @@ -137,17 +133,15 @@ def plot_ic_criterion(model, name, color): t_lasso_lars_cv = time.time() - t1 # Display results -m_log_alphas = -np.log10(model.cv_alphas_ + EPSILON) - plt.figure() -plt.plot(m_log_alphas, model.mse_path_, ':') -plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k', - label='Average across the folds', linewidth=2) -plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':') +plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k', + label='Average across the folds', linewidth=2) +plt.axvline(model.alpha_, linestyle='--', color='k', label='alpha CV') plt.legend() -plt.xlabel('-log(alpha)') +plt.xlabel(r'$\alpha$') plt.ylabel('Mean square error') plt.title('Mean square error on each fold: Lars (train time: %.2fs)' % t_lasso_lars_cv) From 49d213d2fc8b9676c5ab3238d3756c8397543b05 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 08:19:35 -0400 Subject: [PATCH 041/125] [MRG] API kwonly for neural_network module (#17002) --- sklearn/neural_network/_multilayer_perceptron.py | 8 +++++--- sklearn/neural_network/_rbm.py | 5 +++-- 2 files changed, 8 
insertions(+), 5 deletions(-) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 3ec30336c23c1..f9b8fce5eb0c7 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -25,7 +25,7 @@ from ..utils import check_array, column_or_1d from ..exceptions import ConvergenceWarning from ..utils.extmath import safe_sparse_dot -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args from ..utils.multiclass import _check_partial_fit_first_call, unique_labels from ..utils.multiclass import type_of_target from ..utils.optimize import _check_optimize_result @@ -936,7 +936,8 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, power_t=0.5, max_iter=200, @@ -1339,7 +1340,8 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014). """ - def __init__(self, hidden_layer_sizes=(100,), activation="relu", + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, solver='adam', alpha=0.0001, batch_size='auto', learning_rate="constant", learning_rate_init=0.001, diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 03b69c656b4a3..67e1d68a3607e 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -20,7 +20,7 @@ from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot from ..utils.extmath import log_logistic -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args class BernoulliRBM(TransformerMixin, BaseEstimator): @@ -106,7 +106,8 @@ class BernoulliRBM(TransformerMixin, BaseEstimator): Approximations to the Likelihood Gradient. International Conference on Machine Learning (ICML) 2008 """ - def __init__(self, n_components=256, learning_rate=0.1, batch_size=10, + @_deprecate_positional_args + def __init__(self, n_components=256, *, learning_rate=0.1, batch_size=10, n_iter=10, verbose=0, random_state=None): self.n_components = n_components self.learning_rate = learning_rate From 946fddec7b62215191524c3f950a41fe944d014c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Apr 2020 10:19:45 -0400 Subject: [PATCH 042/125] BUG Fix instability issue of ARDRegression (with speedup) (#16849) --- doc/whats_new/v0.23.rst | 6 ++ sklearn/externals/_scipy_linalg.py | 118 ----------------------- sklearn/linear_model/_bayes.py | 59 ++++++++---- sklearn/linear_model/tests/test_bayes.py | 48 ++++++--- sklearn/utils/fixes.py | 8 -- 5 files changed, 83 insertions(+), 156 deletions(-) delete mode 100644 sklearn/externals/_scipy_linalg.py diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 3b8af7ccf416d..6719e511a583d 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -303,6 +303,12 @@ Changelog of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. 
:pr:`16266` by :user:`Rushabh Vasani `. +- |Fix| |Efficiency| :class:`linear_model.ARDRegression` is more stable and + much faster when `n_samples > n_features`. It can now scale to hundreds of + thousands of samples. The stability fix might imply changes in the number + of non-zero coefficients and in the predicted output. :pr:`16849` by + `Nicolas Hug`_. + - |Enhancement| :class:`linear_model.LassoLars` and :class:`linear_model.Lars` now support a `jitter` parameter that adds random noise to the target. This might help with stability in some edge diff --git a/sklearn/externals/_scipy_linalg.py b/sklearn/externals/_scipy_linalg.py deleted file mode 100644 index 70a6ff5a0c623..0000000000000 --- a/sklearn/externals/_scipy_linalg.py +++ /dev/null @@ -1,118 +0,0 @@ -# This should remained pinned to version 1.2 and not updated like other -# externals. -"""Copyright (c) 2001-2002 Enthought, Inc. 2003-2019, SciPy Developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -""" - -import numpy as np -import scipy.linalg.decomp as decomp - - -def pinvh(a, cond=None, rcond=None, lower=True, return_rank=False, - check_finite=True): - """ - Compute the (Moore-Penrose) pseudo-inverse of a Hermitian matrix. - - Copied in from scipy==1.2.2, in order to preserve the default choice of the - `cond` and `above_cutoff` values which determine which values of the matrix - inversion lie below threshold and are so set to zero. Changes in scipy 1.3 - resulted in a smaller default threshold and thus slower convergence of - dependent algorithms in some cases (see Sklearn github issue #14055). - - Calculate a generalized inverse of a Hermitian or real symmetric matrix - using its eigenvalue decomposition and including all eigenvalues with - 'large' absolute value. - - Parameters - ---------- - a : (N, N) array_like - Real symmetric or complex hermetian matrix to be pseudo-inverted - cond, rcond : float or None - Cutoff for 'small' eigenvalues. - Singular values smaller than rcond * largest_eigenvalue are considered - zero. - - If None or -1, suitable machine precision is used. 
- lower : bool, optional - Whether the pertinent array data is taken from the lower or upper - triangle of a. (Default: lower) - return_rank : bool, optional - if True, return the effective rank of the matrix - check_finite : bool, optional - Whether to check that the input matrix contains only finite numbers. - Disabling may give a performance gain, but may result in problems - (crashes, non-termination) if the inputs do contain infinities or NaNs. - - Returns - ------- - B : (N, N) ndarray - The pseudo-inverse of matrix `a`. - rank : int - The effective rank of the matrix. Returned if return_rank == True - - Raises - ------ - LinAlgError - If eigenvalue does not converge - - Examples - -------- - >>> from scipy.linalg import pinvh - >>> a = np.random.randn(9, 6) - >>> a = np.dot(a, a.T) - >>> B = pinvh(a) - >>> np.allclose(a, np.dot(a, np.dot(B, a))) - True - >>> np.allclose(B, np.dot(B, np.dot(a, B))) - True - - """ - a = decomp._asarray_validated(a, check_finite=check_finite) - s, u = decomp.eigh(a, lower=lower, check_finite=False) - - if rcond is not None: - cond = rcond - if cond in [None, -1]: - t = u.dtype.char.lower() - factor = {'f': 1E3, 'd': 1E6} - cond = factor[t] * np.finfo(t).eps - - # For Hermitian matrices, singular values equal abs(eigenvalues) - above_cutoff = (abs(s) > cond * np.max(abs(s))) - psigma_diag = 1.0 / s[above_cutoff] - u = u[:, above_cutoff] - - B = np.dot(u * psigma_diag, np.conjugate(u).T) - - if return_rank: - return B, len(psigma_diag) - else: - return B diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index eb12a271695cd..c69ebc1ce4307 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -12,7 +12,7 @@ from ._base import LinearModel, _rescale_data from ..base import RegressorMixin from ..utils.extmath import fast_logdet -from ..utils.fixes import pinvh +from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight from ..utils.validation import _deprecate_positional_args @@ -554,27 +554,16 @@ def fit(self, X, y): self.scores_ = list() coef_old_ = None - # Compute sigma and mu (using Woodbury matrix identity) - def update_sigma(X, alpha_, lambda_, keep_lambda, n_samples): - sigma_ = pinvh(np.eye(n_samples) / alpha_ + - np.dot(X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1]), - X[:, keep_lambda].T)) - sigma_ = np.dot(sigma_, X[:, keep_lambda] * - np.reshape(1. / lambda_[keep_lambda], [1, -1])) - sigma_ = - np.dot(np.reshape(1. / lambda_[keep_lambda], [-1, 1]) * - X[:, keep_lambda].T, sigma_) - sigma_.flat[::(sigma_.shape[1] + 1)] += 1. 
/ lambda_[keep_lambda] - return sigma_ - def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): coef_[keep_lambda] = alpha_ * np.dot( sigma_, np.dot(X[:, keep_lambda].T, y)) return coef_ + update_sigma = (self._update_sigma if n_samples >= n_features + else self._update_sigma_woodbury) # Iterative procedure of ARDRegression for iter_ in range(self.n_iter): - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) # Update alpha and lambda @@ -606,9 +595,15 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): break coef_old_ = np.copy(coef_) - # update sigma and mu using updated parameters from the last iteration - sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda, n_samples) - coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + if not keep_lambda.any(): + break + + if keep_lambda.any(): + # update sigma and mu using updated params from the last iteration + sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda) + coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_) + else: + sigma_ = np.array([]).reshape(0, 0) self.coef_ = coef_ self.alpha_ = alpha_ @@ -617,6 +612,34 @@ def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_): self._set_intercept(X_offset_, y_offset_, X_scale_) return self + def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples < n_features and will invert + # a matrix of shape (n_samples, n_samples) making use of the + # woodbury formula: + # https://en.wikipedia.org/wiki/Woodbury_matrix_identity + n_samples = X.shape[0] + X_keep = X[:, keep_lambda] + inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1) + sigma_ = pinvh( + np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T) + ) + sigma_ = np.dot(sigma_, X_keep * inv_lambda) + sigma_ = - np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_) + sigma_[np.diag_indices(sigma_.shape[1])] += 1. / lambda_[keep_lambda] + return sigma_ + + def _update_sigma(self, X, alpha_, lambda_, keep_lambda): + # See slides as referenced in the docstring note + # this function is used when n_samples >= n_features and will + # invert a matrix of shape (n_features, n_features) + X_keep = X[:, keep_lambda] + gram = np.dot(X_keep.T, X_keep) + eye = np.eye(gram.shape[0]) + sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram + sigma_ = pinvh(sigma_inv) + return sigma_ + def predict(self, X, return_std=False): """Predict using the linear model. diff --git a/sklearn/linear_model/tests/test_bayes.py b/sklearn/linear_model/tests/test_bayes.py index e1922a010514f..ff3ac13c2d7f6 100644 --- a/sklearn/linear_model/tests/test_bayes.py +++ b/sklearn/linear_model/tests/test_bayes.py @@ -7,6 +7,8 @@ import numpy as np from scipy.linalg import pinvh +import pytest + from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -159,7 +161,7 @@ def test_std_bayesian_ridge_ard_with_constant_input(): # Test BayesianRidge and ARDRegression standard dev. for edge case of # constant target vector # The standard dev. 
should be relatively small (< 0.01 is tested here) - n_samples = 4 + n_samples = 10 n_features = 5 random_state = check_random_state(42) constant_value = random_state.rand() @@ -181,9 +183,9 @@ def test_update_of_sigma_in_ard(): y = np.array([0, 0]) clf = ARDRegression(n_iter=1) clf.fit(X, y) - # With the inputs above, ARDRegression prunes one of the two coefficients - # in the first iteration. Hence, the expected shape of `sigma_` is (1, 1). - assert clf.sigma_.shape == (1, 1) + # With the inputs above, ARDRegression prunes both of the two coefficients + # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0). + assert clf.sigma_.shape == (0, 0) # Ensure that no error is thrown at prediction stage clf.predict(X, return_std=True) @@ -200,22 +202,19 @@ def test_toy_ard_object(): assert_array_almost_equal(clf.predict(test), [1, 3, 4], 2) -def test_ard_accuracy_on_easy_problem(): +@pytest.mark.parametrize('seed', range(100)) +@pytest.mark.parametrize('n_samples, n_features', ((10, 100), (100, 10))) +def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features): # Check that ARD converges with reasonable accuracy on an easy problem # (Github issue #14055) - # This particular seed seems to converge poorly in the failure-case - # (scipy==1.3.0, sklearn==0.21.2) - seed = 45 X = np.random.RandomState(seed=seed).normal(size=(250, 3)) y = X[:, 1] - regressor = ARDRegression(n_iter=600) + regressor = ARDRegression() regressor.fit(X, y) abs_coef_error = np.abs(1 - regressor.coef_[1]) - # Expect an accuracy of better than 1E-4 in most cases - - # Failure-case produces 0.16! - assert abs_coef_error < 0.01 + assert abs_coef_error < 1e-10 def test_return_std(): @@ -248,3 +247,28 @@ def f_noise(X, noise_mult): m2.fit(X, y) y_mean2, y_std2 = m2.predict(X_test, return_std=True) assert_array_almost_equal(y_std2, noise_mult, decimal=decimal) + + +@pytest.mark.parametrize('seed', range(10)) +def test_update_sigma(seed): + # make sure the two update_sigma() helpers are equivalent. The woodbury + # formula is used when n_samples < n_features, and the other one is used + # otherwise. + + rng = np.random.RandomState(seed) + + # set n_samples == n_features to avoid instability issues when inverting + # the matrices. Using the woodbury formula would be unstable when + # n_samples > n_features + n_samples = n_features = 10 + X = rng.randn(n_samples, n_features) + alpha = 1 + lmbda = np.arange(1, n_features + 1) + keep_lambda = np.array([True] * n_features) + + reg = ARDRegression() + + sigma = reg._update_sigma(X, alpha, lmbda, keep_lambda) + sigma_woodbury = reg._update_sigma_woodbury(X, alpha, lmbda, keep_lambda) + + np.testing.assert_allclose(sigma, sigma_woodbury) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 622c102fbbd0b..6635c7345aec5 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -42,14 +42,6 @@ def _parse_version(version_string): # mypy error: Name 'lobpcg' already defined (possibly by an import) from ..externals._lobpcg import lobpcg # type: ignore # noqa -if sp_version >= (1, 3): - # Preserves earlier default choice of pinvh cutoff `cond` value. - # Can be removed once issue #14055 is fully addressed. 
- from ..externals._scipy_linalg import pinvh -else: - # mypy error: Name 'pinvh' already defined (possibly by an import) - from scipy.linalg import pinvh # type: ignore # noqa - def _object_dtype_isnan(X): return X != X From a93b15f19bf7db8027e04705603161c336fb1454 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 23 Apr 2020 16:27:35 +0200 Subject: [PATCH 043/125] ENH Poisson loss for HistGradientBoostingRegressor (#16692) --- doc/modules/ensemble.rst | 5 +- doc/whats_new/v0.23.rst | 5 ++ .../_hist_gradient_boosting/_loss.pyx | 33 +++++++++++- .../gradient_boosting.py | 23 +++++--- .../ensemble/_hist_gradient_boosting/loss.py | 54 ++++++++++++++++++- .../tests/test_gradient_boosting.py | 42 +++++++++++++++ .../tests/test_loss.py | 39 ++++++++++++-- 7 files changed, 184 insertions(+), 17 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 1416b9d3a6045..e731ece0bdb20 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -952,8 +952,9 @@ controls the number of iterations of the boosting process:: >>> clf.score(X_test, y_test) 0.8965 -Available losses for regression are 'least_squares' and -'least_absolute_deviation', which is less sensitive to outliers. For +Available losses for regression are 'least_squares', +'least_absolute_deviation', which is less sensitive to outliers, and +'poisson', which is well suited to model counts and frequencies. For classification, 'binary_crossentropy' is used for binary classification and 'categorical_crossentropy' is used for multiclass classification. By default the loss is 'auto' and will select the appropriate loss depending on diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6719e511a583d..844cdf0360f73 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,6 +215,11 @@ Changelog to obtain the input to the meta estimator. :pr:`16539` by :user:`Bill DeRose `. +- |Feature| Added additional option `loss="poisson"` to + :class:`ensemble.HistGradientBoostingRegressor`, which adds Poisson deviance + with log-link useful for modeling count data. + :pr:`16692` by :user:`Christian Lorentzen ` + :mod:`sklearn.feature_extraction` ................................. diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 821a81a48fcf3..64480911439e5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -10,7 +10,7 @@ from cython.parallel import prange import numpy as np cimport numpy as np -from libc.math cimport exp +from libc.math cimport exp, log from .common cimport Y_DTYPE_C from .common cimport G_H_DTYPE_C @@ -27,7 +27,7 @@ def _update_gradients_least_squares( n_samples = raw_predictions.shape[0] for i in prange(n_samples, schedule='static', nogil=True): - # Note: a more correct exp is 2 * (raw_predictions - y_true) + # Note: a more correct expression is 2 * (raw_predictions - y_true) # but since we use 1 for the constant hessian value (and not 2) this # is strictly equivalent for the leaves values. 
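# Concretely: for the half squared error l(raw) = (y_true - raw)**2 / 2 the
# exact gradient is (raw - y_true) and the hessian is 1. The full loss would
# give 2 * (raw - y_true) and 2; that common factor cancels in the leaf value
# -sum(gradients) / (sum(hessians) + l2_regularization) whenever the
# regularization is zero, and otherwise only rescales its effect.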
gradients[i] = raw_predictions[i] - y_true[i] @@ -87,6 +87,35 @@ def _update_gradients_least_absolute_deviation( gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 +def _update_gradients_hessians_poisson( + G_H_DTYPE_C [::1] gradients, # OUT + G_H_DTYPE_C [::1] hessians, # OUT + const Y_DTYPE_C [::1] y_true, # IN + const Y_DTYPE_C [::1] raw_predictions, # IN + const Y_DTYPE_C [::1] sample_weight): # IN + + cdef: + int n_samples + int i + Y_DTYPE_C y_pred + + n_samples = raw_predictions.shape[0] + if sample_weight is None: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. + y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) + hessians[i] = y_pred + else: + for i in prange(n_samples, schedule='static', nogil=True): + # Note: We use only half of the deviance loss. Therefore, there is + # no factor of 2. + y_pred = exp(raw_predictions[i]) + gradients[i] = (y_pred - y_true[i]) * sample_weight[i] + hessians[i] = y_pred * sample_weight[i] + + def _update_gradients_hessians_binary_crossentropy( G_H_DTYPE_C [::1] gradients, # OUT G_H_DTYPE_C [::1] hessians, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 6087adb0b6575..8287cda367a10 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -758,11 +758,13 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): Parameters ---------- - loss : {'least_squares', 'least_absolute_deviation'}, \ + loss : {'least_squares', 'least_absolute_deviation', 'poisson'}, \ optional (default='least_squares') The loss function to use in the boosting process. Note that the - "least squares" loss actually implements an "half least squares loss" - to simplify the computation of the gradient. + "least squares" and "poisson" losses actually implement + "half least squares loss" and "half poisson deviance" to simplify the + computation of the gradient. Furthermore, "poisson" loss internally + uses a log-link and requires ``y >= 0`` learning_rate : float, optional (default=0.1) The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no @@ -868,7 +870,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): 0.92... """ - _VALID_LOSSES = ('least_squares', 'least_absolute_deviation') + _VALID_LOSSES = ('least_squares', 'least_absolute_deviation', + 'poisson') @_deprecate_positional_args def __init__(self, loss='least_squares', *, learning_rate=0.1, @@ -902,14 +905,20 @@ def predict(self, X): y : ndarray, shape (n_samples,) The predicted values. 
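Because of the log-link, ``predict`` returns values on the original (count)
scale and is always positive. A short usage sketch of the new loss on a
synthetic, illustrative dataset (the experimental import is still required in
this release)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 3))
    y = rng.poisson(lam=np.exp(X @ np.array([0.5, -0.2, 1.0])))  # counts, y >= 0

    reg = HistGradientBoostingRegressor(loss='poisson', random_state=0)
    reg.fit(X, y)
    reg.predict(X[:5])   # positive predictions thanks to exp(raw_predictions)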
""" - # Return raw predictions after converting shape - # (n_samples, 1) to (n_samples,) - return self._raw_predict(X).ravel() + check_is_fitted(self) + # Return inverse link of raw predictions after converting + # shape (n_samples, 1) to (n_samples,) + return self.loss_.inverse_link_function(self._raw_predict(X).ravel()) def _encode_y(self, y): # Just convert y to the expected dtype self.n_trees_per_iteration_ = 1 y = y.astype(Y_DTYPE, copy=False) + if self.loss == 'poisson': + # Ensure y >= 0 and sum(y) > 0 + if not (np.all(y >= 0) and np.sum(y) > 0): + raise ValueError("loss='poisson' requires non-negative y and " + "sum(y) > 0.") return y def _get_loss(self, sample_weight): diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py index c7884a25a9c41..f256408bf01fb 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/loss.py @@ -9,7 +9,7 @@ from abc import ABC, abstractmethod import numpy as np -from scipy.special import expit, logsumexp +from scipy.special import expit, logsumexp, xlogy from .common import Y_DTYPE from .common import G_H_DTYPE @@ -19,11 +19,13 @@ from ._loss import _update_gradients_hessians_least_absolute_deviation from ._loss import _update_gradients_hessians_binary_crossentropy from ._loss import _update_gradients_hessians_categorical_crossentropy +from ._loss import _update_gradients_hessians_poisson from ...utils.stats import _weighted_percentile class BaseLoss(ABC): """Base class for a loss.""" + def __init__(self, hessians_are_constant): self.hessians_are_constant = hessians_are_constant @@ -153,6 +155,7 @@ class LeastSquares(BaseLoss): the computation of the gradients and get a unit hessian (and be consistent with what is done in LightGBM). """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -195,6 +198,7 @@ class LeastAbsoluteDeviation(BaseLoss): loss(x_i) = |y_true_i - raw_pred_i| """ + def __init__(self, sample_weight): # If sample weights are provided, the hessians and gradients # are multiplied by sample_weight, which means the hessians are @@ -265,6 +269,51 @@ def update_leaves_values(self, grower, y_true, raw_predictions, # Note that the regularization is ignored here +class Poisson(BaseLoss): + """Poisson deviance loss with log-link, for regression. + + For a given sample x_i, Poisson deviance loss is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i)) + - y_true_i + exp(raw_pred_i)) + + This actually computes half the Poisson deviance to simplify + the computation of the gradients. + """ + + def __init__(self, sample_weight): + super().__init__(hessians_are_constant=False) + + inverse_link_function = staticmethod(np.exp) + + def pointwise_loss(self, y_true, raw_predictions): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + # TODO: For speed, we could remove the constant xlogy(y_true, y_true) + # Advantage of this form: minimum of zero at raw_predictions = y_true. 
+ loss = (xlogy(y_true, y_true) - y_true * (raw_predictions + 1) + + np.exp(raw_predictions)) + return loss + + def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): + y_pred = np.average(y_train, weights=sample_weight) + eps = np.finfo(y_train.dtype).eps + y_pred = np.clip(y_pred, eps, None) + return np.log(y_pred) + + def update_gradients_and_hessians(self, gradients, hessians, y_true, + raw_predictions, sample_weight): + # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to + # return a view. + raw_predictions = raw_predictions.reshape(-1) + gradients = gradients.reshape(-1) + hessians = hessians.reshape(-1) + _update_gradients_hessians_poisson(gradients, hessians, + y_true, raw_predictions, + sample_weight) + + class BinaryCrossEntropy(BaseLoss): """Binary cross-entropy loss, for binary classification. @@ -372,5 +421,6 @@ def predict_proba(self, raw_predictions): 'least_squares': LeastSquares, 'least_absolute_deviation': LeastAbsoluteDeviation, 'binary_crossentropy': BinaryCrossEntropy, - 'categorical_crossentropy': CategoricalCrossEntropy + 'categorical_crossentropy': CategoricalCrossEntropy, + 'poisson': Poisson, } diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 6fc412942d180..dfed16dafca39 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -2,10 +2,13 @@ import pytest from numpy.testing import assert_allclose, assert_array_equal from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_low_rank_matrix from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.base import clone, BaseEstimator, TransformerMixin from sklearn.pipeline import make_pipeline +from sklearn.metrics import mean_poisson_deviance +from sklearn.dummy import DummyRegressor # To use this experimental feature, we need to explicitly ask for it: from sklearn.experimental import enable_hist_gradient_boosting # noqa @@ -194,6 +197,45 @@ def test_least_absolute_deviation(): assert gbdt.score(X, y) > .9 +@pytest.mark.parametrize('y', [([1., -2., 0.]), ([0., 0., 0.])]) +def test_poisson_y_positive(y): + # Test that ValueError is raised if either one y_i < 0 or sum(y_i) <= 0. + err_msg = r"loss='poisson' requires non-negative y and sum\(y\) > 0." + gbdt = HistGradientBoostingRegressor(loss='poisson', random_state=0) + with pytest.raises(ValueError, match=err_msg): + gbdt.fit(np.zeros(shape=(len(y), 1)), y) + + +def test_poisson(): + # For Poisson distributed target, Poisson loss should give better results + # than least squares measured in Poisson deviance as metric. + rng = np.random.RandomState(42) + n_train, n_test, n_features = 500, 100, 100 + X = make_low_rank_matrix(n_samples=n_train+n_test, n_features=n_features, + random_state=rng) + # We create a log-linear Poisson model and downscale coef as it will get + # exponentiated. 
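One detail worth noting for the comparison below: ``mean_poisson_deviance`` accepts zeros in ``y_true`` but requires strictly positive predictions, which is why the least-squares predictions are clipped further down in this test. A hedged illustration::

    import numpy as np
    from sklearn.metrics import mean_poisson_deviance

    y_true = np.array([0.0, 1.0, 3.0])   # zeros in the target are fine
    y_pred = np.array([0.1, 1.2, 2.5])   # predictions must be strictly positive
    mean_poisson_deviance(y_true, y_pred)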
+ coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) + y = rng.poisson(lam=np.exp(X @ coef)) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, + random_state=rng) + gbdt_pois = HistGradientBoostingRegressor(loss='poisson', random_state=rng) + gbdt_ls = HistGradientBoostingRegressor(loss='least_squares', + random_state=rng) + gbdt_pois.fit(X_train, y_train) + gbdt_ls.fit(X_train, y_train) + dummy = DummyRegressor(strategy="mean").fit(X_train, y_train) + + for X, y in [(X_train, y_train), (X_test, y_test)]: + metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X)) + # least_squares might produce non-positive predictions => clip + metric_ls = mean_poisson_deviance(y, np.clip(gbdt_ls.predict(X), 1e-15, + None)) + metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) + assert metric_pois < metric_ls + assert metric_pois < metric_dummy + + def test_binning_train_validation_are_separated(): # Make sure training and validation data are binned separately. # See issue 13926 diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 915dc300e4760..7fc6ab9097873 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -52,6 +52,9 @@ def get_hessians(y_true, raw_predictions): # ('binary_crossentropy', 0.3, 0), ('binary_crossentropy', -12, 1), ('binary_crossentropy', 30, 1), + ('poisson', 12., 1.), + ('poisson', 0., 2.), + ('poisson', -22., 10.), ]) @pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') @@ -76,10 +79,11 @@ def fprime(x): def fprime2(x): return get_hessians(y_true, x) - optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2) + optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2, + maxiter=70, tol=2e-8) assert np.allclose(loss.inverse_link_function(optimum), y_true) assert np.allclose(loss.pointwise_loss(y_true, optimum), 0) - assert np.allclose(get_gradients(y_true, optimum), 0) + assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7) @pytest.mark.parametrize('loss, n_classes, prediction_dim', [ @@ -87,6 +91,7 @@ def fprime2(x): ('least_absolute_deviation', 0, 1), ('binary_crossentropy', 2, 1), ('categorical_crossentropy', 3, 3), + ('poisson', 0, 1), ]) @pytest.mark.skipif(Y_DTYPE != np.float64, reason='Need 64 bits float precision for numerical checks') @@ -100,6 +105,8 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): n_samples = 100 if loss in ('least_squares', 'least_absolute_deviation'): y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif loss in ('poisson'): + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) raw_predictions = rng.normal( @@ -114,7 +121,7 @@ def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): # Approximate gradients # For multiclass loss, we should only change the predictions of one tree - # (here the first), hence the use of offset[:, 0] += eps + # (here the first), hence the use of offset[0, :] += eps # As a softmax is computed, offsetting the whole array by a constant would # have no effect on the probabilities, and thus on the loss eps = 1e-9 @@ -164,6 +171,27 @@ def test_baseline_least_absolute_deviation(): assert baseline_prediction == pytest.approx(np.median(y_train)) +def test_baseline_poisson(): + rng = np.random.RandomState(0) + + loss = 
_LOSSES['poisson'](sample_weight=None) + y_train = rng.poisson(size=100).astype(np.float64) + # Sanity check, make sure at least one sample is non-zero so we don't take + # log(0) + assert y_train.sum() > 0 + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert np.isscalar(baseline_prediction) + assert baseline_prediction.dtype == y_train.dtype + assert_all_finite(baseline_prediction) + # Make sure baseline prediction produces the log of the mean of all targets + assert_almost_equal(np.log(y_train.mean()), baseline_prediction) + + # Test baseline for y_true = 0 + y_train.fill(0.) + baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) + assert_all_finite(baseline_prediction) + + def test_baseline_binary_crossentropy(): rng = np.random.RandomState(0) @@ -215,7 +243,8 @@ def test_baseline_categorical_crossentropy(): ('least_squares', 'regression'), ('least_absolute_deviation', 'regression'), ('binary_crossentropy', 'classification'), - ('categorical_crossentropy', 'classification') + ('categorical_crossentropy', 'classification'), + ('poisson', 'poisson_regression'), ]) @pytest.mark.parametrize('sample_weight', ['ones', 'random']) def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): @@ -232,6 +261,8 @@ def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): if problem == 'regression': y_true = rng.normal(size=n_samples).astype(Y_DTYPE) + elif problem == 'poisson_regression': + y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) else: y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) From 36934921c5c6a39eb09ee80493483ff0520146d8 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 18:06:36 +0200 Subject: [PATCH 044/125] DOC Add link and fix typo in nearest neighbours estimators (#17017) --- sklearn/neighbors/_classification.py | 8 ++++---- sklearn/neighbors/_regression.py | 4 ++-- sklearn/neighbors/_unsupervised.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 0580b710afd44..e223476d3107b 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -71,10 +71,10 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None @@ -303,10 +303,10 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. 
X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. outlier_label : {manual label, 'most_frequent'}, default=None diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index a00d83c98102b..cce218062a3d5 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -253,10 +253,10 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, default=None diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 6faafeee9ffcd..923a465b1d31b 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -43,10 +43,10 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, metric : str or callable, default='minkowski' the distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of the DistanceMetric class for a + metric. See the documentation of :class:`DistanceMetric` for a list of available metrics. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. p : int, default=2 From 2592edddb6eb326a65311f081e668d93ab044703 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 12:12:07 -0400 Subject: [PATCH 045/125] MNT Fixes DataConversionWarning doctest in pypy (#16965) --- sklearn/exceptions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 0083632418c8b..1b71050813d2b 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -98,10 +98,9 @@ class DataConversionWarning(UserWarning): ... Y = validation.column_or_1d(Y,warn=True) ... except ValueError: ... pass - ... print(repr(w[-1].message)) - DataConversionWarning('A column-vector y was passed when a - 1d array was expected. Please change the shape of y to - (n_samples, ), for example using ravel().') + ... print(w[-1].message) + A column-vector y was passed when a 1d array was expected. Please change + the shape of y to (n_samples, ), for example using ravel(). .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
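Relating to the ``sparse graph`` wording introduced in the neighbors patch above: with ``metric='precomputed'`` an estimator can be given either a square distance matrix or a precomputed sparse graph whose stored entries are the only candidate neighbors. A hedged sketch::

    import numpy as np
    from sklearn.metrics import pairwise_distances
    from sklearn.neighbors import KNeighborsClassifier, kneighbors_graph

    rng = np.random.RandomState(0)
    X = rng.normal(size=(20, 3))
    y = (X[:, 0] > 0).astype(int)

    # square, precomputed distance matrix
    D = pairwise_distances(X)
    clf = KNeighborsClassifier(n_neighbors=3, metric='precomputed').fit(D, y)
    clf.predict(D[:2])  # rows hold distances from the query points to the training points

    # sparse graph: only the stored ("nonzero") entries are candidate neighbors
    G = kneighbors_graph(X, n_neighbors=5, mode='distance')
    clf_sparse = KNeighborsClassifier(n_neighbors=3, metric='precomputed').fit(G, y)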
From e54cd3c0617e3485baa19e2c69332da55b363636 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 13:10:49 -0400 Subject: [PATCH 046/125] API Deprecate positional arguments in preprocessing (#16996) * API Deprecate positional arguments in preprocessing * CLN Address comments * CLN Uses classes as a keyword only argument * BUG Fix --- sklearn/calibration.py | 4 +- sklearn/metrics/_ranking.py | 4 +- sklearn/metrics/tests/test_classification.py | 2 +- sklearn/preprocessing/_data.py | 54 ++++++++++++------- sklearn/preprocessing/_discretization.py | 4 +- sklearn/preprocessing/_encoders.py | 7 ++- .../preprocessing/_function_transformer.py | 5 +- sklearn/preprocessing/_label.py | 13 +++-- sklearn/preprocessing/tests/test_data.py | 10 ++-- sklearn/preprocessing/tests/test_label.py | 10 ++-- 10 files changed, 71 insertions(+), 42 deletions(-) diff --git a/sklearn/calibration.py b/sklearn/calibration.py index 8a719d49bd6de..31df362ddb009 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -329,7 +329,7 @@ def fit(self, X, y, sample_weight=None): self.label_encoder_.fit(self.classes) self.classes_ = self.label_encoder_.classes_ - Y = label_binarize(y, self.classes_) + Y = label_binarize(y, classes=self.classes_) df, idx_pos_class = self._preproc(X) self.calibrators_ = [] @@ -574,7 +574,7 @@ def calibration_curve(y_true, y_prob, normalize=False, n_bins=5, if len(labels) > 2: raise ValueError("Only binary classification is supported. " "Provided labels %s." % labels) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] if strategy == 'quantile': # Determine bin edges by distribution of data quantiles = np.linspace(0, 1, n_bins + 1) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index e525539c0d706..ad867efb8bfa3 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -383,7 +383,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, multi_class, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = label_binarize(y_true, classes=labels)[:, 0] return _average_binary_score(partial(_binary_roc_auc_score, max_fpr=max_fpr), y_true, y_score, average, @@ -489,7 +489,7 @@ def _multiclass_roc_auc_score(y_true, y_score, labels, y_score, average=average) else: # ovr is same as multi-label - y_true_multilabel = label_binarize(y_true, classes) + y_true_multilabel = label_binarize(y_true, classes=classes) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index be6364e63b2cd..ca56e79299adb 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -654,7 +654,7 @@ def test_matthews_corrcoef(): y_true_inv = ["b" if i == "a" else "a" for i in y_true] assert_almost_equal(matthews_corrcoef(y_true, y_true_inv), -1) - y_true_inv2 = label_binarize(y_true, ["a", "b"]) + y_true_inv2 = label_binarize(y_true, classes=["a", "b"]) y_true_inv2 = np.where(y_true_inv2, 'a', 'b') assert_almost_equal(matthews_corrcoef(y_true, y_true_inv2), -1) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index c95351db9d985..f9af3dbac6d0d 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -29,7 +29,7 @@ mean_variance_axis, 
incr_mean_variance_axis, min_max_axis) from ..utils.validation import (check_is_fitted, check_random_state, - FLOAT_DTYPES) + FLOAT_DTYPES, _deprecate_positional_args) from ._csr_polynomial_expansion import _csr_polynomial_expansion @@ -78,7 +78,8 @@ def _handle_zeros_in_scale(scale, copy=True): return scale -def scale(X, axis=0, with_mean=True, with_std=True, copy=True): +@_deprecate_positional_args +def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): """Standardize a dataset along any axis Center to the mean and component wise scale to unit variance. @@ -291,7 +292,8 @@ class MinMaxScaler(TransformerMixin, BaseEstimator): `. """ - def __init__(self, feature_range=(0, 1), copy=True): + @_deprecate_positional_args + def __init__(self, feature_range=(0, 1), *, copy=True): self.feature_range = feature_range self.copy = copy @@ -435,7 +437,8 @@ def _more_tags(self): return {'allow_nan': True} -def minmax_scale(X, feature_range=(0, 1), axis=0, copy=True): +@_deprecate_positional_args +def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): """Transform features by scaling each feature to a given range. This estimator scales and translates each feature individually such @@ -626,7 +629,8 @@ class StandardScaler(TransformerMixin, BaseEstimator): `. """ # noqa - def __init__(self, copy=True, with_mean=True, with_std=True): + @_deprecate_positional_args + def __init__(self, *, copy=True, with_mean=True, with_std=True): self.with_mean = with_mean self.with_std = with_std self.copy = copy @@ -908,7 +912,8 @@ class MaxAbsScaler(TransformerMixin, BaseEstimator): `. """ - def __init__(self, copy=True): + @_deprecate_positional_args + def __init__(self, *, copy=True): self.copy = copy def _reset(self): @@ -1024,7 +1029,8 @@ def _more_tags(self): return {'allow_nan': True} -def maxabs_scale(X, axis=0, copy=True): +@_deprecate_positional_args +def maxabs_scale(X, *, axis=0, copy=True): """Scale each feature to the [-1, 1] range without breaking the sparsity. This estimator scales each feature individually such @@ -1172,8 +1178,8 @@ class RobustScaler(TransformerMixin, BaseEstimator): https://en.wikipedia.org/wiki/Median https://en.wikipedia.org/wiki/Interquartile_range """ - - def __init__(self, with_centering=True, with_scaling=True, + @_deprecate_positional_args + def __init__(self, *, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): self.with_centering = with_centering self.with_scaling = with_scaling @@ -1282,7 +1288,8 @@ def _more_tags(self): return {'allow_nan': True} -def robust_scale(X, axis=0, with_centering=True, with_scaling=True, +@_deprecate_positional_args +def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True): """Standardize a dataset along any axis @@ -1433,7 +1440,8 @@ class PolynomialFeatures(TransformerMixin, BaseEstimator): See :ref:`examples/linear_model/plot_polynomial_interpolation.py ` """ - def __init__(self, degree=2, interaction_only=False, include_bias=True, + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, order='C'): self.degree = degree self.interaction_only = interaction_only @@ -1638,7 +1646,8 @@ def transform(self, X): return XP -def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): +@_deprecate_positional_args +def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). Read more in the :ref:`User Guide `. 
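The practical effect of ``_deprecate_positional_args`` on these preprocessing helpers, as a hedged sketch (the warning itself is emitted by the decorator and is not shown in this diff)::

    import numpy as np
    from sklearn.preprocessing import minmax_scale, StandardScaler

    X = np.arange(6, dtype=float).reshape(3, 2)

    # preferred style: everything after the data is passed by keyword
    minmax_scale(X, feature_range=(0, 1), axis=0)
    StandardScaler(with_mean=True, with_std=True).fit_transform(X)

    # passing the now keyword-only arguments positionally, e.g.
    # minmax_scale(X, (0, 1), 0), triggers a deprecation warning during the
    # transition period and is slated to become an error later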
@@ -1797,7 +1806,8 @@ class Normalizer(TransformerMixin, BaseEstimator): normalize: Equivalent function without the estimator API. """ - def __init__(self, norm='l2', copy=True): + @_deprecate_positional_args + def __init__(self, norm='l2', *, copy=True): self.norm = norm self.copy = copy @@ -1833,7 +1843,8 @@ def _more_tags(self): return {'stateless': True} -def binarize(X, threshold=0.0, copy=True): +@_deprecate_positional_args +def binarize(X, *, threshold=0.0, copy=True): """Boolean thresholding of array-like or scipy.sparse matrix Read more in the :ref:`User Guide `. @@ -1931,7 +1942,8 @@ class Binarizer(TransformerMixin, BaseEstimator): binarize: Equivalent function without the estimator API. """ - def __init__(self, threshold=0.0, copy=True): + @_deprecate_positional_args + def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy @@ -2228,7 +2240,8 @@ class QuantileTransformer(TransformerMixin, BaseEstimator): `. """ - def __init__(self, n_quantiles=1000, output_distribution='uniform', + @_deprecate_positional_args + def __init__(self, *, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), random_state=None, copy=True): self.n_quantiles = n_quantiles @@ -2560,7 +2573,8 @@ def _more_tags(self): return {'allow_nan': True} -def quantile_transform(X, axis=0, n_quantiles=1000, +@_deprecate_positional_args +def quantile_transform(X, *, axis=0, n_quantiles=1000, output_distribution='uniform', ignore_implicit_zeros=False, subsample=int(1e5), @@ -2764,7 +2778,8 @@ class PowerTransformer(TransformerMixin, BaseEstimator): .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the Royal Statistical Society B, 26, 211-252 (1964). """ - def __init__(self, method='yeo-johnson', standardize=True, copy=True): + @_deprecate_positional_args + def __init__(self, method='yeo-johnson', *, standardize=True, copy=True): self.method = method self.standardize = standardize self.copy = copy @@ -3034,7 +3049,8 @@ def _more_tags(self): return {'allow_nan': True} -def power_transform(X, method='yeo-johnson', standardize=True, copy=True): +@_deprecate_positional_args +def power_transform(X, method='yeo-johnson', *, standardize=True, copy=True): """ Power transforms are a family of parametric, monotonic transformations that are applied to make data more Gaussian-like. 
This is useful for diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 67641601e06f5..581765a81361e 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -16,6 +16,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args class KBinsDiscretizer(TransformerMixin, BaseEstimator): @@ -115,7 +116,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [ 0.5, 3.5, -1.5, 1.5]]) """ - def __init__(self, n_bins=5, encode='onehot', strategy='quantile'): + @_deprecate_positional_args + def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'): self.n_bins = n_bins self.encode = encode self.strategy = strategy diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index aa3d8d9dabca8..c8f8ba6781400 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -8,6 +8,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ._label import _encode, _encode_check_unknown @@ -292,7 +293,8 @@ class OneHotEncoder(_BaseEncoder): [1., 0., 1., 0.]]) """ - def __init__(self, categories='auto', drop=None, sparse=True, + @_deprecate_positional_args + def __init__(self, *, categories='auto', drop=None, sparse=True, dtype=np.float64, handle_unknown='error'): self.categories = categories self.sparse = sparse @@ -653,7 +655,8 @@ class OrdinalEncoder(_BaseEncoder): ['Female', 2]], dtype=object) """ - def __init__(self, categories='auto', dtype=np.float64): + @_deprecate_positional_args + def __init__(self, *, categories='auto', dtype=np.float64): self.categories = categories self.dtype = dtype diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 9cf365ebb3cdf..21dd40365f5a0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -2,6 +2,7 @@ from ..base import BaseEstimator, TransformerMixin from ..utils.validation import _allclose_dense_sparse +from ..utils.validation import _deprecate_positional_args def _identity(X): @@ -78,7 +79,9 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): array([[0. , 0.6931...], [1.0986..., 1.3862...]]) """ - def __init__(self, func=None, inverse_func=None, validate=False, + + @_deprecate_positional_args + def __init__(self, func=None, inverse_func=None, *, validate=False, accept_sparse=False, check_inverse=True, kw_args=None, inv_kw_args=None): self.func = func diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index 43b6ac642284c..88fad3670cb01 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -21,6 +21,7 @@ from ..utils.validation import check_array from ..utils.validation import check_is_fitted from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target @@ -396,7 +397,8 @@ class LabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. 
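With ``classes`` now keyword-only in ``label_binarize`` (matching the updated call sites in calibration and metrics earlier in this patch), a call looks like this hedged sketch::

    from sklearn.preprocessing import label_binarize

    label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    # -> array([[1], [0], [0], [1]]): one column, encoding the positive class 'yes'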
""" - def __init__(self, neg_label=0, pos_label=1, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): if neg_label >= pos_label: raise ValueError("neg_label={0} must be strictly less than " "pos_label={1}.".format(neg_label, pos_label)) @@ -483,7 +485,7 @@ def transform(self, y): raise ValueError("The object was not fitted with multilabel" " input.") - return label_binarize(y, self.classes_, + return label_binarize(y, classes=self.classes_, pos_label=self.pos_label, neg_label=self.neg_label, sparse_output=self.sparse_output) @@ -541,7 +543,9 @@ def _more_tags(self): return {'X_types': ['1dlabels']} -def label_binarize(y, classes, neg_label=0, pos_label=1, sparse_output=False): +@_deprecate_positional_args +def label_binarize(y, *, classes, neg_label=0, pos_label=1, + sparse_output=False): """Binarize labels in a one-vs-all fashion Several regression and binary classification algorithms are @@ -851,7 +855,8 @@ class MultiLabelBinarizer(TransformerMixin, BaseEstimator): using a one-hot aka one-of-K scheme. """ - def __init__(self, classes=None, sparse_output=False): + @_deprecate_positional_args + def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 7999df083631c..f79703610bee5 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2295,7 +2295,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(X_with_negatives) with pytest.raises(ValueError, match=not_positive_message): - power_transform(X_with_negatives, 'box-cox') + power_transform(X_with_negatives, method='box-cox') with pytest.raises(ValueError, match=not_positive_message): pt.transform(np.zeros(X_2d.shape)) @@ -2304,7 +2304,7 @@ def test_power_transformer_boxcox_strictly_positive_exception(): pt.fit(np.zeros(X_2d.shape)) with pytest.raises(ValueError, match=not_positive_message): - power_transform(np.zeros(X_2d.shape), 'box-cox') + power_transform(np.zeros(X_2d.shape), method='box-cox') @pytest.mark.parametrize('X', [X_2d, np.abs(X_2d), -np.abs(X_2d), @@ -2432,7 +2432,7 @@ def test_power_transformer_fit_transform(method, standardize): if method == 'box-cox': X = np.abs(X) - pt = PowerTransformer(method, standardize) + pt = PowerTransformer(method, standardize=standardize) assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X)) @@ -2449,7 +2449,7 @@ def test_power_transformer_copy_True(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=True) + pt = PowerTransformer(method, standardize=standardize, copy=True) pt.fit(X) assert_array_almost_equal(X, X_original) @@ -2477,7 +2477,7 @@ def test_power_transformer_copy_False(method, standardize): assert X is not X_original # sanity checks assert_array_almost_equal(X, X_original) - pt = PowerTransformer(method, standardize, copy=False) + pt = PowerTransformer(method, standardize=standardize, copy=False) pt.fit(X) assert_array_almost_equal(X, X_original) # fit didn't change X diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 887fa90c98d61..505c57cb5f1c1 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -178,7 +178,7 @@ def test_label_binarizer_errors(): with 
pytest.raises(ValueError): LabelBinarizer().fit(np.array([[1, 3], [2, 1]])) with pytest.raises(ValueError): - label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3]) + label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3]) @pytest.mark.parametrize( @@ -509,13 +509,13 @@ def check_binarized_results(y, classes, pos_label, neg_label, expected): for sparse_output in [True, False]: if ((pos_label == 0 or neg_label != 0) and sparse_output): with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=neg_label, + label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) continue # check label_binarize - binarized = label_binarize(y, classes, neg_label=neg_label, + binarized = label_binarize(y, classes=classes, neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output) assert_array_equal(toarray(binarized), expected) @@ -576,7 +576,7 @@ def test_label_binarize_multiclass(): check_binarized_results(y, classes, pos_label, neg_label, expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) @@ -595,7 +595,7 @@ def test_label_binarize_multilabel(): expected) with pytest.raises(ValueError): - label_binarize(y, classes, neg_label=-1, pos_label=pos_label, + label_binarize(y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True) From 923b13ceda1d1d26a1013d0e326734a1dc58bd46 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Thu, 23 Apr 2020 13:11:45 -0400 Subject: [PATCH 047/125] API Adds defaults to Display Objects (#16933) --- doc/conf.py | 1 + doc/visualizations.rst | 1 + examples/plot_display_object_visualization.py | 92 +++++++++++++++++++ sklearn/metrics/_plot/confusion_matrix.py | 16 +++- .../metrics/_plot/precision_recall_curve.py | 29 +++--- sklearn/metrics/_plot/roc_curve.py | 25 +++-- .../_plot/tests/test_plot_confusion_matrix.py | 15 +++ .../_plot/tests/test_plot_precision_recall.py | 20 ++++ .../_plot/tests/test_plot_roc_curve.py | 18 ++++ 9 files changed, 192 insertions(+), 25 deletions(-) create mode 100644 examples/plot_display_object_visualization.py diff --git a/doc/conf.py b/doc/conf.py index c3ab17d3e73af..a13ed14216de4 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -314,6 +314,7 @@ def __call__(self, directory): }, # avoid generating too many cross links 'inspect_global_variables': False, + 'remove_config_comments': True, } diff --git a/doc/visualizations.rst b/doc/visualizations.rst index 47d826602b62f..e50a9a90a0b84 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -60,6 +60,7 @@ values of the curves. * :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` * :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_plot_display_object_visualization.py` Available Plotting Utilities ============================ diff --git a/examples/plot_display_object_visualization.py b/examples/plot_display_object_visualization.py new file mode 100644 index 0000000000000..32ea3ef2d1120 --- /dev/null +++ b/examples/plot_display_object_visualization.py @@ -0,0 +1,92 @@ +""" +=================================== +Visualizations with Display Objects +=================================== + +.. 
currentmodule:: sklearn.metrics + +In this example, we will construct display objects, +:class:`ConfusionMatrixDisplay`, :class:`RocCurveDisplay`, and +:class:`PrecisionRecallDisplay` directly from their respective metrics. This +is an alternative to using their corresponding plot functions when +a model's predictions are already computed or expensive to compute. Note that +this is advanced usage, and in general we recommend using their respective +plot functions. +""" +print(__doc__) + +############################################################################## +# Load Data and train model +# ------------------------- +# For this example, we load a blood transfusion service center data set from +# `OpenML `. This is a binary classification +# problem where the target is whether an individual donated blood. Then the +# data is split into a train and test dataset and a logistic regression is +# fitted wtih the train dataset. +from sklearn.datasets import fetch_openml +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split + +X, y = fetch_openml(data_id=1464, return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y) + +clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0)) +clf.fit(X_train, y_train) + +############################################################################## +# Create :class:`ConfusionMatrixDisplay` +############################################################################## +# With the fitted model, we compute the predictions of the model on the test +# dataset. These predictions are used to compute the confustion matrix which +# is plotted with the :class:`ConfusionMatrixDisplay` +from sklearn.metrics import confusion_matrix +from sklearn.metrics import ConfusionMatrixDisplay + +y_pred = clf.predict(X_test) +cm = confusion_matrix(y_test, y_pred) + +cm_display = ConfusionMatrixDisplay(cm).plot() + + +############################################################################## +# Create :class:`RocCurveDisplay` +############################################################################## +# The roc curve requires either the probabilities or the non-thresholded +# decision values from the estimator. Since the logistic regression provides +# a decision function, we will use it to plot the roc curve: +from sklearn.metrics import roc_curve +from sklearn.metrics import RocCurveDisplay +y_score = clf.decision_function(X_test) + +fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1]) +roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot() + +############################################################################## +# Create :class:`PrecisionRecallDisplay` +############################################################################## +# Similarly, the precision recall curve can be plotted using `y_score` from +# the prevision sections. 
+from sklearn.metrics import precision_recall_curve +from sklearn.metrics import PrecisionRecallDisplay + +prec, recall, _ = precision_recall_curve(y_test, y_score, + pos_label=clf.classes_[1]) +pr_display = PrecisionRecallDisplay(precision=prec, recall=recall).plot() + +############################################################################## +# Combining the display objects into a single plot +############################################################################## +# The display objects store the computed values that were passed as arguments. +# This allows for the visualizations to be easliy combined using matplotlib's +# API. In the following example, we place the displays next to each other in a +# row. + +# sphinx_gallery_thumbnail_number = 4 +import matplotlib.pyplot as plt +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8)) + +roc_display.plot(ax=ax1) +pr_display.plot(ax=ax2) +plt.show() diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 8916b523fc273..861a2558a3cef 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -21,8 +21,9 @@ class ConfusionMatrixDisplay: confusion_matrix : ndarray of shape (n_classes, n_classes) Confusion matrix. - display_labels : ndarray of shape (n_classes,) - Display labels for plot. + display_labels : ndarray of shape (n_classes,), default=None + Display labels for plot. If None, display labels are set from 0 to + `n_classes - 1`. Attributes ---------- @@ -39,7 +40,7 @@ class ConfusionMatrixDisplay: figure_ : matplotlib Figure Figure containing the confusion matrix. """ - def __init__(self, confusion_matrix, display_labels): + def __init__(self, confusion_matrix, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels @@ -108,11 +109,16 @@ def plot(self, include_values=True, cmap='viridis', ha="center", va="center", color=color) + if self.display_labels is None: + display_labels = np.arange(n_classes) + else: + display_labels = self.display_labels + fig.colorbar(self.im_, ax=ax) ax.set(xticks=np.arange(n_classes), yticks=np.arange(n_classes), - xticklabels=self.display_labels, - yticklabels=self.display_labels, + xticklabels=display_labels, + yticklabels=display_labels, ylabel="True label", xlabel="Predicted label") diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index bfec9276f83be..10dd14e938984 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -23,11 +23,11 @@ class PrecisionRecallDisplay: recall : ndarray Recall values. - average_precision : float - Average precision. + average_precision : float, default=None + Average precision. If None, the average precision is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, then the estimator name is not shown. Attributes ---------- @@ -41,7 +41,8 @@ class PrecisionRecallDisplay: Figure containing the curve. 
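Because ``average_precision`` and ``estimator_name`` become optional here (and ``roc_auc``/``estimator_name`` below for ``RocCurveDisplay``), a display can be built from the bare curve values; a hedged sketch that assumes matplotlib is available::

    import numpy as np
    from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay

    precision = np.array([1.0, 0.6, 0.3])
    recall = np.array([0.0, 0.5, 1.0])
    PrecisionRecallDisplay(precision=precision, recall=recall).plot()

    fpr = tpr = np.array([0.0, 0.5, 1.0])
    RocCurveDisplay(fpr=fpr, tpr=tpr).plot()  # with no AUC and no name, the legend is simply omitted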
""" - def __init__(self, precision, recall, average_precision, estimator_name): + def __init__(self, precision, recall, + average_precision=None, estimator_name=None): self.precision = precision self.recall = recall self.average_precision = average_precision @@ -78,16 +79,22 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - "label": "{} (AP = {:0.2f})".format(name, - self.average_precision), - "drawstyle": "steps-post" - } + line_kwargs = {"drawstyle": "steps-post"} + if self.average_precision is not None and name is not None: + line_kwargs["label"] = (f"{name} (AP = " + f"{self.average_precision:0.2f})") + elif self.average_precision is not None: + line_kwargs["label"] = (f"AP = " + f"{self.average_precision:0.2f}") + elif name is not None: + line_kwargs["label"] = name line_kwargs.update(**kwargs) self.line_, = ax.plot(self.recall, self.precision, **line_kwargs) ax.set(xlabel="Recall", ylabel="Precision") - ax.legend(loc='lower left') + + if "label" in line_kwargs: + ax.legend(loc='lower left') self.ax_ = ax self.figure_ = ax.figure diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index d786ac6659d41..0881646e8a1af 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -22,11 +22,11 @@ class RocCurveDisplay: tpr : ndarray True positive rate. - roc_auc : float - Area under ROC curve. + roc_auc : float, default=None + Area under ROC curve. If None, the roc_auc score is not shown. - estimator_name : str - Name of estimator. + estimator_name : str, default=None + Name of estimator. If None, the estimator name is not shown. Attributes ---------- @@ -54,7 +54,7 @@ class RocCurveDisplay: >>> plt.show() # doctest: +SKIP """ - def __init__(self, fpr, tpr, roc_auc, estimator_name): + def __init__(self, fpr, tpr, roc_auc=None, estimator_name=None): self.fpr = fpr self.tpr = tpr self.roc_auc = roc_auc @@ -88,15 +88,22 @@ def plot(self, ax=None, name=None, **kwargs): name = self.estimator_name if name is None else name - line_kwargs = { - 'label': "{} (AUC = {:0.2f})".format(name, self.roc_auc) - } + line_kwargs = {} + if self.roc_auc is not None and name is not None: + line_kwargs["label"] = f"{name} (AUC = {self.roc_auc:0.2f})" + elif self.roc_auc is not None: + line_kwargs["label"] = f"AUC = {self.roc_auc:0.2f}" + elif name is not None: + line_kwargs["label"] = name + line_kwargs.update(**kwargs) self.line_ = ax.plot(self.fpr, self.tpr, **line_kwargs)[0] ax.set_xlabel("False Positive Rate") ax.set_ylabel("True Positive Rate") - ax.legend(loc='lower right') + + if "label" in line_kwargs: + ax.legend(loc='lower right') self.ax_ = ax self.figure_ = ax.figure diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index b8a24ae15f1e5..6a0b880ebabb1 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -280,3 +280,18 @@ def test_confusion_matrix_standard_format(pyplot): # Values are have two dec places max, (e.g 100 becomes 1e+02) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['0.1', '10', '1e+02', '0.53'] + + +@pytest.mark.parametrize("display_labels, expected_labels", [ + (None, ["0", "1"]), + (["cat", "dog"], ["cat", "dog"]), +]) +def test_default_labels(pyplot, display_labels, expected_labels): + cm = np.array([[10, 0], [12, 120]]) + disp = ConfusionMatrixDisplay(cm, 
display_labels=display_labels).plot() + + x_ticks = [tick.get_text() for tick in disp.ax_.get_xticklabels()] + y_ticks = [tick.get_text() for tick in disp.ax_.get_yticklabels()] + + assert_array_equal(x_ticks, expected_labels) + assert_array_equal(y_ticks, expected_labels) diff --git a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py index f22b112e96dc7..48305a93d0b3f 100644 --- a/sklearn/metrics/_plot/tests/test_plot_precision_recall.py +++ b/sklearn/metrics/_plot/tests/test_plot_precision_recall.py @@ -4,6 +4,7 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics import plot_precision_recall_curve +from sklearn.metrics import PrecisionRecallDisplay from sklearn.metrics import average_precision_score from sklearn.metrics import precision_recall_curve from sklearn.datasets import make_classification @@ -170,3 +171,22 @@ def test_plot_precision_recall_curve_estimator_name_multiple_calls(pyplot): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "average_precision, estimator_name, expected_label", + [ + (0.9, None, "AP = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AP = 0.80)"), + ] +) +def test_default_labels(pyplot, average_precision, estimator_name, + expected_label): + prec = np.array([1, 0.5, 0]) + recall = np.array([0, 0.5, 1]) + disp = PrecisionRecallDisplay(prec, recall, + average_precision=average_precision, + estimator_name=estimator_name) + disp.plot() + assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 699387ff4cfa3..1aa34bdca7279 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py +++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -4,6 +4,7 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import plot_roc_curve +from sklearn.metrics import RocCurveDisplay from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_curve, auc @@ -150,3 +151,20 @@ def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary): clf_name = "another_name" disp.plot(name=clf_name) assert clf_name in disp.line_.get_label() + + +@pytest.mark.parametrize( + "roc_auc, estimator_name, expected_label", + [ + (0.9, None, "AUC = 0.90"), + (None, "my_est", "my_est"), + (0.8, "my_est2", "my_est2 (AUC = 0.80)") + ] +) +def test_default_labels(pyplot, roc_auc, estimator_name, + expected_label): + fpr = np.array([0, 0.5, 1]) + tpr = np.array([0, 0.5, 1]) + disp = RocCurveDisplay(fpr, tpr, roc_auc=roc_auc, + estimator_name=estimator_name).plot() + assert disp.line_.get_label() == expected_label From 7844d1c2d78ed746f96e3acd38cf47e0d037b58d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 19:19:52 +0200 Subject: [PATCH 048/125] TST Replace boston in histgradboost test_predictor (#16918) --- .../tests/test_predictor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py index 7df1e616445fc..3c837844f29e3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_predictor.py @@ -1,5 +1,5 @@ import numpy as np -from sklearn.datasets import 
load_boston +from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.metrics import r2_score import pytest @@ -12,8 +12,9 @@ @pytest.mark.parametrize('n_bins', [200, 256]) -def test_boston_dataset(n_bins): - X, y = load_boston(return_X_y=True) +def test_regression_dataset(n_bins): + X, y = make_regression(n_samples=500, n_features=10, n_informative=5, + random_state=42) X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42) @@ -24,8 +25,8 @@ def test_boston_dataset(n_bins): gradients = -y_train.astype(G_H_DTYPE) hessians = np.ones(1, dtype=G_H_DTYPE) - min_samples_leaf = 8 - max_leaf_nodes = 31 + min_samples_leaf = 10 + max_leaf_nodes = 30 grower = TreeGrower(X_train_binned, gradients, hessians, min_samples_leaf=min_samples_leaf, max_leaf_nodes=max_leaf_nodes, n_bins=n_bins, @@ -34,8 +35,8 @@ def test_boston_dataset(n_bins): predictor = grower.make_predictor(bin_thresholds=mapper.bin_thresholds_) - assert r2_score(y_train, predictor.predict(X_train)) > 0.85 - assert r2_score(y_test, predictor.predict(X_test)) > 0.70 + assert r2_score(y_train, predictor.predict(X_train)) > 0.82 + assert r2_score(y_test, predictor.predict(X_test)) > 0.67 @pytest.mark.parametrize('threshold, expected_predictions', [ From 88ba943684477aec39fb7eeba0b0d5dfa51cb2c8 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 24 Apr 2020 03:29:15 +1000 Subject: [PATCH 049/125] MNT Remove redundant doctest ELLIPSIS annotations (#16992) --- sklearn/metrics/_ranking.py | 20 ++++++++++---------- sklearn/svm/_classes.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index ad867efb8bfa3..a9e45310f330d 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -1239,22 +1239,22 @@ def dcg_score(y_true, y_score, k=None, >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict scores for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> dcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores) 9.49... >>> # we can set k to truncate the sum; only top k answers contribute - >>> dcg_score(true_relevance, scores, k=2) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=2) 5.63... >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average true >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5 - >>> dcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> dcg_score(true_relevance, scores, k=1) 7.5 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> dcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 5.0 """ @@ -1387,29 +1387,29 @@ def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]]) >>> # we predict some scores (relevance) for the answers >>> scores = np.asarray([[.1, .2, .3, 4, 70]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.69... >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]]) - >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores) 0.49... >>> # we can set k to truncate the sum; only top k answers contribute. 
- >>> ndcg_score(true_relevance, scores, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=4) 0.35... >>> # the normalization takes k into account so a perfect answer >>> # would still get 1.0 - >>> ndcg_score(true_relevance, true_relevance, k=4) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, true_relevance, k=4) 1.0 >>> # now we have some ties in our prediction >>> scores = np.asarray([[1, 0, 0, 0, 1]]) >>> # by default ties are averaged, so here we get the average (normalized) >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75 - >>> ndcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS + >>> ndcg_score(true_relevance, scores, k=1) 0.75 >>> # we can choose to ignore ties for faster results, but only >>> # if we know there aren't ties in our scores, otherwise we get >>> # wrong results: >>> ndcg_score(true_relevance, - ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS + ... scores, k=1, ignore_ties=True) 0.5 """ diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 77110da119a02..f8b30e070711e 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1251,7 +1251,7 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): >>> clf = OneClassSVM(gamma='auto').fit(X) >>> clf.predict(X) array([-1, 1, 1, 1, -1]) - >>> clf.score_samples(X) # doctest: +ELLIPSIS + >>> clf.score_samples(X) array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) """ From e392bfd6933ad1202bb269b404d18c74da01b19e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 23 Apr 2020 20:17:00 +0200 Subject: [PATCH 050/125] MNT Add pre-comit configuration (#16957) Co-Authored-By: Nicolas Hug --- .pre-commit-config.yaml | 22 ++++++++++++++++++++++ doc/developers/contributing.rst | 25 +++++++++++++++++-------- 2 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..aa8df3c3cbc87 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.8 + hooks: + - id: flake8 + types: [file, python] + # only check for unused imports for now, as long as + # the code is not fully PEP8 compatible + args: [--select=F401] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + args: + - --ignore-missing-imports + files: sklearn/ diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 99c59ec3392c6..33ab3fcecb887 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -248,19 +248,28 @@ modifying code and submitting a PR: and start making changes. Always use a feature branch. It's good practice to never work on the ``master`` branch! -9. Develop the feature on your feature branch on your computer, using Git to - do the version control. When you're done editing, add changed files using - ``git add`` and then ``git commit``:: +9. (**Optional**) Install `pre-commit `_ to + run code style checks before each commit:: - $ git add modified_files - $ git commit + $ pip install pre-commit + $ pre-commit install - to record your changes in Git, then push the changes to your GitHub - account with:: + pre-commit checks can be disabled for a particular commit with + `git commit -n`. + +10. 
Develop the feature on your feature branch on your computer, using Git to + do the version control. When you're done editing, add changed files using + ``git add`` and then ``git commit``:: + + $ git add modified_files + $ git commit + + to record your changes in Git, then push the changes to your GitHub + account with:: $ git push -u origin my_feature -10. Follow `these +11. Follow `these `_ instructions to create a pull request from your fork. This will send an email to the committers. You may want to consider sending an email to the From 94d8911310b7ec9cb6be2752d42b0cbd4c003c93 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Thu, 23 Apr 2020 21:06:03 +0200 Subject: [PATCH 051/125] DOC Fix typo in ensemble.rst (#16999) --- doc/modules/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index e731ece0bdb20..bff08f542ce11 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -1437,7 +1437,7 @@ any other regressor or classifier, exposing a `predict`, `predict_proba`, and >>> print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred))) R2 score: 0.81 -Note that it is also possible to get the output of the stacked outputs of the +Note that it is also possible to get the output of the stacked `estimators` using the `transform` method:: >>> reg.transform(X_test[:5]) From facd1177bd7bae219cb34d1c39a4cafc6e6c1f3d Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Fri, 24 Apr 2020 04:40:29 +0200 Subject: [PATCH 052/125] TST Replace boston dataset in test_permutation_importance.py (#17020) --- sklearn/inspection/tests/test_permutation_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/inspection/tests/test_permutation_importance.py b/sklearn/inspection/tests/test_permutation_importance.py index c13638b2fc0c7..2b381e9a20b1a 100644 --- a/sklearn/inspection/tests/test_permutation_importance.py +++ b/sklearn/inspection/tests/test_permutation_importance.py @@ -4,7 +4,7 @@ from numpy.testing import assert_allclose from sklearn.compose import ColumnTransformer -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.datasets import load_iris from sklearn.datasets import make_classification from sklearn.datasets import make_regression @@ -33,7 +33,7 @@ def test_permutation_importance_correlated_feature_regression(n_jobs): rng = np.random.RandomState(42) n_repeats = 5 - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) y_with_little_noise = ( y + rng.normal(scale=0.001, size=y.shape[0])).reshape(-1, 1) From 0d04de2c76cb3f63acd9e1927c19a0c2d6da0266 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 05:44:59 -0400 Subject: [PATCH 053/125] API kwonly for naive_bayes (#17003) * wkonly * pep8 --- sklearn/naive_bayes.py | 16 +++++++++++----- sklearn/tests/test_naive_bayes.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6ef3895ffdb60..247d9eea763c6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,6 +32,7 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight +from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB'] @@ -177,7 +178,8 @@ class labels known to the classifier [1] """ - 
def __init__(self, priors=None, var_smoothing=1e-9): + @_deprecate_positional_args + def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing @@ -745,7 +747,8 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior @@ -847,7 +850,8 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha self.fit_prior = fit_prior @@ -961,7 +965,8 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha self.binarize = binarize @@ -1072,7 +1077,8 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, fit_prior=True, class_prior=None): self.alpha = alpha self.fit_prior = fit_prior self.class_prior = class_prior diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 1f0f9347a188c..1106684998f75 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -122,7 +122,7 @@ def test_gnb_priors_sum_isclose(): priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, 0.11, 0.0]) Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - clf = GaussianNB(priors) + clf = GaussianNB(priors=priors) # smoke test for issue #9633 clf.fit(X, Y) From 361bf75544f7a0315f4388dffcc3279cdde50bec Mon Sep 17 00:00:00 2001 From: Stephen Marsh Date: Fri, 24 Apr 2020 09:25:39 -0400 Subject: [PATCH 054/125] DOC Fix link to user guide (#16989) --- sklearn/model_selection/_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 180c48fc99762..ef8890556ed1d 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -1384,7 +1384,7 @@ def validation_curve(estimator, X, y, param_name, param_range, groups=None, will also compute training scores and is merely a utility for plotting the results. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- From 2587a033808506cd1e46b0cba18fae5ae829f4ab Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 10:28:56 -0400 Subject: [PATCH 055/125] [MRG] DOC fix ref for ParameterSampler (#16983) --- sklearn/model_selection/_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index d283dc2f0b483..80cb269d4a0c7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -189,7 +189,7 @@ class ParameterSampler: It is highly recommended to use continuous distributions for continuous parameters. - Read more in the :ref:`User Guide `. 
+ Read more in the :ref:`User Guide `. Parameters ---------- From 89993d2f57f668d0fba7995e764ff656710cb67f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 24 Apr 2020 18:11:37 +0200 Subject: [PATCH 056/125] Update the URL of valgrind-python.supp in the doc (#17029) --- doc/developers/tips.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/developers/tips.rst b/doc/developers/tips.rst index b26d68ecfbe02..4c11c24684352 100644 --- a/doc/developers/tips.rst +++ b/doc/developers/tips.rst @@ -248,8 +248,8 @@ code. Follow these steps: $> valgrind -v --suppressions=valgrind-python.supp python my_test_script.py .. _valgrind: http://valgrind.org -.. _`README.valgrind`: https://svn.python.org/projects/python/trunk/Misc/README.valgrind -.. _`valgrind-python.supp`: https://svn.python.org/projects/python/trunk/Misc/valgrind-python.supp +.. _`README.valgrind`: https://github.com/python/cpython/blob/master/Misc/README.valgrind +.. _`valgrind-python.supp`: https://github.com/python/cpython/blob/master/Misc/valgrind-python.supp The result will be a list of all the memory-related errors, which reference From 501ee56192154f3fee09f173faece092ae94dd46 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Fri, 24 Apr 2020 18:50:19 +0200 Subject: [PATCH 057/125] TST Replace Boston dataset in test_impute (#17025) --- sklearn/impute/tests/test_impute.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 58c71660b401d..960f671915e6a 100644 --- a/sklearn/impute/tests/test_impute.py +++ b/sklearn/impute/tests/test_impute.py @@ -16,7 +16,7 @@ # make IterativeImputer available from sklearn.experimental import enable_iterative_imputer # noqa -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.impute import MissingIndicator from sklearn.impute import SimpleImputer, IterativeImputer from sklearn.dummy import DummyRegressor @@ -947,7 +947,7 @@ def test_iterative_imputer_early_stopping(): def test_iterative_imputer_catch_warning(): # check that we catch a RuntimeWarning due to a division by zero when a # feature is constant in the dataset - X, y = load_boston(return_X_y=True) + X, y = load_diabetes(return_X_y=True) n_samples, n_features = X.shape # simulate that a feature only contain one category during fit From 2955d9f0af7785ed353d6ad23740f62852fcf988 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Apr 2020 14:22:09 -0400 Subject: [PATCH 058/125] API kwonly for neighbors module (#17004) --- sklearn/manifold/_locally_linear.py | 4 ++-- sklearn/manifold/_t_sne.py | 4 ++-- sklearn/neighbors/_classification.py | 7 ++++-- sklearn/neighbors/_graph.py | 23 +++++++++++-------- sklearn/neighbors/_kde.py | 4 +++- sklearn/neighbors/_lof.py | 4 +++- sklearn/neighbors/_nca.py | 4 +++- sklearn/neighbors/_nearest_centroid.py | 4 +++- sklearn/neighbors/_regression.py | 7 ++++-- sklearn/neighbors/_unsupervised.py | 6 +++-- sklearn/neighbors/tests/test_kde.py | 4 ++-- sklearn/neighbors/tests/test_neighbors.py | 8 +++---- sklearn/semi_supervised/_label_propagation.py | 2 +- 13 files changed, 51 insertions(+), 30 deletions(-) diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 7b46d51df718d..c2d1ffbae9361 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -97,7 +97,7 @@ def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None): sklearn.neighbors.kneighbors_graph 
sklearn.neighbors.radius_neighbors_graph """ - knn = NearestNeighbors(n_neighbors + 1, n_jobs=n_jobs).fit(X) + knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X) X = knn._fit_X n_samples = knn.n_samples_fit_ ind = knn.kneighbors(X, return_distance=False)[:, 1:] @@ -647,7 +647,7 @@ def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, self.n_jobs = n_jobs def _fit_transform(self, X): - self.nbrs_ = NearestNeighbors(self.n_neighbors, + self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, n_jobs=self.n_jobs) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 53558f6051283..136a32cd86e73 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -450,8 +450,8 @@ def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): np.fill_diagonal(dist_X, np.inf) ind_X = np.argsort(dist_X, axis=1) # `ind_X[i]` is the index of sorted distances between i and other samples - ind_X_embedded = NearestNeighbors(n_neighbors).fit(X_embedded).kneighbors( - return_distance=False) + ind_X_embedded = NearestNeighbors(n_neighbors=n_neighbors).fit( + X_embedded).kneighbors(return_distance=False) # We build an inverted index of neighbors in the input space: For sample i, # we define `inverted_index[i]` as the inverted index of sorted distances: diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index e223476d3107b..331eb7821a511 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -20,6 +20,7 @@ RadiusNeighborsMixin, SupervisedIntegerMixin from ..base import ClassifierMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, @@ -142,7 +143,8 @@ class KNeighborsClassifier(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -374,7 +376,8 @@ class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', outlier_label=None, metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index d217999196950..6bf8da3f4ef5e 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -9,7 +9,7 @@ from ._base import UnsupervisedMixin from ._unsupervised import NearestNeighbors from ..base import TransformerMixin -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, _deprecate_positional_args def _check_params(X, metric, p, metric_params): @@ -37,8 +37,10 @@ def _query_include_self(X, include_self, mode): return X -def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, n_jobs=None): +@_deprecate_positional_args +def kneighbors_graph(X, n_neighbors, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of k-Neighbors for points in X Read more in 
the :ref:`User Guide `. @@ -103,7 +105,7 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', radius_neighbors_graph """ if not isinstance(X, KNeighborsMixin): - X = NearestNeighbors(n_neighbors, metric=metric, p=p, + X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p, metric_params=metric_params, n_jobs=n_jobs).fit(X) else: _check_params(X, metric, p, metric_params) @@ -112,9 +114,10 @@ def kneighbors_graph(X, n_neighbors, mode='connectivity', metric='minkowski', return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode) -def radius_neighbors_graph(X, radius, mode='connectivity', metric='minkowski', - p=2, metric_params=None, include_self=False, - n_jobs=None): +@_deprecate_positional_args +def radius_neighbors_graph(X, radius, *, mode='connectivity', + metric='minkowski', p=2, metric_params=None, + include_self=False, n_jobs=None): """Computes the (weighted) graph of Neighbors for points in X Neighborhoods are restricted the points at a distance lower than @@ -281,7 +284,8 @@ class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin, ... KNeighborsTransformer(n_neighbors=5, mode='distance'), ... Isomap(neighbors_algorithm='precomputed')) """ - def __init__(self, mode='distance', n_neighbors=5, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(KNeighborsTransformer, self).__init__( @@ -422,7 +426,8 @@ class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin, ... RadiusNeighborsTransformer(radius=42.0, mode='distance'), ... DBSCAN(min_samples=30, metric='precomputed')) """ - def __init__(self, mode='distance', radius=1., algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, mode='distance', radius=1., algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=1): super(RadiusNeighborsTransformer, self).__init__( diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 91a97e2810baa..684e07947cddd 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator from ..utils import check_array, check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.extmath import row_norms from ._ball_tree import BallTree, DTYPE @@ -89,7 +90,8 @@ class KernelDensity(BaseEstimator): >>> log_density array([-1.52955942, -1.51462041, -1.60244657]) """ - def __init__(self, bandwidth=1.0, algorithm='auto', + @_deprecate_positional_args + def __init__(self, *, bandwidth=1.0, algorithm='auto', kernel='gaussian', metric="euclidean", atol=0, rtol=0, breadth_first=True, leaf_size=40, metric_params=None): self.algorithm = algorithm diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index f3b141bf499e5..dfdb89237f516 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -11,6 +11,7 @@ from ..base import OutlierMixin from ..utils.validation import check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils import check_array __all__ = ["LocalOutlierFactor"] @@ -163,7 +164,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, .. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May). LOF: identifying density-based local outliers. In ACM sigmod record. 
""" - def __init__(self, n_neighbors=20, algorithm='auto', leaf_size=30, + @_deprecate_positional_args + def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, contamination="auto", novelty=False, n_jobs=None): super().__init__( diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 1017f5cf12606..9705c9050f6c7 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -23,6 +23,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.random import check_random_state from ..utils.validation import check_is_fitted, check_array, check_scalar +from ..utils.validation import _deprecate_positional_args from ..exceptions import ConvergenceWarning @@ -161,7 +162,8 @@ class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator): """ - def __init__(self, n_components=None, init='auto', warm_start=False, + @_deprecate_positional_args + def __init__(self, n_components=None, *, init='auto', warm_start=False, max_iter=50, tol=1e-5, callback=None, verbose=0, random_state=None): self.n_components = n_components diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index bf00d8b8f88d2..62f74940100e7 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import pairwise_distances from ..preprocessing import LabelEncoder from ..utils.validation import check_array, check_is_fitted +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import csc_median_axis_0 from ..utils.multiclass import check_classification_targets @@ -85,7 +86,8 @@ class NearestCentroid(ClassifierMixin, BaseEstimator): """ - def __init__(self, metric='euclidean', shrink_threshold=None): + @_deprecate_positional_args + def __init__(self, metric='euclidean', *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index cce218062a3d5..845aacbfd4248 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import RadiusNeighborsMixin, SupervisedFloatMixin from ..base import RegressorMixin from ..utils import check_array +from ..utils.validation import _deprecate_positional_args class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, @@ -139,7 +140,8 @@ class KNeighborsRegressor(NeighborsBase, KNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, weights='uniform', + @_deprecate_positional_args + def __init__(self, n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): @@ -307,7 +309,8 @@ class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, radius=1.0, weights='uniform', + @_deprecate_positional_args + def __init__(self, radius=1.0, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 923a465b1d31b..7e120d7587b66 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -3,6 +3,7 @@ from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin from ._base 
import UnsupervisedMixin +from ..utils.validation import _deprecate_positional_args class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, @@ -78,7 +79,7 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, >>> from sklearn.neighbors import NearestNeighbors >>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] - >>> neigh = NearestNeighbors(2, 0.4) + >>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4) >>> neigh.fit(samples) NearestNeighbors(...) @@ -105,7 +106,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm """ - def __init__(self, n_neighbors=5, radius=1.0, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None): super().__init__( diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index e17e8e575f728..cff7ffafe5acd 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -74,7 +74,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): for kernel in ['gaussian', 'tophat']: # draw a tophat sample - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) samp = kde.sample(100) assert X.shape == samp.shape @@ -91,7 +91,7 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): # check unsupported kernels for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']: - kde = KernelDensity(bandwidth, kernel=kernel).fit(X) + kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X) assert_raises(NotImplementedError, kde.sample, 100) # non-regression test: used to return a scalar diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 88e32669777a1..d62b998052656 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1245,9 +1245,9 @@ def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) X = np.random.RandomState(42).rand(20, 2) - nbrs1 = neighbors.NearestNeighbors(3, algorithm='auto', + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=custom_metric) - nbrs2 = neighbors.NearestNeighbors(3, algorithm='brute', + nbrs2 = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute', metric=custom_metric) nbrs1.fit(X) @@ -1339,7 +1339,7 @@ def test_non_euclidean_kneighbors(): nbrs_graph = neighbors.kneighbors_graph( X, 3, metric=metric, mode='connectivity', include_self=True).toarray() - nbrs1 = neighbors.NearestNeighbors(3, metric=metric).fit(X) + nbrs1 = neighbors.NearestNeighbors(n_neighbors=3, metric=metric).fit(X) assert_array_equal(nbrs_graph, nbrs1.kneighbors_graph(X).toarray()) # Test radiusneighbors_graph @@ -1351,7 +1351,7 @@ def test_non_euclidean_kneighbors(): assert_array_equal(nbrs_graph, nbrs1.radius_neighbors_graph(X).A) # Raise error when wrong parameters are supplied, - X_nbrs = neighbors.NearestNeighbors(3, metric='manhattan') + X_nbrs = neighbors.NearestNeighbors(n_neighbors=3, metric='manhattan') X_nbrs.fit(X) assert_raises(ValueError, neighbors.kneighbors_graph, X_nbrs, 3, metric='euclidean') diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index ccc6b889f41f6..efa9eb2255ce3 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -131,7 +131,7 @@ def 
_get_kernel(self, X, y=None): return rbf_kernel(X, y, gamma=self.gamma) elif self.kernel == "knn": if self.nn_fit is None: - self.nn_fit = NearestNeighbors(self.n_neighbors, + self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors, n_jobs=self.n_jobs).fit(X) if y is None: return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X, From 02309ffbdaae45af75b0e87946aab9aedb6b3634 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Fri, 24 Apr 2020 23:24:24 +0200 Subject: [PATCH 059/125] API kwonly args in manifold, metrics, mixture, model_selection, multclass, multioutput (#16982) --- sklearn/cluster/_optics.py | 2 +- sklearn/feature_selection/_rfe.py | 2 +- sklearn/manifold/_isomap.py | 5 +- sklearn/manifold/_locally_linear.py | 15 +++-- sklearn/manifold/_mds.py | 11 ++-- sklearn/manifold/_spectral_embedding.py | 8 ++- sklearn/manifold/_t_sne.py | 7 ++- sklearn/manifold/tests/test_isomap.py | 2 +- sklearn/manifold/tests/test_locally_linear.py | 2 +- sklearn/metrics/_classification.py | 60 ++++++++++++------- sklearn/metrics/_plot/confusion_matrix.py | 9 ++- .../metrics/_plot/precision_recall_curve.py | 10 ++-- sklearn/metrics/_plot/roc_curve.py | 10 ++-- .../_plot/tests/test_plot_confusion_matrix.py | 8 ++- .../_plot/tests/test_plot_roc_curve.py | 2 +- sklearn/metrics/_ranking.py | 30 +++++++--- sklearn/metrics/_regression.py | 31 ++++++---- sklearn/metrics/_scorer.py | 7 ++- sklearn/metrics/cluster/_bicluster.py | 4 +- sklearn/metrics/cluster/_supervised.py | 22 ++++--- sklearn/metrics/cluster/_unsupervised.py | 7 ++- sklearn/metrics/pairwise.py | 33 ++++++---- sklearn/metrics/tests/test_classification.py | 7 ++- sklearn/metrics/tests/test_pairwise.py | 2 +- sklearn/metrics/tests/test_score_objects.py | 16 ++--- sklearn/mixture/_bayesian_mixture.py | 5 +- sklearn/mixture/_gaussian_mixture.py | 5 +- sklearn/model_selection/_search.py | 23 ++++--- sklearn/model_selection/_split.py | 55 ++++++++++------- sklearn/model_selection/_validation.py | 21 ++++--- sklearn/model_selection/tests/test_split.py | 4 +- .../model_selection/tests/test_validation.py | 19 +++--- sklearn/multiclass.py | 12 ++-- sklearn/multioutput.py | 21 ++++--- 34 files changed, 302 insertions(+), 175 deletions(-) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index c8ca3ec569a88..7f54a318d3d49 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -529,7 +529,7 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, # in the dict params _params['p'] = p dists = pairwise_distances(P, np.take(X, unproc, axis=0), - metric, n_jobs=None, + metric=metric, n_jobs=None, **_params).ravel() rdists = np.maximum(dists, core_distances_[point_index]) diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 6d9bb8c463df6..7421bc50b7625 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -506,7 +506,7 @@ def fit(self, X, y, groups=None): ) # Initialization - cv = check_cv(self.cv, y, is_classifier(self.estimator)) + cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) scorer = check_scoring(self.estimator, scoring=self.scoring) n_features = X.shape[1] diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index f26db5cc2028d..3229522d21c6e 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -8,6 +8,7 @@ from ..neighbors import NearestNeighbors, kneighbors_graph from ..utils.deprecation import deprecated from ..utils.validation import check_is_fitted +from 
..utils.validation import _deprecate_positional_args from ..utils.graph import graph_shortest_path from ..decomposition import KernelPCA from ..preprocessing import KernelCenterer @@ -122,8 +123,8 @@ class Isomap(TransformerMixin, BaseEstimator): .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric framework for nonlinear dimensionality reduction. Science 290 (5500) """ - - def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', neighbors_algorithm='auto', n_jobs=None, metric='minkowski', p=2, metric_params=None): diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index c2d1ffbae9361..a1db0c43daccb 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -14,6 +14,7 @@ from ..utils.extmath import stable_cumsum from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _deprecate_positional_args from ..neighbors import NearestNeighbors @@ -183,10 +184,11 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', tol=1E-6, max_iter=100, raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver) +@_deprecate_positional_args def locally_linear_embedding( - X, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', tol=1e-6, - max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, - random_state=None, n_jobs=None): + X, *, n_neighbors, n_components, reg=1e-3, eigen_solver='auto', + tol=1e-6, max_iter=100, method='standard', hessian_tol=1E-4, + modified_tol=1E-12, random_state=None, n_jobs=None): """Perform a Locally Linear Embedding analysis on the data. Read more in the :ref:`User Guide `. @@ -628,8 +630,8 @@ class LocallyLinearEmbedding(TransformerMixin, dimensionality reduction via tangent space alignment. Journal of Shanghai Univ. 
8:406 (2004) """ - - def __init__(self, n_neighbors=5, n_components=2, reg=1E-3, + @_deprecate_positional_args + def __init__(self, *, n_neighbors=5, n_components=2, reg=1E-3, eigen_solver='auto', tol=1E-6, max_iter=100, method='standard', hessian_tol=1E-4, modified_tol=1E-12, neighbors_algorithm='auto', random_state=None, n_jobs=None): @@ -656,7 +658,8 @@ def _fit_transform(self, X): self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( - self.nbrs_, self.n_neighbors, self.n_components, + X=self.nbrs_, n_neighbors=self.n_neighbors, + n_components=self.n_components, eigen_solver=self.eigen_solver, tol=self.tol, max_iter=self.max_iter, method=self.method, hessian_tol=self.hessian_tol, modified_tol=self.modified_tol, diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index ca8c08ed69f98..0314007264689 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -14,6 +14,7 @@ from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression +from ..utils.validation import _deprecate_positional_args def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, @@ -129,9 +130,10 @@ def _smacof_single(dissimilarities, metric=True, n_components=2, init=None, return X, stress, it + 1 -def smacof(dissimilarities, metric=True, n_components=2, init=None, n_init=8, - n_jobs=None, max_iter=300, verbose=0, eps=1e-3, random_state=None, - return_n_iter=False): +@_deprecate_positional_args +def smacof(dissimilarities, *, metric=True, n_components=2, init=None, + n_init=8, n_jobs=None, max_iter=300, verbose=0, eps=1e-3, + random_state=None, return_n_iter=False): """Computes multidimensional scaling using the SMACOF algorithm. The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a @@ -357,7 +359,8 @@ class MDS(BaseEstimator): hypothesis" Kruskal, J. Psychometrika, 29, (1964) """ - def __init__(self, n_components=2, metric=True, n_init=4, + @_deprecate_positional_args + def __init__(self, n_components=2, *, metric=True, n_init=4, max_iter=300, verbose=0, eps=1e-3, n_jobs=None, random_state=None, dissimilarity="euclidean"): self.n_components = n_components diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index caac2236e1dd6..0c8bb4902c99a 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -20,6 +20,7 @@ from ..utils.fixes import lobpcg from ..metrics.pairwise import rbf_kernel from ..neighbors import kneighbors_graph, NearestNeighbors +from ..utils.validation import _deprecate_positional_args def _graph_connected_component(graph, node_id): @@ -132,7 +133,8 @@ def _set_diag(laplacian, value, norm_laplacian): return laplacian -def spectral_embedding(adjacency, n_components=8, eigen_solver=None, +@_deprecate_positional_args +def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, random_state=None, eigen_tol=0.0, norm_laplacian=True, drop_first=True): """Project the sample on the first eigenvectors of the graph Laplacian. 
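The manifold hunks above make most constructor and function parameters keyword-only via the
``_deprecate_positional_args`` decorator. A minimal usage sketch, with illustrative random data
that is not taken from the patch, of how the decorated signatures are meant to be called::

    import numpy as np
    from sklearn.manifold import Isomap, locally_linear_embedding

    X = np.random.RandomState(0).rand(20, 3)

    # Parameters after the bare ``*`` are keyword-only; positional use goes
    # through the deprecation decorator rather than failing outright.
    embedding = Isomap(n_neighbors=5, n_components=2).fit_transform(X)

    # The functional form keeps only ``X`` positional.
    Y, reconstruction_error = locally_linear_embedding(
        X, n_neighbors=5, n_components=2)
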
@@ -440,8 +442,8 @@ class SpectralEmbedding(BaseEstimator): Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324 """ - - def __init__(self, n_components=2, affinity="nearest_neighbors", + @_deprecate_positional_args + def __init__(self, n_components=2, *, affinity="nearest_neighbors", gamma=None, random_state=None, eigen_solver=None, n_neighbors=None, n_jobs=None): self.n_components = n_components diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 136a32cd86e73..d94bf777399f5 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -19,6 +19,7 @@ from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative +from ..utils.validation import _deprecate_positional_args from ..decomposition import PCA from ..metrics.pairwise import pairwise_distances from . import _utils @@ -396,7 +397,8 @@ def _gradient_descent(objective, p0, it, n_iter, return p, error, i -def trustworthiness(X, X_embedded, n_neighbors=5, metric='euclidean'): +@_deprecate_positional_args +def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): r"""Expresses to what extent the local structure is retained. The trustworthiness is within [0, 1]. It is defined as @@ -632,7 +634,8 @@ class TSNE(BaseEstimator): # Control the number of iterations between progress checks _N_ITER_CHECK = 50 - def __init__(self, n_components=2, perplexity=30.0, + @_deprecate_positional_args + def __init__(self, n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate=200.0, n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-7, metric="euclidean", init="random", verbose=0, diff --git a/sklearn/manifold/tests/test_isomap.py b/sklearn/manifold/tests/test_isomap.py index 18133719bf85a..9007772674a99 100644 --- a/sklearn/manifold/tests/test_isomap.py +++ b/sklearn/manifold/tests/test_isomap.py @@ -91,7 +91,7 @@ def test_transform(): X, y = datasets.make_s_curve(n_samples, random_state=0) # Compute isomap embedding - iso = manifold.Isomap(n_components, 2) + iso = manifold.Isomap(n_components=n_components, n_neighbors=2) X_iso = iso.fit_transform(X) # Re-embed a noisy version of the points diff --git a/sklearn/manifold/tests/test_locally_linear.py b/sklearn/manifold/tests/test_locally_linear.py index 0968c5052a1b7..952da3ef41163 100644 --- a/sklearn/manifold/tests/test_locally_linear.py +++ b/sklearn/manifold/tests/test_locally_linear.py @@ -131,7 +131,7 @@ def test_singular_matrix(): M = np.ones((10, 3)) f = ignore_warnings with pytest.raises(ValueError): - f(manifold.locally_linear_embedding(M, 2, 1, + f(manifold.locally_linear_embedding(M, n_neighbors=2, n_components=1, method='standard', eigen_solver='arpack')) diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index a916bbe1dd955..90e1935e62f06 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -37,6 +37,7 @@ from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning @@ -121,7 +122,8 @@ def _weighted_sum(sample_score, sample_weight, normalize=False): return sample_score.sum() -def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def 
accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: @@ -193,7 +195,8 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): return _weighted_sum(score, sample_weight, normalize) -def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, +@_deprecate_positional_args +def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None): """Compute confusion matrix to evaluate the accuracy of a classification. @@ -330,7 +333,8 @@ def confusion_matrix(y_true, y_pred, labels=None, sample_weight=None, return cm -def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def multilabel_confusion_matrix(y_true, y_pred, *, sample_weight=None, labels=None, samplewise=False): """Compute a confusion matrix for each class or sample @@ -533,7 +537,9 @@ def multilabel_confusion_matrix(y_true, y_pred, sample_weight=None, return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) -def cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None): +@_deprecate_positional_args +def cohen_kappa_score(y1, y2, *, labels=None, weights=None, + sample_weight=None): r"""Cohen's kappa: a statistic that measures inter-annotator agreement. This function computes Cohen's kappa [1]_, a score that expresses the level @@ -613,7 +619,8 @@ class labels [2]_. return 1 - k -def jaccard_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def jaccard_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None): """Jaccard similarity coefficient score @@ -752,7 +759,8 @@ def jaccard_score(y_true, y_pred, labels=None, pos_label=1, return np.average(jaccard, weights=weights) -def matthews_corrcoef(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): """Compute the Matthews correlation coefficient (MCC) The Matthews correlation coefficient is used in machine learning as a @@ -839,7 +847,8 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None): return mcc -def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): +@_deprecate_positional_args +def zero_one_loss(y_true, y_pred, *, normalize=True, sample_weight=None): """Zero-one classification loss. If normalize is ``True``, return the fraction of misclassifications @@ -909,7 +918,8 @@ def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): return n_samples - score -def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F1 score, also known as balanced F-score or F-measure @@ -1027,13 +1037,14 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', and ``UndefinedMetricWarning`` will be raised. This behavior can be modified with ``zero_division``. 
""" - return fbeta_score(y_true, y_pred, 1, labels=labels, + return fbeta_score(y_true, y_pred, beta=1, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight, zero_division=zero_division) -def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, +@_deprecate_positional_args +def fbeta_score(y_true, y_pred, *, beta, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the F-beta score @@ -1256,7 +1267,8 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label): return labels -def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, +@_deprecate_positional_args +def precision_recall_fscore_support(y_true, y_pred, *, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), @@ -1488,7 +1500,8 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, return precision, recall, f_score, true_sum -def precision_score(y_true, y_pred, labels=None, pos_label=1, +@_deprecate_positional_args +def precision_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the precision @@ -1607,7 +1620,8 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1, return p -def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', +@_deprecate_positional_args +def recall_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division="warn"): """Compute the recall @@ -1724,7 +1738,8 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', return r -def balanced_accuracy_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, adjusted=False): """Compute the balanced accuracy @@ -1801,7 +1816,8 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None, return score -def classification_report(y_true, y_pred, labels=None, target_names=None, +@_deprecate_positional_args +def classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division="warn"): """Build a text report showing the main classification metrics. @@ -1999,7 +2015,8 @@ class 2 1.00 0.67 0.80 3 return report -def hamming_loss(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def hamming_loss(y_true, y_pred, *, sample_weight=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. @@ -2090,7 +2107,8 @@ def hamming_loss(y_true, y_pred, sample_weight=None): raise ValueError("{0} is not supported".format(y_type)) -def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, +@_deprecate_positional_args +def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None): """Log loss, aka logistic loss or cross-entropy loss. 
@@ -2215,7 +2233,8 @@ def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, return _weighted_sum(loss, sample_weight, normalize) -def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): +@_deprecate_positional_args +def hinge_loss(y_true, pred_decision, *, labels=None, sample_weight=None): """Average hinge loss (non-regularized) In binary class case, assuming labels in y_true are encoded with +1 and -1, @@ -2292,7 +2311,7 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): LinearSVC() >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] - >>> hinge_loss(y_true, pred_decision, labels) + >>> hinge_loss(y_true, pred_decision, labels=labels) 0.56... """ check_consistent_length(y_true, pred_decision, sample_weight) @@ -2336,7 +2355,8 @@ def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): return np.average(losses, weights=sample_weight) -def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): +@_deprecate_positional_args +def brier_score_loss(y_true, y_prob, *, sample_weight=None, pos_label=None): """Compute the Brier score. The smaller the Brier score, the better, hence the naming with "loss". diff --git a/sklearn/metrics/_plot/confusion_matrix.py b/sklearn/metrics/_plot/confusion_matrix.py index 861a2558a3cef..c858ac3950f86 100644 --- a/sklearn/metrics/_plot/confusion_matrix.py +++ b/sklearn/metrics/_plot/confusion_matrix.py @@ -4,6 +4,7 @@ from .. import confusion_matrix from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -40,11 +41,12 @@ class ConfusionMatrixDisplay: figure_ : matplotlib Figure Figure containing the confusion matrix. """ - def __init__(self, confusion_matrix, display_labels=None): + def __init__(self, confusion_matrix, *, display_labels=None): self.confusion_matrix = confusion_matrix self.display_labels = display_labels - def plot(self, include_values=True, cmap='viridis', + @_deprecate_positional_args + def plot(self, *, include_values=True, cmap='viridis', xticks_rotation='horizontal', values_format=None, ax=None): """Plot visualization. @@ -130,7 +132,8 @@ def plot(self, include_values=True, cmap='viridis', return self -def plot_confusion_matrix(estimator, X, y_true, labels=None, +@_deprecate_positional_args +def plot_confusion_matrix(estimator, X, y_true, *, labels=None, sample_weight=None, normalize=None, display_labels=None, include_values=True, xticks_rotation='horizontal', diff --git a/sklearn/metrics/_plot/precision_recall_curve.py b/sklearn/metrics/_plot/precision_recall_curve.py index 10dd14e938984..bb2a91c198c41 100644 --- a/sklearn/metrics/_plot/precision_recall_curve.py +++ b/sklearn/metrics/_plot/precision_recall_curve.py @@ -4,6 +4,7 @@ from .. import precision_recall_curve from ...utils import check_matplotlib_support +from ...utils.validation import _deprecate_positional_args from ...base import is_classifier @@ -40,15 +41,15 @@ class PrecisionRecallDisplay: figure_ : matplotlib Figure Figure containing the curve. """ - - def __init__(self, precision, recall, + def __init__(self, precision, recall, *, average_precision=None, estimator_name=None): self.precision = precision self.recall = recall self.average_precision = average_precision self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization. 
Extra keyword arguments will be passed to matplotlib's `plot`. @@ -101,7 +102,8 @@ def plot(self, ax=None, name=None, **kwargs): return self -def plot_precision_recall_curve(estimator, X, y, +@_deprecate_positional_args +def plot_precision_recall_curve(estimator, X, y, *, sample_weight=None, response_method="auto", name=None, ax=None, **kwargs): """Plot Precision Recall Curve for binary classifiers. diff --git a/sklearn/metrics/_plot/roc_curve.py b/sklearn/metrics/_plot/roc_curve.py index 0881646e8a1af..21af0aa388b07 100644 --- a/sklearn/metrics/_plot/roc_curve.py +++ b/sklearn/metrics/_plot/roc_curve.py @@ -4,6 +4,7 @@ from .base import _check_classifer_response_method from ...utils import check_matplotlib_support from ...base import is_classifier +from ...utils.validation import _deprecate_positional_args class RocCurveDisplay: @@ -53,14 +54,14 @@ class RocCurveDisplay: >>> display.plot() # doctest: +SKIP >>> plt.show() # doctest: +SKIP """ - - def __init__(self, fpr, tpr, roc_auc=None, estimator_name=None): + def __init__(self, *, fpr, tpr, roc_auc=None, estimator_name=None): self.fpr = fpr self.tpr = tpr self.roc_auc = roc_auc self.estimator_name = estimator_name - def plot(self, ax=None, name=None, **kwargs): + @_deprecate_positional_args + def plot(self, ax=None, *, name=None, **kwargs): """Plot visualization Extra keyword arguments will be passed to matplotlib's ``plot``. @@ -110,7 +111,8 @@ def plot(self, ax=None, name=None, **kwargs): return self -def plot_roc_curve(estimator, X, y, sample_weight=None, +@_deprecate_positional_args +def plot_roc_curve(estimator, X, y, *, sample_weight=None, drop_intermediate=True, response_method="auto", name=None, ax=None, **kwargs): """Plot Receiver operating characteristic (ROC) curve. diff --git a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py index 6a0b880ebabb1..e65c12904b757 100644 --- a/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py +++ b/sklearn/metrics/_plot/tests/test_plot_confusion_matrix.py @@ -267,15 +267,17 @@ def test_confusion_matrix_text_format(pyplot, data, y_pred, n_classes, def test_confusion_matrix_standard_format(pyplot): cm = np.array([[10000000, 0], [123456, 12345678]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should be shown as whole numbers 'd', # except the first number which should be shown as 1e+07 (longer length) - # and the last number will be showns as 1.2e+07 (longer length) + # and the last number will be shown as 1.2e+07 (longer length) test = [t.get_text() for t in plotted_text.ravel()] assert test == ['1e+07', '0', '123456', '1.2e+07'] cm = np.array([[0.1, 10], [100, 0.525]]) - plotted_text = ConfusionMatrixDisplay(cm, [False, True]).plot().text_ + plotted_text = ConfusionMatrixDisplay( + cm, display_labels=[False, True]).plot().text_ # Values should now formatted as '.2g', since there's a float in # Values are have two dec places max, (e.g 100 becomes 1e+02) test = [t.get_text() for t in plotted_text.ravel()] diff --git a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py index 1aa34bdca7279..50e69ad41af8d 100644 --- a/sklearn/metrics/_plot/tests/test_plot_roc_curve.py +++ b/sklearn/metrics/_plot/tests/test_plot_roc_curve.py @@ -165,6 +165,6 @@ def test_default_labels(pyplot, roc_auc, estimator_name, expected_label): fpr = np.array([0, 0.5, 1]) tpr = 
np.array([0, 0.5, 1]) - disp = RocCurveDisplay(fpr, tpr, roc_auc=roc_auc, + disp = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=estimator_name).plot() assert disp.line_.get_label() == expected_label diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index a9e45310f330d..2ac226bfe0299 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -31,6 +31,7 @@ from ..utils.multiclass import type_of_target from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize from ..preprocessing._label import _encode @@ -101,7 +102,8 @@ def auc(x, y): return area -def average_precision_score(y_true, y_score, average="macro", pos_label=1, +@_deprecate_positional_args +def average_precision_score(y_true, y_score, *, average="macro", pos_label=1, sample_weight=None): """Compute average precision (AP) from prediction scores @@ -243,7 +245,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, +@_deprecate_positional_args +def roc_auc_score(y_true, y_score, *, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -594,7 +597,8 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): return fps, tps, y_score[threshold_idxs] -def precision_recall_curve(y_true, probas_pred, pos_label=None, +@_deprecate_positional_args +def precision_recall_curve(y_true, probas_pred, *, pos_label=None, sample_weight=None): """Compute precision-recall pairs for different probability thresholds @@ -683,7 +687,8 @@ def precision_recall_curve(y_true, probas_pred, pos_label=None, return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] -def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, +@_deprecate_positional_args +def roc_curve(y_true, y_score, *, pos_label=None, sample_weight=None, drop_intermediate=True): """Compute Receiver operating characteristic (ROC) @@ -813,7 +818,9 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, return fpr, tpr, thresholds -def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_average_precision_score(y_true, y_score, *, + sample_weight=None): """Compute ranking-based average precision Label ranking average precision (LRAP) is the average over each ground @@ -899,7 +906,8 @@ def label_ranking_average_precision_score(y_true, y_score, sample_weight=None): return out -def coverage_error(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def coverage_error(y_true, y_score, *, sample_weight=None): """Coverage error measure Compute how far we need to go through the ranked scores to cover all @@ -958,7 +966,8 @@ def coverage_error(y_true, y_score, sample_weight=None): return np.average(coverage, weights=sample_weight) -def label_ranking_loss(y_true, y_score, sample_weight=None): +@_deprecate_positional_args +def label_ranking_loss(y_true, y_score, *, sample_weight=None): """Compute Ranking loss measure Compute the average number of label pairs that are incorrectly ordered @@ -1163,7 +1172,8 @@ def 
_check_dcg_target_type(y_true): supported_fmt, y_type)) -def dcg_score(y_true, y_score, k=None, +@_deprecate_positional_args +def dcg_score(y_true, y_score, *, k=None, log_base=2, sample_weight=None, ignore_ties=False): """Compute Discounted Cumulative Gain. @@ -1320,7 +1330,9 @@ def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False): return gain -def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): +@_deprecate_positional_args +def ndcg_score(y_true, y_score, *, k=None, sample_weight=None, + ignore_ties=False): """Compute Normalized Discounted Cumulative Gain. Sum the true scores ranked in the order induced by the predicted scores, diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6026a5293806a..afbb469072cf5 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -29,6 +29,7 @@ from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d +from ..utils.validation import _deprecate_positional_args from ..exceptions import UndefinedMetricWarning @@ -117,7 +118,8 @@ def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): return y_type, y_true, y_pred, multioutput -def mean_absolute_error(y_true, y_pred, +@_deprecate_positional_args +def mean_absolute_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean absolute error regression loss @@ -188,7 +190,8 @@ def mean_absolute_error(y_true, y_pred, return np.average(output_errors, weights=multioutput) -def mean_squared_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True): """Mean squared error regression loss @@ -263,7 +266,8 @@ def mean_squared_error(y_true, y_pred, return mse if squared else np.sqrt(mse) -def mean_squared_log_error(y_true, y_pred, +@_deprecate_positional_args +def mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Mean squared logarithmic error regression loss @@ -326,10 +330,12 @@ def mean_squared_log_error(y_true, y_pred, "targets contain negative values.") return mean_squared_error(np.log1p(y_true), np.log1p(y_pred), - sample_weight, multioutput) + sample_weight=sample_weight, + multioutput=multioutput) -def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): +@_deprecate_positional_args +def median_absolute_error(y_true, y_pred, *, multioutput='uniform_average'): """Median absolute error regression loss Median absolute error output is non-negative floating point. The best value @@ -392,7 +398,8 @@ def median_absolute_error(y_true, y_pred, multioutput='uniform_average'): return np.average(output_errors, weights=multioutput) -def explained_variance_score(y_true, y_pred, +@_deprecate_positional_args +def explained_variance_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average'): """Explained variance regression score function @@ -484,7 +491,8 @@ def explained_variance_score(y_true, y_pred, return np.average(output_scores, weights=avg_weights) -def r2_score(y_true, y_pred, sample_weight=None, +@_deprecate_positional_args +def r2_score(y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"): """R^2 (coefficient of determination) regression score function. 
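The ranking and regression metrics receive the same treatment. A minimal sketch (toy values,
not from the patch) of the keyword-only regression signatures shown above::

    from sklearn.metrics import mean_squared_error, r2_score

    y_true = [3.0, -0.5, 2.0, 7.0]
    y_pred = [2.5, 0.0, 2.0, 8.0]

    # ``sample_weight``, ``multioutput`` and ``squared`` follow the bare ``*``;
    # ``squared=False`` returns the root mean squared error.
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred, multioutput='uniform_average')
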
@@ -655,7 +663,8 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): +@_deprecate_positional_args +def mean_tweedie_deviance(y_true, y_pred, *, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -719,7 +728,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): return np.average(dev, weights=sample_weight) -def mean_poisson_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_poisson_deviance(y_true, y_pred, *, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with @@ -756,7 +766,8 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): ) -def mean_gamma_deviance(y_true, y_pred, sample_weight=None): +@_deprecate_positional_args +def mean_gamma_deviance(y_true, y_pred, *, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with diff --git a/sklearn/metrics/_scorer.py b/sklearn/metrics/_scorer.py index 3df175c2ca306..400e92c158ca8 100644 --- a/sklearn/metrics/_scorer.py +++ b/sklearn/metrics/_scorer.py @@ -43,6 +43,7 @@ from .cluster import fowlkes_mallows_score from ..utils.multiclass import type_of_target +from ..utils.validation import _deprecate_positional_args from ..base import is_regressor @@ -371,7 +372,8 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scoring(estimator, scoring=None, allow_none=False): +@_deprecate_positional_args +def check_scoring(estimator, scoring=None, *, allow_none=False): """Determine scorer from user options. A TypeError will be thrown if the estimator cannot be scored. @@ -528,7 +530,8 @@ def _check_multimetric_scoring(estimator, scoring=None): return scorers, True -def make_scorer(score_func, greater_is_better=True, needs_proba=False, +@_deprecate_positional_args +def make_scorer(score_func, *, greater_is_better=True, needs_proba=False, needs_threshold=False, **kwargs): """Make a scorer from a performance metric or loss function. diff --git a/sklearn/metrics/cluster/_bicluster.py b/sklearn/metrics/cluster/_bicluster.py index f8d8d18e9f6b0..ac0d0a454a74a 100644 --- a/sklearn/metrics/cluster/_bicluster.py +++ b/sklearn/metrics/cluster/_bicluster.py @@ -2,6 +2,7 @@ from scipy.optimize import linear_sum_assignment from ...utils.validation import check_consistent_length, check_array +from ...utils.validation import _deprecate_positional_args __all__ = ["consensus_score"] @@ -44,7 +45,8 @@ def _pairwise_similarity(a, b, similarity): return result -def consensus_score(a, b, similarity="jaccard"): +@_deprecate_positional_args +def consensus_score(a, b, *, similarity="jaccard"): """The similarity of two sets of biclusters. Similarity between individual biclusters is computed. 
Then the diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 973c45a908bf1..8a0fdcacb67f1 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -23,6 +23,7 @@ from ._expected_mutual_info_fast import expected_mutual_information from ...utils.validation import check_array, check_consistent_length +from ...utils.validation import _deprecate_positional_args from ...utils.fixes import _astype_copy_false @@ -77,7 +78,8 @@ def _generalized_average(U, V, average_method): "'arithmetic', or 'max'") -def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False): +@_deprecate_positional_args +def contingency_matrix(labels_true, labels_pred, *, eps=None, sparse=False): """Build a contingency matrix describing the relationship between labels. Parameters @@ -241,7 +243,8 @@ def adjusted_rand_score(labels_true, labels_pred): return (sum_comb - prod_comb) / (mean_comb - prod_comb) -def homogeneity_completeness_v_measure(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): """Compute the homogeneity and completeness and V-Measure scores at once. Those metrics are based on normalized conditional entropy measures of @@ -463,7 +466,8 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] -def v_measure_score(labels_true, labels_pred, beta=1.0): +@_deprecate_positional_args +def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. This score is identical to :func:`normalized_mutual_info_score` with @@ -563,7 +567,8 @@ def v_measure_score(labels_true, labels_pred, beta=1.0): beta=beta)[2] -def mutual_info_score(labels_true, labels_pred, contingency=None): +@_deprecate_positional_args +def mutual_info_score(labels_true, labels_pred, *, contingency=None): """Mutual Information between two clusterings. The Mutual Information is a measure of the similarity between two labels of @@ -649,7 +654,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None): return np.clip(mi.sum(), 0.0, None) -def adjusted_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def adjusted_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Adjusted Mutual Information between two clusterings. @@ -770,7 +776,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred, return ami -def normalized_mutual_info_score(labels_true, labels_pred, +@_deprecate_positional_args +def normalized_mutual_info_score(labels_true, labels_pred, *, average_method='arithmetic'): """Normalized Mutual Information between two clusterings. @@ -870,7 +877,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, return nmi -def fowlkes_mallows_score(labels_true, labels_pred, sparse=False): +@_deprecate_positional_args +def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. 
The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index 8841df701c69f..9e2ef713b352e 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -16,6 +16,7 @@ from ..pairwise import pairwise_distances_chunked from ..pairwise import pairwise_distances from ...preprocessing import LabelEncoder +from ...utils.validation import _deprecate_positional_args def check_number_of_labels(n_labels, n_samples): @@ -34,7 +35,8 @@ def check_number_of_labels(n_labels, n_samples): "to n_samples - 1 (inclusive)" % n_labels) -def silhouette_score(X, labels, metric='euclidean', sample_size=None, +@_deprecate_positional_args +def silhouette_score(X, labels, *, metric='euclidean', sample_size=None, random_state=None, **kwds): """Compute the mean Silhouette Coefficient of all samples. @@ -147,7 +149,8 @@ def _silhouette_reduce(D_chunk, start, labels, label_freqs): return intra_clust_dists, inter_clust_dists -def silhouette_samples(X, labels, metric='euclidean', **kwds): +@_deprecate_positional_args +def silhouette_samples(X, labels, *, metric='euclidean', **kwds): """Compute the Silhouette Coefficient for each sample. The Silhouette Coefficient is a measure of how well samples are clustered diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 25646acb49ea7..9d4107ebd66d6 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,7 @@ from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils._mask import _get_mask +from ..utils.validation import _deprecate_positional_args from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -58,7 +59,8 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, +@_deprecate_positional_args +def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, accept_sparse='csr', force_all_finite=True, copy=False): """ Set X and Y appropriately and checks inputs @@ -192,7 +194,8 @@ def check_paired_arrays(X, Y): # Pairwise distances -def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, +@_deprecate_positional_args +def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the @@ -313,7 +316,8 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -def nan_euclidean_distances(X, Y=None, squared=False, +@_deprecate_positional_args +def nan_euclidean_distances(X, Y=None, *, squared=False, missing_values=np.nan, copy=True): """Calculate the euclidean distances in the presence of missing values. @@ -503,7 +507,8 @@ def _argmin_min_reduce(dist, start): return indices, values -def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin_min(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. 
@@ -589,7 +594,8 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", return indices, values -def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", +@_deprecate_positional_args +def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): """Compute minimum distances between one point and a set of points. @@ -659,7 +665,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min(X, Y, axis, metric, + return pairwise_distances_argmin_min(X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs)[0] @@ -711,7 +717,8 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric('haversine').pairwise(X, Y) -def manhattan_distances(X, Y=None, sum_over_features=True): +@_deprecate_positional_args +def manhattan_distances(X, Y=None, *, sum_over_features=True): """ Compute the L1 distances between the vectors in X and Y. With sum_over_features equal to False it returns the componentwise @@ -908,7 +915,8 @@ def paired_cosine_distances(X, Y): 'cityblock': paired_manhattan_distances} -def paired_distances(X, Y, metric="euclidean", **kwds): +@_deprecate_positional_args +def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Computes the paired distances between X and Y. @@ -1444,7 +1452,8 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): return {} -def pairwise_distances_chunked(X, Y=None, reduce_func=None, +@_deprecate_positional_args +def pairwise_distances_chunked(X, Y=None, *, reduce_func=None, metric='euclidean', n_jobs=None, working_memory=None, **kwds): """Generate a distance matrix chunk by chunk with optional reduction @@ -1606,7 +1615,8 @@ def pairwise_distances_chunked(X, Y=None, reduce_func=None, yield D_chunk -def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, +@_deprecate_positional_args +def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1820,7 +1830,8 @@ def kernel_metrics(): } -def pairwise_kernels(X, Y=None, metric="linear", filter_params=False, +@_deprecate_positional_args +def pairwise_kernels(X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds): """Compute the kernel between arrays X and optional array Y. 
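The change applied throughout these diffs follows one pattern: each public signature gains a bare `*` so the trailing arguments become keyword-only, and the function is wrapped with the private `_deprecate_positional_args` helper so that callers who still pass those arguments positionally get a deprecation warning rather than an immediate TypeError. The helper itself is not shown in the patch; the sketch below is only an illustration of how such a decorator can be written with the standard-library `inspect` and `warnings` modules, and its name, warning text and edge-case handling are assumptions, not scikit-learn's actual implementation.

import warnings
from functools import wraps
from inspect import Parameter, signature


def deprecate_positional_args(func):
    """Warn when arguments declared after ``*`` are passed positionally."""
    sig = signature(func)
    positional = [name for name, param in sig.parameters.items()
                  if param.kind in (Parameter.POSITIONAL_ONLY,
                                    Parameter.POSITIONAL_OR_KEYWORD)]
    keyword_only = [name for name, param in sig.parameters.items()
                    if param.kind == Parameter.KEYWORD_ONLY]

    @wraps(func)
    def wrapper(*args, **kwargs):
        n_extra = len(args) - len(positional)
        if n_extra > 0:
            # Values that spill past the positional parameters correspond,
            # in order, to the first keyword-only names: warn, then forward
            # them as keywords so the old call style keeps working.
            extra_names = keyword_only[:n_extra]
            warnings.warn(
                "Passing {} as positional arguments is deprecated; pass "
                "them as keyword arguments instead.".format(
                    ", ".join(extra_names)),
                FutureWarning)
            kwargs.update(zip(extra_names, args[len(positional):]))
            args = args[:len(positional)]
        return func(*args, **kwargs)

    return wrapper

With a decorator of this shape, a call like pairwise_distances_argmin(X, Y, 0) would still return the same result but emit a FutureWarning, while pairwise_distances_argmin(X, Y, axis=0) stays silent.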
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index ca56e79299adb..1f959d95ce844 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -638,7 +638,7 @@ def test_matthews_corrcoef_against_jurman(): for k in range(N) ]) mcc_jurman = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp) - mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight) + mcc_ours = matthews_corrcoef(y_true, y_pred, sample_weight=sample_weight) assert_almost_equal(mcc_ours, mcc_jurman, 10) @@ -725,7 +725,8 @@ def test_matthews_corrcoef_multiclass(): y_true = [0, 0, 1, 1, 2] y_pred = [1, 1, 0, 0, 2] sample_weight = [1, 1, 1, 1, 0] - assert_almost_equal(matthews_corrcoef(y_true, y_pred, sample_weight), -1) + assert_almost_equal(matthews_corrcoef(y_true, y_pred, + sample_weight=sample_weight), -1) # For the zero vector case, the corrcoef cannot be calculated and should # result in a RuntimeWarning @@ -734,7 +735,7 @@ def test_matthews_corrcoef_multiclass(): sample_weight = [1, 1, 0, 0] mcc = assert_warns_message(RuntimeWarning, 'invalid value encountered', matthews_corrcoef, y_true, y_pred, - sample_weight) + sample_weight=sample_weight) # But will output 0 assert_almost_equal(mcc, 0.) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index fdff2c4c3959e..f2c7a307571bc 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -350,7 +350,7 @@ def test_pairwise_kernels_filter_param(): assert_array_almost_equal(K, K2) with pytest.raises(TypeError): - pairwise_kernels(X, Y, "rbf", **params) + pairwise_kernels(X, Y, metric="rbf", **params) @pytest.mark.parametrize('metric, func', PAIRED_DISTANCES.items()) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 64e88f37ed2bc..189d36ae88328 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -190,11 +190,11 @@ def check_scoring_validator_for_single_metric_usecases(scoring_validator): with pytest.raises(TypeError, match=pattern): scoring_validator(estimator) - scorer = scoring_validator(estimator, "accuracy") + scorer = scoring_validator(estimator, scoring="accuracy") assert_almost_equal(scorer(estimator, [[1]], [1]), 1.0) estimator = EstimatorWithFit() - scorer = scoring_validator(estimator, "accuracy") + scorer = scoring_validator(estimator, scoring="accuracy") assert isinstance(scorer, _PredictScorer) # Test the allow_none parameter for check_scoring alone @@ -274,11 +274,11 @@ def test_check_scoring_gridsearchcv(): # slightly redundant non-regression test. 
grid = GridSearchCV(LinearSVC(), param_grid={'C': [.1, 1]}, cv=3) - scorer = check_scoring(grid, "f1") + scorer = check_scoring(grid, scoring="f1") assert isinstance(scorer, _PredictScorer) pipe = make_pipeline(LinearSVC()) - scorer = check_scoring(pipe, "f1") + scorer = check_scoring(pipe, scoring="f1") assert isinstance(scorer, _PredictScorer) # check that cross_val_score definitely calls the scorer @@ -544,13 +544,13 @@ def test_scorer_memmap_input(name): def test_scoring_is_not_metric(): with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), f1_score) + check_scoring(LogisticRegression(), scoring=f1_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(LogisticRegression(), roc_auc_score) + check_scoring(LogisticRegression(), scoring=roc_auc_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(Ridge(), r2_score) + check_scoring(Ridge(), scoring=r2_score) with pytest.raises(ValueError, match='make_scorer'): - check_scoring(KMeans(), cluster_module.adjusted_rand_score) + check_scoring(KMeans(), scoring=cluster_module.adjusted_rand_score) def test_deprecated_scorer(): diff --git a/sklearn/mixture/_bayesian_mixture.py b/sklearn/mixture/_bayesian_mixture.py index c68fa260faee3..648fb8d903d38 100644 --- a/sklearn/mixture/_bayesian_mixture.py +++ b/sklearn/mixture/_bayesian_mixture.py @@ -15,6 +15,7 @@ from ._gaussian_mixture import _estimate_gaussian_parameters from ._gaussian_mixture import _estimate_log_gaussian_prob from ..utils import check_array +from ..utils.validation import _deprecate_positional_args def _log_dirichlet_norm(dirichlet_concentration): @@ -307,8 +308,8 @@ class BayesianGaussianMixture(BaseMixture): inference for Dirichlet process mixtures". Bayesian analysis 1.1 `_ """ - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, *, n_components=1, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=None, diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 277f65f929eac..596e66f6a4e64 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -11,6 +11,7 @@ from ._base import BaseMixture, _check_shape from ..utils import check_array from ..utils.extmath import row_norms +from ..utils.validation import _deprecate_positional_args ############################################################################### @@ -585,8 +586,8 @@ class GaussianMixture(BaseMixture): BayesianGaussianMixture : Gaussian mixture model fit with a variational inference. 
""" - - def __init__(self, n_components=1, covariance_type='full', tol=1e-3, + @_deprecate_positional_args + def __init__(self, n_components=1, *, covariance_type='full', tol=1e-3, reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans', weights_init=None, means_init=None, precisions_init=None, random_state=None, warm_start=False, diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 80cb269d4a0c7..920900db20fe7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -34,6 +34,7 @@ from ..utils import check_random_state from ..utils.random import sample_without_replacement from ..utils.validation import indexable, check_is_fitted, _check_fit_params +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import if_delegate_has_method from ..metrics._scorer import _check_multimetric_scoring from ..metrics import check_scoring @@ -234,7 +235,8 @@ class ParameterSampler: ... {'b': 1.038159, 'a': 2}] True """ - def __init__(self, param_distributions, n_iter, random_state=None): + @_deprecate_positional_args + def __init__(self, param_distributions, n_iter, *, random_state=None): if not isinstance(param_distributions, (Mapping, Iterable)): raise TypeError('Parameter distribution is not a dict or ' 'a list ({!r})'.format(param_distributions)) @@ -400,9 +402,11 @@ class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, estimator, scoring=None, n_jobs=None, iid='deprecated', - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', - error_score=np.nan, return_train_score=True): + @_deprecate_positional_args + def __init__(self, estimator, *, scoring=None, n_jobs=None, + iid='deprecated', refit=True, cv=None, verbose=0, + pre_dispatch='2*n_jobs', error_score=np.nan, + return_train_score=True): self.scoring = scoring self.estimator = estimator @@ -620,7 +624,8 @@ def _run_search(self, evaluate_candidates): """ raise NotImplementedError("_run_search not implemented.") - def fit(self, X, y=None, groups=None, **fit_params): + @_deprecate_positional_args + def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. 
Parameters @@ -1160,7 +1165,8 @@ class GridSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_grid"] - def __init__(self, estimator, param_grid, scoring=None, + @_deprecate_positional_args + def __init__(self, estimator, param_grid, *, scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=np.nan, return_train_score=False): @@ -1492,8 +1498,9 @@ class RandomizedSearchCV(BaseSearchCV): """ _required_parameters = ["estimator", "param_distributions"] - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - n_jobs=None, iid='deprecated', refit=True, + @_deprecate_positional_args + def __init__(self, estimator, param_distributions, *, n_iter=10, + scoring=None, n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score=np.nan, return_train_score=False): diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index e728533c3b5cf..edcb9b375ae79 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -24,6 +24,7 @@ from ..utils import _approximate_mode from ..utils.validation import _num_samples, column_or_1d from ..utils.validation import check_array +from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import type_of_target from ..base import _pprint @@ -50,7 +51,6 @@ class BaseCrossValidator(metaclass=ABCMeta): Implementations must define `_iter_test_masks` or `_iter_test_indices`. """ - def split(self, X, y=None, groups=None): """Generate indices to split data into training and test set. @@ -270,7 +270,8 @@ class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta): """Base class for KFold, GroupKFold, and StratifiedKFold""" @abstractmethod - def __init__(self, n_splits, shuffle, random_state): + @_deprecate_positional_args + def __init__(self, n_splits, *, shuffle, random_state): if not isinstance(n_splits, numbers.Integral): raise ValueError('The number of folds must be of Integral type. ' '%s of type %s was passed.' @@ -426,10 +427,11 @@ class KFold(_BaseKFold): RepeatedKFold: Repeats K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _iter_test_indices(self, X, y=None, groups=None): n_samples = _num_samples(X) @@ -633,9 +635,10 @@ class StratifiedKFold(_BaseKFold): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - - def __init__(self, n_splits=5, shuffle=False, random_state=None): - super().__init__(n_splits, shuffle, random_state) + @_deprecate_positional_args + def __init__(self, n_splits=5, *, shuffle=False, random_state=None): + super().__init__(n_splits=n_splits, shuffle=shuffle, + random_state=random_state) def _make_test_folds(self, X, y=None): rng = check_random_state(self.random_state) @@ -787,7 +790,8 @@ class TimeSeriesSplit(_BaseKFold): with a test set of size ``n_samples//(n_splits + 1)``, where ``n_samples`` is the number of samples. 
""" - def __init__(self, n_splits=5, max_train_size=None): + @_deprecate_positional_args + def __init__(self, n_splits=5, *, max_train_size=None): super().__init__(n_splits, shuffle=False, random_state=None) self.max_train_size = max_train_size @@ -1099,7 +1103,8 @@ class _RepeatedSplits(metaclass=ABCMeta): Constructor parameters for cv. Must not contain random_state and shuffle. """ - def __init__(self, cv, n_repeats=10, random_state=None, **cvargs): + @_deprecate_positional_args + def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs): if not isinstance(n_repeats, numbers.Integral): raise ValueError("Number of repetitions must be of Integral type.") @@ -1226,9 +1231,11 @@ class RepeatedKFold(_RepeatedSplits): -------- RepeatedStratifiedKFold: Repeats Stratified K-Fold n times. """ - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - KFold, n_repeats, random_state, n_splits=n_splits) + KFold, n_repeats=n_repeats, + random_state=random_state, n_splits=n_splits) class RepeatedStratifiedKFold(_RepeatedSplits): @@ -1280,15 +1287,17 @@ class RepeatedStratifiedKFold(_RepeatedSplits): -------- RepeatedKFold: Repeats K-Fold n times. """ - def __init__(self, n_splits=5, n_repeats=10, random_state=None): + @_deprecate_positional_args + def __init__(self, *, n_splits=5, n_repeats=10, random_state=None): super().__init__( - StratifiedKFold, n_repeats, random_state, n_splits=n_splits) + StratifiedKFold, n_repeats=n_repeats, random_state=random_state, + n_splits=n_splits) class BaseShuffleSplit(metaclass=ABCMeta): """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): self.n_splits = n_splits self.test_size = test_size @@ -1421,7 +1430,8 @@ class ShuffleSplit(BaseShuffleSplit): TRAIN: [3 4 1] TEST: [5 2] TRAIN: [3 5 1] TEST: [2 4] """ - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1510,8 +1520,8 @@ class GroupShuffleSplit(ShuffleSplit): TRAIN: [2 3 4 5 6 7] TEST: [0 1] TRAIN: [0 1 5 6 7] TEST: [2 3 4] ''' - - def __init__(self, n_splits=5, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=5, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1626,8 +1636,8 @@ class StratifiedShuffleSplit(BaseShuffleSplit): TRAIN: [4 1 0] TEST: [2 3 5] TRAIN: [0 5 1] TEST: [3 4 2] """ - - def __init__(self, n_splits=10, test_size=None, train_size=None, + @_deprecate_positional_args + def __init__(self, n_splits=10, *, test_size=None, train_size=None, random_state=None): super().__init__( n_splits=n_splits, @@ -1959,7 +1969,8 @@ def split(self, X=None, y=None, groups=None): yield train, test -def check_cv(cv=5, y=None, classifier=False): +@_deprecate_positional_args +def check_cv(cv=5, y=None, *, classifier=False): """Input checker utility for building a cross-validator Parameters diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index ef8890556ed1d..9618bf2fe2e09 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -25,6 +25,7 @@ 
_message_with_time) from ..utils.validation import _check_fit_params from ..utils.validation import _num_samples +from ..utils.validation import _deprecate_positional_args from ..utils.metaestimators import _safe_split from ..metrics import check_scoring from ..metrics._scorer import _check_multimetric_scoring, _MultimetricScorer @@ -37,7 +38,8 @@ 'permutation_test_score', 'learning_curve', 'validation_curve'] -def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, +@_deprecate_positional_args +def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', return_train_score=False, return_estimator=False, error_score=np.nan): @@ -266,8 +268,9 @@ def cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, return ret -def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, - n_jobs=None, verbose=0, fit_params=None, +@_deprecate_positional_args +def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, + cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=np.nan): """Evaluate a score by cross-validation @@ -618,7 +621,8 @@ def _score(estimator, X_test, y_test, scorer): return scores -def cross_val_predict(estimator, X, y=None, groups=None, cv=None, +@_deprecate_positional_args +def cross_val_predict(estimator, X, y=None, *, groups=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', method='predict'): """Generate cross-validated estimates for each input data point @@ -948,7 +952,8 @@ def _check_is_permutation(indices, n_samples): return True -def permutation_test_score(estimator, X, y, groups=None, cv=None, +@_deprecate_positional_args +def permutation_test_score(estimator, X, y, *, groups=None, cv=None, n_permutations=100, n_jobs=None, random_state=0, verbose=0, scoring=None): """Evaluate the significance of a cross-validated score with permutations @@ -1088,7 +1093,8 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) -def learning_curve(estimator, X, y, groups=None, +@_deprecate_positional_args +def learning_curve(estimator, X, y, *, groups=None, train_sizes=np.linspace(0.1, 1.0, 5), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=None, pre_dispatch="all", verbose=0, shuffle=False, @@ -1372,7 +1378,8 @@ def _incremental_fit_estimator(estimator, X, y, classes, train, test, return np.array(ret).T -def validation_curve(estimator, X, y, param_name, param_range, groups=None, +@_deprecate_positional_args +def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch="all", verbose=0, error_score=np.nan): """Validation curve. 
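Because `groups`, `scoring`, `cv` and the validation-curve parameters are now keyword-only (positional use is only tolerated through the deprecation decorator), downstream call sites are updated to name those arguments explicitly, which is what the test changes that follow do. A small before/after illustration, with made-up data and parameter values rather than anything taken from the patches:

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, validation_curve
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, random_state=0)
clf = SVC(kernel='linear', random_state=0)

# Deprecated once the signatures above take effect (scoring and cv are
# passed positionally):
#     cross_val_score(clf, X, y, None, 'accuracy', 5)

# Keyword style that stays warning-free:
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
train_scores, test_scores = validation_curve(
    clf, X, y, param_name='C', param_range=[0.1, 1.0, 10.0], cv=3)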
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index 0205eb8901699..3b984745420f1 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -960,7 +960,7 @@ def test_repeated_kfold_determinstic_split(): def test_get_n_splits_for_repeated_kfold(): n_splits = 3 n_repeats = 4 - rkf = RepeatedKFold(n_splits, n_repeats) + rkf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rkf.get_n_splits() @@ -968,7 +968,7 @@ def test_get_n_splits_for_repeated_kfold(): def test_get_n_splits_for_repeated_stratified_kfold(): n_splits = 3 n_repeats = 4 - rskf = RepeatedStratifiedKFold(n_splits, n_repeats) + rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats) expected_n_splits = n_splits * n_repeats assert expected_n_splits == rskf.get_n_splits() diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 67b66b6a91431..579726043f099 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -370,8 +370,8 @@ def test_cross_validate(): for X, y, est in ((X_reg, y_reg, reg), (X_clf, y_clf, clf)): # It's okay to evaluate regression metrics on classification too - mse_scorer = check_scoring(est, 'neg_mean_squared_error') - r2_scorer = check_scoring(est, 'r2') + mse_scorer = check_scoring(est, scoring='neg_mean_squared_error') + r2_scorer = check_scoring(est, scoring='r2') train_mse_scores = [] test_mse_scores = [] train_r2_scores = [] @@ -1251,7 +1251,8 @@ def test_validation_curve_cv_splits_consistency(): X, y = make_classification(n_samples=100, random_state=0) scores1 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=OneTimeSplitter(n_splits=n_splits, n_samples=n_samples)) # The OneTimeSplitter is a non-re-entrant cv splitter. Unless, the @@ -1262,7 +1263,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores2 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits, shuffle=True)) # For scores2, compare the 1st and 2nd parameter's scores @@ -1272,7 +1274,8 @@ def test_validation_curve_cv_splits_consistency(): 2)) scores3 = validation_curve(SVC(kernel='linear', random_state=0), X, y, - 'C', [0.1, 0.1, 0.2, 0.2], + param_name='C', + param_range=[0.1, 0.1, 0.2, 0.2], cv=KFold(n_splits=n_splits)) # OneTimeSplitter is basically unshuffled KFold(n_splits=5). Sanity check. @@ -1679,9 +1682,9 @@ def test_warn_trace(msg): failing_clf, X, y, cv=3, error_score='unvalid-string') assert_raise_message(ValueError, error_message, validation_curve, - failing_clf, X, y, 'parameter', - [FailingClassifier.FAILING_PARAMETER], cv=3, - error_score='unvalid-string') + failing_clf, X, y, param_name='parameter', + param_range=[FailingClassifier.FAILING_PARAMETER], + cv=3, error_score='unvalid-string') assert failing_clf.score() == 0. 
# FailingClassifier coverage diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 96ec40743fe2c..f6c80a1f5f2ab 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -48,6 +48,7 @@ from .utils.validation import _num_samples from .utils.validation import check_is_fitted from .utils.validation import check_X_y, check_array +from .utils.validation import _deprecate_positional_args from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -201,7 +202,8 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, array([2, 0, 1]) """ - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -519,8 +521,8 @@ class OneVsOneClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Indices of samples used when training the estimators. ``None`` when ``estimator`` does not have ``_pairwise`` attribute. """ - - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -760,8 +762,8 @@ class OutputCodeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseEstimator): Hastie T., Tibshirani R., Friedman J., page 606 (second-edition) 2008. """ - - def __init__(self, estimator, code_size=1.5, random_state=None, + @_deprecate_positional_args + def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.estimator = estimator self.code_size = code_size diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8f94a0ae634da..b348dd0f78d09 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -25,7 +25,7 @@ from .utils import check_array, check_X_y, check_random_state from .utils.metaestimators import if_delegate_has_method from .utils.validation import (check_is_fitted, has_fit_parameter, - _check_fit_params) + _check_fit_params, _deprecate_positional_args) from .utils.multiclass import check_classification_targets from .utils import deprecated @@ -64,7 +64,8 @@ def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, class _MultiOutputEstimator(BaseEstimator, MetaEstimatorMixin, metaclass=ABCMeta): @abstractmethod - def __init__(self, estimator, n_jobs=None): + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs @@ -245,9 +246,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): >>> clf.predict(X[[0]]) array([[176..., 35..., 57...]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) @if_delegate_has_method('estimator') def partial_fit(self, X, y, sample_weight=None): @@ -315,9 +316,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> clf.predict(X[-2:]) array([[1, 1, 0], [1, 1, 1]]) """ - - def __init__(self, estimator, n_jobs=None): - super().__init__(estimator, n_jobs) + @_deprecate_positional_args + def __init__(self, estimator, *, n_jobs=None): + super().__init__(estimator, n_jobs=n_jobs) def fit(self, X, Y, sample_weight=None, **fit_params): """Fit the model to data matrix X and targets Y. 
@@ -414,7 +415,9 @@ def _more_tags(self): class _BaseChain(BaseEstimator, metaclass=ABCMeta): - def __init__(self, base_estimator, order=None, cv=None, random_state=None): + @_deprecate_positional_args + def __init__(self, base_estimator, *, order=None, cv=None, + random_state=None): self.base_estimator = base_estimator self.order = order self.cv = cv From be134a298f9c56af665d0575ebaae81eefef818b Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sat, 25 Apr 2020 17:47:46 +0200 Subject: [PATCH 060/125] TST Replace Bostond dataset in test_iforest (#17031) --- sklearn/ensemble/tests/test_iforest.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 3593bc0422ff7..aeb384ab44503 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -21,7 +21,7 @@ from sklearn.ensemble import IsolationForest from sklearn.ensemble._iforest import _average_path_length from sklearn.model_selection import train_test_split -from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import load_diabetes, load_iris from sklearn.utils import check_random_state from sklearn.metrics import roc_auc_score @@ -37,12 +37,12 @@ iris.data = iris.data[perm] iris.target = iris.target[perm] -# also load the boston dataset +# also load the diabetes dataset # and randomly permute it -boston = load_boston() -perm = rng.permutation(boston.target.size) -boston.data = boston.data[perm] -boston.target = boston.target[perm] +diabetes = load_diabetes() +perm = rng.permutation(diabetes.target.size) +diabetes.data = diabetes.data[perm] +diabetes.target = diabetes.target[perm] def test_iforest(): @@ -63,8 +63,8 @@ def test_iforest(): def test_iforest_sparse(): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]}) @@ -157,8 +157,8 @@ def test_iforest_parallel_regression(): """Check parallel regression.""" rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data, - boston.target, + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, + diabetes.target, random_state=rng) ensemble = IsolationForest(n_jobs=3, @@ -226,8 +226,8 @@ def test_max_samples_consistency(): def test_iforest_subsampled_features(): # It tests non-regression for #5732 which failed at predict. 
rng = check_random_state(0) - X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], - boston.target[:50], + X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50], + diabetes.target[:50], random_state=rng) clf = IsolationForest(max_features=0.8) clf.fit(X_train, y_train) From 923d8879e22f340dd0cd4e7fd855e1642e139fa2 Mon Sep 17 00:00:00 2001 From: HaoYin Date: Sun, 26 Apr 2020 02:08:10 +0800 Subject: [PATCH 061/125] DOC Fix a typo in comment (#17037) --- sklearn/decomposition/_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 527f78d34bbb5..617bf8541d830 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -236,7 +236,7 @@ def _fit_transform(self, K): # if v is an eigenvector of K # then Phi(X)v is an eigenvector of Phi(X)Phi(X)' # if u is an eigenvector of Phi(X)Phi(X)' - # then Phi(X)'u is an eigenvector of Phi(X)Phi(X)' + # then Phi(X)'u is an eigenvector of Phi(X)'Phi(X) # # At this stage our self.alphas_ (the v) have norm 1, we need to scale # them so that eigenvectors in kernel feature space (the u) have norm=1 From fc6ee00b0accceeec48cc5b606e713514b481617 Mon Sep 17 00:00:00 2001 From: Lucy Liu Date: Sun, 26 Apr 2020 04:22:33 +0200 Subject: [PATCH 062/125] MNT Fix 'clf' variable naming in test_forest (#16929) --- sklearn/ensemble/tests/test_forest.py | 108 +++++++++++++------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 8144a095cec3a..775ed851d5a53 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -163,17 +163,17 @@ def check_boston_criterion(name, criterion): # Check consistency on dataset boston house prices. 
ForestRegressor = FOREST_REGRESSORS[name] - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.94, ("Failed with max_features=None, criterion %s " "and score = %f" % (criterion, score)) - clf = ForestRegressor(n_estimators=5, criterion=criterion, + reg = ForestRegressor(n_estimators=5, criterion=criterion, max_features=6, random_state=1) - clf.fit(boston.data, boston.target) - score = clf.score(boston.data, boston.target) + reg.fit(boston.data, boston.target) + score = reg.score(boston.data, boston.target) assert score > 0.95, ("Failed with max_features=6, criterion %s " "and score = %f" % (criterion, score)) @@ -682,10 +682,10 @@ def test_distribution(): y = rng.rand(1000) n_trees = 500 - clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) + reg = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -713,10 +713,10 @@ def test_distribution(): X[:, 1] = np.random.randint(0, 3, 1000) y = rng.rand(1000) - clf = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) + reg = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) uniques = defaultdict(int) - for tree in clf.estimators_: + for tree in reg.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) @@ -1065,25 +1065,25 @@ def check_warm_start(name, random_state=42): # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf_ws = None + est_ws = None for n_estimators in [5, 10]: - if clf_ws is None: - clf_ws = ForestEstimator(n_estimators=n_estimators, + if est_ws is None: + est_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: - clf_ws.set_params(n_estimators=n_estimators) - clf_ws.fit(X, y) - assert len(clf_ws) == n_estimators + est_ws.set_params(n_estimators=n_estimators) + est_ws.fit(X, y) + assert len(est_ws) == n_estimators - clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, + est_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) - clf_no_ws.fit(X, y) + est_no_ws.fit(X, y) - assert (set([tree.random_state for tree in clf_ws]) == - set([tree.random_state for tree in clf_no_ws])) + assert (set([tree.random_state for tree in est_ws]) == + set([tree.random_state for tree in est_no_ws])) - assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), + assert_array_equal(est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name)) @@ -1096,17 +1096,17 @@ def check_warm_start_clear(name): # Test if fit clears state and grows a new forest when warm_start==False. 
X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, random_state=2) - clf_2.fit(X, y) # inits state - clf_2.set_params(warm_start=False, random_state=1) - clf_2.fit(X, y) # clears old state and equals clf + est_2.fit(X, y) # inits state + est_2.set_params(warm_start=False, random_state=1) + est_2.fit(X, y) # clears old state and equals est - assert_array_almost_equal(clf_2.apply(X), clf.apply(X)) + assert_array_almost_equal(est_2.apply(X), est.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1118,10 +1118,10 @@ def check_warm_start_smaller_n_estimators(name): # Test if warm start second fit with smaller n_estimators raises error. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) - clf.fit(X, y) - clf.set_params(n_estimators=4) - assert_raises(ValueError, clf.fit, X, y) + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True) + est.fit(X, y) + est.set_params(n_estimators=4) + assert_raises(ValueError, est.fit, X, y) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1134,20 +1134,20 @@ def check_warm_start_equal_n_estimators(name): # same forest and raises a warning. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - clf = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) - clf_2.fit(X, y) - # Now clf_2 equals clf. + est_2.fit(X, y) + # Now est_2 equals est. - clf_2.set_params(random_state=2) - assert_warns(UserWarning, clf_2.fit, X, y) + est_2.set_params(random_state=2) + assert_warns(UserWarning, est_2.fit, X, y) # If we had fit the trees again we would have got a different forest as we # changed the random state. - assert_array_equal(clf.apply(X), clf_2.apply(X)) + assert_array_equal(est.apply(X), est_2.apply(X)) @pytest.mark.parametrize('name', FOREST_ESTIMATORS) @@ -1160,31 +1160,31 @@ def check_warm_start_oob(name): X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. - clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, + est = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=True) - clf.fit(X, y) + est.fit(X, y) - clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, + est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=False) - clf_2.fit(X, y) + est_2.fit(X, y) - clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15) - clf_2.fit(X, y) + est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) + est_2.fit(X, y) - assert hasattr(clf_2, 'oob_score_') - assert clf.oob_score_ == clf_2.oob_score_ + assert hasattr(est_2, 'oob_score_') + assert est.oob_score_ == est_2.oob_score_ # Test that oob_score is computed even if we don't need to train # additional trees. 
- clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, + est_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, random_state=1, bootstrap=True, oob_score=False) - clf_3.fit(X, y) - assert not hasattr(clf_3, 'oob_score_') + est_3.fit(X, y) + assert not hasattr(est_3, 'oob_score_') - clf_3.set_params(oob_score=True) - ignore_warnings(clf_3.fit)(X, y) + est_3.set_params(oob_score=True) + ignore_warnings(est_3.fit)(X, y) - assert clf.oob_score_ == clf_3.oob_score_ + assert est.oob_score_ == est_3.oob_score_ @pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) From e8a42aee5d5b3f2d25f3dddbc0c55c96da03e230 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Sun, 26 Apr 2020 14:48:36 +0300 Subject: [PATCH 063/125] CI Remove pin now that Pillow issue is fixed (#17043) --- build_tools/azure/install.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index d1849a940d96c..f30db7f0ae08a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -98,9 +98,6 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then python -m pip install -U pip python -m pip install pytest==$PYTEST_VERSION pytest-cov pytest-xdist - # TODO: Remove pin when https://github.com/python-pillow/Pillow/issues/4518 gets fixed - python -m pip install "pillow>=4.3.0,!=7.1.0,!=7.1.1" - python -m pip install pandas matplotlib pyamg scikit-image # do not install dependencies for lightgbm since it requires scikit-learn python -m pip install lightgbm --no-deps From 28c08d06f95f2ed8843bd7b04a53ca8710ca7971 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Apr 2020 14:23:29 +0200 Subject: [PATCH 064/125] DOC fix typos in cross validation user guide (#17042) --- doc/modules/cross_validation.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index e2de690658a25..ed014cea6f2ff 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -323,11 +323,11 @@ The following cross-validators can be used in such cases. While i.i.d. data is a common assumption in machine learning theory, it rarely holds in practice. If one knows that the samples have been generated using a -time-dependent process, it's safer to -use a :ref:`time-series aware cross-validation scheme ` -Similarly if we know that the generative process has a group structure -(samples from collected from different subjects, experiments, measurement -devices) it safer to use :ref:`group-wise cross-validation `. +time-dependent process, it is safer to +use a :ref:`time-series aware cross-validation scheme `. +Similarly, if we know that the generative process has a group structure +(samples collected from different subjects, experiments, measurement +devices), it is safer to use :ref:`group-wise cross-validation `. K-fold @@ -535,14 +535,14 @@ folds: each set contains approximately the same percentage of samples of each target class as the complete set. Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from -two unbalanced classes. We show the number of samples in each class and compare with +two unbalanced classes. We show the number of samples in each class and compare with :class:`KFold`. 
>>> from sklearn.model_selection import StratifiedKFold, KFold >>> import numpy as np >>> X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5)) - >>> skf = StratifiedKFold(n_splits=3) - >>> for train, test in skf.split(X, y): + >>> skf = StratifiedKFold(n_splits=3) + >>> for train, test in skf.split(X, y): ... print('train - {} | test - {}'.format( ... np.bincount(y[train]), np.bincount(y[test]))) train - [30 3] | test - [15 2] @@ -556,7 +556,7 @@ two unbalanced classes. We show the number of samples in each class and compare train - [28 5] | test - [17] train - [34] | test - [11 5] -We can see that :class:`StratifiedKFold` preserves the class ratios +We can see that :class:`StratifiedKFold` preserves the class ratios (approximately 1 / 10) in both train and test dataset. Here is a visualization of the cross-validation behavior. From 4755ae76d2df10bbf41bc93fb7083b0142ef1044 Mon Sep 17 00:00:00 2001 From: Ekaterina Borovikova Date: Sun, 26 Apr 2020 15:29:36 +0200 Subject: [PATCH 065/125] DOC Add versionchanged and versionadded for v0.20 (#16199) Co-Authored-By: Adrin Jalali Co-Authored-By: Chiara Marmo Co-Authored-By: Nicolas Hug --- sklearn/cluster/_agglomerative.py | 3 +++ sklearn/cluster/_kmeans.py | 4 ++++ sklearn/compose/_target.py | 2 ++ sklearn/covariance/_elliptic_envelope.py | 2 ++ sklearn/covariance/_graph_lasso.py | 12 ++++++++++++ sklearn/datasets/_base.py | 10 ++++++++++ sklearn/datasets/_openml.py | 2 ++ sklearn/datasets/_samples_generator.py | 3 +++ sklearn/dummy.py | 2 ++ sklearn/ensemble/_iforest.py | 3 +++ sklearn/ensemble/_voting.py | 1 - sklearn/feature_extraction/text.py | 2 ++ sklearn/feature_selection/_rfe.py | 4 ++++ sklearn/linear_model/_stochastic_gradient.py | 12 ++++++++++++ sklearn/manifold/_t_sne.py | 2 ++ sklearn/metrics/_classification.py | 4 ++++ sklearn/metrics/_ranking.py | 4 +++- sklearn/metrics/cluster/_unsupervised.py | 2 ++ sklearn/model_selection/_search.py | 10 ++++++++++ sklearn/model_selection/_validation.py | 15 ++++++++++++++- sklearn/multiclass.py | 3 +++ sklearn/multioutput.py | 8 ++++++++ sklearn/naive_bayes.py | 4 ++++ sklearn/neighbors/_kde.py | 2 ++ sklearn/neighbors/_lof.py | 4 ++++ sklearn/pipeline.py | 8 ++++++++ sklearn/preprocessing/_discretization.py | 3 +++ sklearn/preprocessing/_encoders.py | 4 +++- sklearn/svm/_classes.py | 2 ++ sklearn/utils/_show_versions.py | 5 ++++- 30 files changed, 137 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 182ae4b481116..92246141d6fe8 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -736,6 +736,9 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): - single uses the minimum of the distances between all observations of the two sets. + .. versionadded:: 0.20 + Added the 'single' option + distance_threshold : float, default=None The linkage distance threshold above which, clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index b185983c4b0f9..21a604bed3eb5 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -946,6 +946,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. If None, all observations are assigned equal weight. + .. versionadded:: 0.20 + Returns ------- self @@ -1587,6 +1589,8 @@ def fit(self, X, y=None, sample_weight=None): The weights for each observation in X. 
If None, all observations are assigned equal weight (default: None). + .. versionadded:: 0.20 + Returns ------- self diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index d8c062ed423a2..1d6695a808d81 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -42,6 +42,8 @@ class TransformedTargetRegressor(RegressorMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- regressor : object, default=None diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 801611943f350..354c0f8998968 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -67,6 +67,8 @@ class EllipticEnvelope(OutlierMixin, MinCovDet): such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + raw_location_ : ndarray of shape (n_features,) The raw robust estimated location before correction and re-weighting. diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 1d0d93db75101..35a398741bc15 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -84,6 +84,9 @@ def graphical_lasso(emp_cov, alpha, cov_init=None, mode='cd', tol=1e-4, Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + graph_lasso has been renamed to graphical_lasso + Parameters ---------- emp_cov : ndarray of shape (n_features, n_features) @@ -283,6 +286,9 @@ class GraphicalLasso(EmpiricalCovariance): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLasso has been renamed to GraphicalLasso + Parameters ---------- alpha : float, default=0.01 @@ -509,6 +515,9 @@ class GraphicalLassoCV(GraphicalLasso): Read more in the :ref:`User Guide `. + .. versionchanged:: v0.20 + GraphLassoCV has been renamed to GraphicalLassoCV + Parameters ---------- alphas : int or array-like of shape (n_alphas,), dtype=float, default=4 @@ -563,6 +572,9 @@ class GraphicalLassoCV(GraphicalLasso): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If verbose is True, the objective function and duality gap are printed at each iteration. diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index d481288133991..9737a5f67891a 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -439,6 +439,8 @@ def load_iris(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -551,6 +553,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename: str The path to the location of the data. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. versionadded:: 0.18 @@ -665,6 +669,9 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): The names of the dataset columns. target_names: list The names of target classes. + + .. versionadded:: 0.20 + frame: DataFrame of shape (1797, 65) Only present when `as_frame=True`. DataFrame with `data` and `target`. @@ -885,6 +892,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename: str The path to the location of the target. + .. versionadded:: 0.20 + (data, target) : tuple if ``return_X_y`` is True .. 
versionadded:: 0.18 @@ -961,6 +970,7 @@ def load_boston(return_X_y=False): The physical location of boston csv dataset. .. versionadded:: 0.20 + DESCR : str The full description of the dataset. feature_names : ndarray diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index cef0e6cb1f411..26260a27ec883 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -526,6 +526,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + .. note:: EXPERIMENTAL The API is experimental (particularly the return value structure), diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 62ef492f42f5e..fe0ac680ecd79 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -745,6 +745,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, If array-like, each element of the sequence indicates the number of samples per cluster. + .. versionchanged:: v0.20 + one can now pass an array-like to the ``n_samples`` parameter + n_features : int, optional (default=2) The number of features for each sample. diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 37e9145f7536c..17b2c6cfd2e5d 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -561,6 +561,8 @@ def predict(self, X, return_std=False): Whether to return the standard deviation of posterior prediction. All zeros in this case. + .. versionadded:: 0.20 + Returns ------- y : array-like of shape (n_samples,) or (n_samples, n_outputs) diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index 9cec1c08efc9e..0c1bec9ebfb65 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -146,6 +146,8 @@ class IsolationForest(OutlierMixin, BaseBagging): is defined in such a way we obtain the expected number of outliers (samples with decision function < 0) in training. + .. versionadded:: 0.20 + estimators_features_ : list of arrays The subset of drawn features for each base estimator. @@ -391,6 +393,7 @@ def score_samples(self, X): The lower, the more abnormal. """ # code structure from ForestClassifier/predict_proba + check_is_fitted(self) # Check data diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 8d2bbbe8c2b8a..b044cb68e5151 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -161,7 +161,6 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): named_estimators_ : :class:`~sklearn.utils.Bunch` Attribute to access any fitted sub-estimators by name. - .. versionadded:: 0.20 classes_ : array-like of shape (n_predictions,) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 27c5eb437805b..661f638b000fc 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1375,6 +1375,8 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. + .. versionadded:: 0.20 + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 7421bc50b7625..aedcd94943bc4 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -374,6 +374,8 @@ class RFECV(RFE): feature count and ``min_features_to_select`` isn't divisible by ``step``. + .. 
versionadded:: 0.20 + cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: @@ -497,6 +499,8 @@ def fit(self, X, y, groups=None): Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). + + .. versionadded:: 0.20 """ tags = self._get_tags() X, y = self._validate_data( diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 3bedd8a26674b..1606a7ff35adb 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -847,6 +847,9 @@ class SGDClassifier(BaseSGDClassifier): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.0 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.0 as eta0 is not used by @@ -863,6 +866,7 @@ class SGDClassifier(BaseSGDClassifier): improving by at least tol for n_iter_no_change consecutive epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -870,11 +874,13 @@ class SGDClassifier(BaseSGDClassifier): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option class_weight : dict, {class_label: weight} or "balanced", default=None Preset for the class_weight fit parameter. @@ -1446,6 +1452,9 @@ class SGDRegressor(BaseSGDRegressor): training loss by tol or fail to increase validation score by tol if early_stopping is True, the current learning rate is divided by 5. + .. versionadded:: 0.20 + Added 'adaptive' option + eta0 : double, default=0.01 The initial learning rate for the 'constant', 'invscaling' or 'adaptive' schedules. The default value is 0.01. @@ -1462,6 +1471,7 @@ class SGDRegressor(BaseSGDRegressor): epochs. .. versionadded:: 0.20 + Added 'early_stopping' option validation_fraction : float, default=0.1 The proportion of training data to set aside as validation set for @@ -1469,11 +1479,13 @@ class SGDRegressor(BaseSGDRegressor): Only used if `early_stopping` is True. .. versionadded:: 0.20 + Added 'validation_fraction' option n_iter_no_change : int, default=5 Number of iterations with no improvement to wait before early stopping. .. versionadded:: 0.20 + Added 'n_iter_no_change' option warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index d94bf777399f5..eef67d5460e22 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -439,6 +439,8 @@ def trustworthiness(X, X_embedded, *, n_neighbors=5, metric='euclidean'): documentation of argument metric in sklearn.pairwise.pairwise_distances for a list of available metrics. + .. 
versionadded:: 0.20 + Returns ------- trustworthiness : float diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index 90e1935e62f06..b8a1a8e5e22b4 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -1751,6 +1751,8 @@ def balanced_accuracy_score(y_true, y_pred, *, sample_weight=None, Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- y_true : 1d array-like @@ -1849,6 +1851,8 @@ def classification_report(y_true, y_pred, *, labels=None, target_names=None, output_dict : bool (default = False) If True, return output as dict + .. versionadded:: 0.20 + zero_division : "warn", 0 or 1, default="warn" Sets the value to return when there is a zero division. If set to "warn", this acts as 0, but warnings are also raised. diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py index 2ac226bfe0299..18d948214bbec 100644 --- a/sklearn/metrics/_ranking.py +++ b/sklearn/metrics/_ranking.py @@ -848,6 +848,8 @@ def label_ranking_average_precision_score(y_true, y_score, *, sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.20 + Returns ------- score : float @@ -1031,7 +1033,7 @@ def label_ranking_loss(y_true, y_score, *, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of diff --git a/sklearn/metrics/cluster/_unsupervised.py b/sklearn/metrics/cluster/_unsupervised.py index 9e2ef713b352e..ce5563c4763d3 100644 --- a/sklearn/metrics/cluster/_unsupervised.py +++ b/sklearn/metrics/cluster/_unsupervised.py @@ -312,6 +312,8 @@ def davies_bouldin_score(X, labels): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- X : array-like, shape (``n_samples``, ``n_features``) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 920900db20fe7..e44511a9394b4 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -914,6 +914,9 @@ class GridSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=n_jobs Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1136,6 +1139,8 @@ class GridSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the left out @@ -1250,6 +1255,9 @@ class RandomizedSearchCV(BaseSearchCV): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + pre_dispatch : int, or str, default=None Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an @@ -1457,6 +1465,8 @@ class RandomizedSearchCV(BaseSearchCV): This is present only if ``refit`` is not False. + .. 
versionadded:: 0.20 + Notes ----- The parameters selected are those that maximize the score of the held-out diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py index 9618bf2fe2e09..dd204ad4a57d0 100644 --- a/sklearn/model_selection/_validation.py +++ b/sklearn/model_selection/_validation.py @@ -144,12 +144,16 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None, return_estimator : bool, default=False Whether to return the estimators fitted on each split. + .. versionadded:: 0.20 + error_score : 'raise' or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : dict of float arrays of shape (n_splits,) @@ -359,6 +363,8 @@ def cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- scores : array of float, shape=(len(list(cv)),) @@ -495,7 +501,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) + for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights @@ -813,6 +819,9 @@ def _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, X : array-like of shape (n_samples, n_features) The data to fit. + .. versionchanged:: 0.20 + X is only required to be an object with finite length or shape now + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None The target variable to try to predict in the case of supervised learning. @@ -1197,6 +1206,8 @@ def learning_curve(estimator, X, y, *, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + return_times : bool, default=False Whether to return the fit and score times. @@ -1461,6 +1472,8 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None, If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. + .. versionadded:: 0.20 + Returns ------- train_scores : array of shape (n_ticks, n_cv_folds) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index f6c80a1f5f2ab..1f0bfaf6517b4 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -165,6 +165,9 @@ class OneVsRestClassifier(MultiOutputMixin, ClassifierMixin, ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of `n_classes` estimators diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index b348dd0f78d09..815c1cbd67757 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -230,6 +230,9 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): using `n_jobs>1` can result in slower performance due to the overhead of spawning processes. + .. 
versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- estimators_ : list of ``n_output`` estimators @@ -296,6 +299,9 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + Attributes ---------- classes_ : array, shape = (n_classes,) @@ -697,6 +703,8 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- base_estimator : estimator diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 247d9eea763c6..e631bb3dcd599 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -140,6 +140,8 @@ class GaussianNB(_BaseNB): Portion of the largest variance of all features that is added to variances for calculation stability. + .. versionadded:: 0.20 + Attributes ---------- class_count_ : ndarray of shape (n_classes,) @@ -785,6 +787,8 @@ class ComplementNB(_BaseDiscreteNB): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- alpha : float, default=1.0 diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 684e07947cddd..1a967e301b357 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -148,6 +148,8 @@ def fit(self, X, y=None, sample_weight=None): sample_weight : array_like, shape (n_samples,), optional List of sample weights attached to the data X. + .. versionadded:: 0.20 + Returns ------- self : object diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index dfdb89237f516..2d456ff3e620f 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -117,6 +117,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, that you should only use predict, decision_function and score_samples on new unseen data and not on the training set. + .. versionadded:: 0.20 + n_jobs : int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. @@ -148,6 +150,8 @@ class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin, case, the offset is defined in such a way we obtain the expected number of outliers in training. + .. versionadded:: 0.20 + Examples -------- >>> import numpy as np diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 477354107e133..8e2a539786557 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -395,6 +395,8 @@ def predict(self, X, **predict_params): transformations in the pipeline are not propagated to the final estimator. + .. versionadded:: 0.20 + Returns ------- y_pred : array-like @@ -773,6 +775,9 @@ class FeatureUnion(TransformerMixin, _BaseComposition): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + transformer_weights : dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. @@ -1018,6 +1023,9 @@ def make_union(*transformers, **kwargs): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionchanged:: v0.20 + `n_jobs` default changed from 1 to None + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. 
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 581765a81361e..fa7d574e65ccd 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -25,6 +25,8 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Read more in the :ref:`User Guide `. + .. versionadded:: 0.20 + Parameters ---------- n_bins : int or array-like, shape (n_features,) (default=5) @@ -114,6 +116,7 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): [-0.5, 2.5, -2.5, -0.5], [ 0.5, 3.5, -1.5, 0.5], [ 0.5, 3.5, -1.5, 1.5]]) + """ @_deprecate_positional_args diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index c8f8ba6781400..3b0e43c151e0c 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -186,6 +186,8 @@ class OneHotEncoder(_BaseEncoder): The used categories can be found in the ``categories_`` attribute. + .. versionadded:: 0.20 + drop : {'first', 'if_binary'} or a array-like of shape (n_features,), \ default=None Specifies a methodology to use to drop one of the categories per @@ -603,7 +605,7 @@ class OrdinalEncoder(_BaseEncoder): Read more in the :ref:`User Guide `. - .. versionchanged:: 0.20.1 + .. versionadded:: 0.20 Parameters ---------- diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index f8b30e070711e..5ff6e74825e50 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1241,6 +1241,8 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): The offset is the opposite of `intercept_` and is provided for consistency with other outlier detection algorithms. + .. versionadded:: 0.20 + fit_status_ : int 0 if correctly fitted, 1 otherwise (will raise warning) diff --git a/sklearn/utils/_show_versions.py b/sklearn/utils/_show_versions.py index 53bcf2f35269d..e9c7c687c5aaa 100644 --- a/sklearn/utils/_show_versions.py +++ b/sklearn/utils/_show_versions.py @@ -73,7 +73,10 @@ def get_version(module): def show_versions(): - """Print useful debugging information""" + """Print useful debugging information" + + .. versionadded:: 0.20 + """ sys_info = _get_sys_info() deps_info = _get_deps_info() From 81c1e9d1f265cfb32e155817ae14ef41521eea22 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 26 Apr 2020 12:05:16 -0400 Subject: [PATCH 066/125] API kwonly for utils (#17007) --- sklearn/utils/__init__.py | 20 +++-- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 ++--- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 ++++++++++++----------- 6 files changed, 94 insertions(+), 74 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index aac6e292a198a..afde7614070fd 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,7 +29,8 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar) + check_symmetric, check_scalar, + _deprecate_positional_args) from .. import get_config @@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis) + return _safe_indexing(X, indices, axis=axis) -def _safe_indexing(X, indices, axis=0): +def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. 
warning:: @@ -684,7 +685,8 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -def safe_sqr(X, copy=True): +@_deprecate_positional_args +def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters @@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize): return -def gen_batches(n, batch_size, min_batch_size=0): +@_deprecate_positional_args +def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size @@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0): yield slice(start, n) -def gen_even_slices(n, n_packs, n_samples=None): +@_deprecate_positional_args +def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -def get_chunk_n_rows(row_bytes, max_n_rows=None, - working_memory=None): +@_deprecate_positional_args +def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 5f785cb36df45..8c64e33e1d0d4 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,6 +4,8 @@ import numpy as np +from .validation import _deprecate_positional_args + def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y): return weight -def compute_sample_weight(class_weight, y, indices=None): +@_deprecate_positional_args +def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3301ac977b4b9..8e471d5fdf577 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -83,7 +85,8 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 487da5b431be0..067b12cc32f28 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. 
# Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(4)) + sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. / 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -237,15 +239,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, range(4)) + compute_sample_weight("ni", y, indices=range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, range(4)) + compute_sample_weight("ni", y_, indices=range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, range(4)) + compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b178ccc148d9d..418f037936c64 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, False) is not X + assert as_float_array(X, copy=False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if 
valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type, min_val, max_val) + check_scalar(x, "test_name", target_type=target_type, + min_val=min_val, max_val=max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..1082ad7337dee 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,6 +36,44 @@ warnings.simplefilter('ignore', NonBLASDotWarning) +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -def assert_all_finite(X, allow_nan=False): +@_deprecate_positional_args +def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -def as_float_array(X, copy=True, force_all_finite=True): +@_deprecate_positional_args +def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. 
The new dtype will be np.float32 or np.float64, depending on the original @@ -113,9 +153,9 @@ def as_float_array(X, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, - copy=copy, force_all_finite=force_all_finite, - ensure_2d=False) + return check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, copy=copy, + force_all_finite=force_all_finite, ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -349,7 +389,8 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -def check_array(array, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_array(array, *, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -620,7 +661,8 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -732,8 +774,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, - dtype=None) + y = check_array(y, accept_sparse='csr', force_all_finite=True, + ensure_2d=False, dtype=None) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) @@ -745,7 +787,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, return X, y -def column_or_1d(y, warn=False): +@_deprecate_positional_args +def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -825,7 +868,8 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -def check_symmetric(array, tol=1E-10, raise_warning=True, +@_deprecate_positional_args +def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -881,7 +925,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): +@_deprecate_positional_args +def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -974,7 +1019,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, min_val=None, max_val=None): +def check_scalar(x, name, target_type, *, min_val=None, max_val=None): """Validate scalar parameters type and value. 
Parameters @@ -1268,44 +1313,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. - - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From f621d8a39287dbe831dd9e6469934593598b6ed3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 26 Apr 2020 18:30:09 +0200 Subject: [PATCH 067/125] Revert "API kwonly for utils (#17007)" (#17045) --- sklearn/utils/__init__.py | 20 ++--- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 +++-- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 +++++++++++------------ 6 files changed, 74 insertions(+), 94 deletions(-) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index afde7614070fd..aac6e292a198a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,8 +29,7 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar, - _deprecate_positional_args) + check_symmetric, check_scalar) from .. import get_config @@ -315,10 +314,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis=axis) + return _safe_indexing(X, indices, axis) -def _safe_indexing(X, indices, *, axis=0): +def _safe_indexing(X, indices, axis=0): """Return rows, items or columns of X using indices. .. warning:: @@ -685,8 +684,7 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -@_deprecate_positional_args -def safe_sqr(X, *, copy=True): +def safe_sqr(X, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters @@ -725,8 +723,7 @@ def _chunk_generator(gen, chunksize): return -@_deprecate_positional_args -def gen_batches(n, batch_size, *, min_batch_size=0): +def gen_batches(n, batch_size, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. 
The last slice may contain less than batch_size elements, when batch_size @@ -775,8 +772,7 @@ def gen_batches(n, batch_size, *, min_batch_size=0): yield slice(start, n) -@_deprecate_positional_args -def gen_even_slices(n, n_packs, *, n_samples=None): +def gen_even_slices(n, n_packs, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -961,8 +957,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -@_deprecate_positional_args -def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): +def get_chunk_n_rows(row_bytes, max_n_rows=None, + working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 8c64e33e1d0d4..5f785cb36df45 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,8 +4,6 @@ import numpy as np -from .validation import _deprecate_positional_args - def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -71,8 +69,7 @@ def compute_class_weight(class_weight, classes, y): return weight -@_deprecate_positional_args -def compute_sample_weight(class_weight, y, *, indices=None): +def compute_sample_weight(class_weight, y, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 8e471d5fdf577..3301ac977b4b9 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,9 +27,7 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange( - check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] - ) + return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) _FN_UNIQUE_LABELS = { @@ -85,8 +83,7 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, - accept_sparse=['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 067b12cc32f28..487da5b431be0 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,41 +192,39 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. # Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, indices=range(4)) + sample_weight = compute_sample_weight("balanced", y, range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. 
/ 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, - indices=[0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, - indices=[0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, indices=range(6)) + sample_weight = compute_sample_weight("balanced", y, range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -239,15 +237,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, indices=range(4)) + compute_sample_weight("ni", y, range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, indices=range(4)) + compute_sample_weight("ni", y_, range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) + compute_sample_weight({1: 2, 2: 1}, y, range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 418f037936c64..b178ccc148d9d 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, copy=False) is not X + assert as_float_array(X, False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,8 +912,7 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type=target_type, - min_val=min_val, max_val=max_val) + check_scalar(x, "test_name", target_type, min_val, max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1082ad7337dee..953584fff0f8a 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,44 +36,6 @@ warnings.simplefilter('ignore', NonBLASDotWarning) -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. 
- - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -105,8 +67,7 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -@_deprecate_positional_args -def assert_all_finite(X, *, allow_nan=False): +def assert_all_finite(X, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -118,8 +79,7 @@ def assert_all_finite(X, *, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -@_deprecate_positional_args -def as_float_array(X, *, copy=True, force_all_finite=True): +def as_float_array(X, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. The new dtype will be np.float32 or np.float64, depending on the original @@ -153,9 +113,9 @@ def as_float_array(X, *, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, copy=copy, - force_all_finite=force_all_finite, ensure_2d=False) + return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, + copy=copy, force_all_finite=force_all_finite, + ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -389,8 +349,7 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -@_deprecate_positional_args -def check_array(array, *, accept_sparse=False, accept_large_sparse=True, +def check_array(array, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -661,8 +620,7 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -@_deprecate_positional_args -def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, +def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -774,8 +732,8 @@ def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, accept_sparse='csr', force_all_finite=True, - ensure_2d=False, dtype=None) + y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, + dtype=None) else: y 
= column_or_1d(y, warn=True) _assert_all_finite(y) @@ -787,8 +745,7 @@ def check_X_y(X, y, *, accept_sparse=False, accept_large_sparse=True, return X, y -@_deprecate_positional_args -def column_or_1d(y, *, warn=False): +def column_or_1d(y, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -868,8 +825,7 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -@_deprecate_positional_args -def check_symmetric(array, *, tol=1E-10, raise_warning=True, +def check_symmetric(array, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -925,8 +881,7 @@ def check_symmetric(array, *, tol=1E-10, raise_warning=True, return array -@_deprecate_positional_args -def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): +def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -1019,7 +974,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, *, min_val=None, max_val=None): +def check_scalar(x, name, target_type, min_val=None, max_val=None): """Validate scalar parameters type and value. Parameters @@ -1313,6 +1268,44 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From a14953aca9cb0c1b0e33b2c31ec5666919b95871 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Mon, 27 Apr 2020 02:52:47 +1000 Subject: [PATCH 068/125] ENH buffer openml stream rather than reading all at once (#16084) --- doc/whats_new/v0.23.rst | 4 + sklearn/datasets/_openml.py | 233 +++++++++++++++----------- sklearn/datasets/tests/test_openml.py | 21 ++- 3 files changed, 154 insertions(+), 104 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 844cdf0360f73..2bd44dcd54486 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -123,6 +123,10 @@ Changelog :func:`datasets.make_moons` now accept two-element tuple. :pr:`15707` by :user:`Maciej J Mikulski `. 
+- |Efficiency| :func:`datasets.fetch_openml` has reduced memory usage because + it no longer stores the full dataset text stream in memory. :pr:`16084` by + `Joel Nothman`_. + - |Feature| :func:`datasets.fetch_california_housing` now supports heterogeneous data using pandas by setting `as_frame=True`. :pr:`15950` by :user:`Stephanie Andrews ` and diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 26260a27ec883..10f40dc8906bf 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -9,6 +9,7 @@ import itertools from collections.abc import Generator from collections import OrderedDict +from functools import partial from urllib.request import urlopen, Request @@ -44,11 +45,11 @@ def _retry_with_clean_cache(openml_path, data_home): """ def decorator(f): @wraps(f) - def wrapper(): + def wrapper(*args, **kw): if data_home is None: - return f() + return f(*args, **kw) try: - return f() + return f(*args, **kw) except HTTPError: raise except Exception: @@ -56,7 +57,7 @@ def wrapper(): local_path = _get_local_path(openml_path, data_home) if os.path.exists(local_path): os.unlink(local_path) - return f() + return f(*args, **kw) return wrapper return decorator @@ -217,7 +218,7 @@ def _sparse_data_to_array(arff_data, include_columns): return y -def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): +def _convert_arff_data(arff, col_slice_x, col_slice_y, shape=None): """ converts the arff object into the appropriate matrix type (np.array or scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the @@ -225,8 +226,8 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): Parameters ---------- - arff_data : list or dict - as obtained from liac-arff object + arff : dict + As obtained from liac-arff object. col_slice_x : list The column indices that are sliced from the original array to return @@ -241,6 +242,7 @@ def _convert_arff_data(arff_data, col_slice_x, col_slice_y, shape=None): X : np.array or scipy.sparse.csr_matrix y : np.array """ + arff_data = arff['data'] if isinstance(arff_data, Generator): if shape[0] == -1: count = -1 @@ -300,7 +302,8 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): Returns ------- - dataframe : pandas DataFrame + result : tuple + tuple with the resulting dataframe """ pd = check_pandas_support('fetch_openml with as_frame=True') @@ -327,7 +330,7 @@ def _convert_arff_data_dataframe(arff, columns, features_dict): if dtype == 'category': dtype = pd.api.types.CategoricalDtype(attributes[column]) df[column] = df[column].astype(dtype, copy=False) - return df + return (df, ) def _get_data_info_by_name(name, version, data_home): @@ -448,27 +451,119 @@ def _get_num_samples(data_qualities): return int(float(qualities.get('NumberOfInstances', default_n_samples))) -def _download_data_arff(file_id, sparse, data_home, encode_nominal=True): - # Accesses an ARFF file on the OpenML server. Documentation: - # https://www.openml.org/api_data_docs#!/data/get_download_id - # encode_nominal argument is to ensure unit testing, do not alter in - # production! 
- url = _DATA_FILE.format(file_id) +def _load_arff_response(url, data_home, return_type, encode_nominal, + parse_arff): + """Load arff data with url and parses arff response with parse_arff""" + response = _open_openml_url(url, data_home) - @_retry_with_clean_cache(url, data_home) - def _arff_load(): - with closing(_open_openml_url(url, data_home)) as response: - if sparse is True: - return_type = _arff.COO - else: - return_type = _arff.DENSE_GEN + with closing(response): + # Note that if the data is dense, no reading is done until the data + # generator is iterated. + arff = _arff.load((line.decode('utf-8') for line in response), + return_type=return_type, + encode_nominal=encode_nominal) + return parse_arff(arff) + + +def _download_data_to_bunch(url, sparse, data_home, *, + as_frame, features_list, data_columns, + target_columns, shape): + """Download OpenML ARFF and convert to Bunch of data + """ + # NB: this function is long in order to handle retry for any failure + # during the streaming parse of the ARFF. + + # Prepare which columns and data types should be returned for the X and y + features_dict = {feature['name']: feature for feature in features_list} + + # XXX: col_slice_y should be all nominal or all numeric + _verify_target_data_type(features_dict, target_columns) + + col_slice_y = [int(features_dict[col_name]['index']) + for col_name in target_columns] - arff_file = _arff.loads(response.read().decode('utf-8'), - encode_nominal=encode_nominal, - return_type=return_type) - return arff_file + col_slice_x = [int(features_dict[col_name]['index']) + for col_name in data_columns] + for col_idx in col_slice_y: + feat = features_list[col_idx] + nr_missing = int(feat['number_of_missing_values']) + if nr_missing > 0: + raise ValueError('Target column {} has {} missing values. ' + 'Missing values are not supported for target ' + 'columns. '.format(feat['name'], nr_missing)) - return _arff_load() + # Access an ARFF file on the OpenML server. Documentation: + # https://www.openml.org/api_data_docs#!/data/get_download_id + + if sparse is True: + return_type = _arff.COO + else: + return_type = _arff.DENSE_GEN + + frame = nominal_attributes = None + if as_frame: + columns = data_columns + target_columns + parse_arff = partial(_convert_arff_data_dataframe, columns=columns, + features_dict=features_dict) + + def postprocess(frame): + X = frame[data_columns] + if len(target_columns) >= 2: + y = frame[target_columns] + elif len(target_columns) == 1: + y = frame[target_columns[0]] + else: + y = None + return X, y, frame, nominal_attributes + else: + def parse_arff(arff): + X, y = _convert_arff_data(arff, col_slice_x, col_slice_y, shape) + # nominal attributes is a dict mapping from the attribute name to + # the possible values. 
Includes also the target column (which will + # be popped off below, before it will be packed in the Bunch + # object) + nominal_attributes = {k: v for k, v in arff['attributes'] + if isinstance(v, list) and + k in data_columns + target_columns} + return X, y, nominal_attributes + + def postprocess(X, y, nominal_attributes): + is_classification = {col_name in nominal_attributes + for col_name in target_columns} + if not is_classification: + # No target + pass + elif all(is_classification): + y = np.hstack([ + np.take( + np.asarray(nominal_attributes.pop(col_name), + dtype='O'), + y[:, i:i + 1].astype(int, copy=False)) + for i, col_name in enumerate(target_columns) + ]) + elif any(is_classification): + raise ValueError('Mix of nominal and non-nominal targets is ' + 'not currently supported') + + # reshape y back to 1-D array, if there is only 1 target column; + # back to None if there are not target columns + if y.shape[1] == 1: + y = y.reshape((-1,)) + elif y.shape[1] == 0: + y = None + return X, y, frame, nominal_attributes + + out = _retry_with_clean_cache(url, data_home)( + _load_arff_response)(url, data_home, + return_type=return_type, + encode_nominal=not as_frame, + parse_arff=parse_arff) + X, y, frame, nominal_attributes = postprocess(*out) + + return Bunch(data=X, target=y, frame=frame, + categories=nominal_attributes, + feature_names=data_columns, + target_names=target_columns) def _verify_target_data_type(features_dict, target_columns): @@ -705,25 +800,6 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, data_columns = _valid_data_column_names(features_list, target_columns) - # prepare which columns and data types should be returned for the X and y - features_dict = {feature['name']: feature for feature in features_list} - - # XXX: col_slice_y should be all nominal or all numeric - _verify_target_data_type(features_dict, target_columns) - - col_slice_y = [int(features_dict[col_name]['index']) - for col_name in target_columns] - - col_slice_x = [int(features_dict[col_name]['index']) - for col_name in data_columns] - for col_idx in col_slice_y: - feat = features_list[col_idx] - nr_missing = int(feat['number_of_missing_values']) - if nr_missing > 0: - raise ValueError('Target column {} has {} missing values. ' - 'Missing values are not supported for target ' - 'columns. 
'.format(feat['name'], nr_missing)) - # determine arff encoding to return if not return_sparse: # The shape must include the ignored features to keep the right indexes @@ -734,66 +810,21 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, shape = None # obtain the data - arff = _download_data_arff(data_description['file_id'], return_sparse, - data_home, encode_nominal=not as_frame) + url = _DATA_FILE.format(data_description['file_id']) + bunch = _download_data_to_bunch(url, return_sparse, data_home, + as_frame=as_frame, + features_list=features_list, shape=shape, + target_columns=target_columns, + data_columns=data_columns) + + if return_X_y: + return bunch.data, bunch.target description = "{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) - nominal_attributes = None - frame = None - if as_frame: - columns = data_columns + target_columns - frame = _convert_arff_data_dataframe(arff, columns, features_dict) - X = frame[data_columns] - if len(target_columns) >= 2: - y = frame[target_columns] - elif len(target_columns) == 1: - y = frame[target_columns[0]] - else: - y = None - else: - # nominal attributes is a dict mapping from the attribute name to the - # possible values. Includes also the target column (which will be - # popped off below, before it will be packed in the Bunch object) - nominal_attributes = {k: v for k, v in arff['attributes'] - if isinstance(v, list) and - k in data_columns + target_columns} - - X, y = _convert_arff_data(arff['data'], col_slice_x, - col_slice_y, shape) - - is_classification = {col_name in nominal_attributes - for col_name in target_columns} - if not is_classification: - # No target - pass - elif all(is_classification): - y = np.hstack([ - np.take( - np.asarray(nominal_attributes.pop(col_name), dtype='O'), - y[:, i:i + 1].astype(int, copy=False)) - for i, col_name in enumerate(target_columns) - ]) - elif any(is_classification): - raise ValueError('Mix of nominal and non-nominal targets is not ' - 'currently supported') - - # reshape y back to 1-D array, if there is only 1 target column; back - # to None if there are not target columns - if y.shape[1] == 1: - y = y.reshape((-1,)) - elif y.shape[1] == 0: - y = None - - if return_X_y: - return X, y - - bunch = Bunch( - data=X, target=y, frame=frame, feature_names=data_columns, - target_names=target_columns, + bunch.update( DESCR=description, details=data_description, - categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index f9969c75d5c8e..44fe392e42e74 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -12,8 +12,9 @@ from sklearn import config_context from sklearn.datasets import fetch_openml from sklearn.datasets._openml import (_open_openml_url, + _arff, + _DATA_FILE, _get_data_description_by_id, - _download_data_arff, _get_local_path, _retry_with_clean_cache, _feature_to_dtype) @@ -56,8 +57,13 @@ def decode_column(data_bunch, col_idx): if sparse is True: raise ValueError('This test is not intended for sparse data, to keep ' 'code relatively simple') - data_arff = _download_data_arff(data_description['file_id'], - sparse, None, False) + url = _DATA_FILE.format(data_description['file_id']) + with _open_openml_url(url, data_home=None) as f: + data_arff = _arff.load((line.decode('utf-8') for line in f), + return_type=(_arff.COO if sparse + else _arff.DENSE_GEN), + 
encode_nominal=False) + data_downloaded = np.array(list(data_arff['data']), dtype='O') for i in range(len(data_bunch.feature_names)): @@ -176,6 +182,15 @@ def info(self): return {'Content-Encoding': 'gzip'} return {} + def __iter__(self): + return iter(self.data) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False + def _file_name(url, suffix): return (re.sub(r'\W', '-', url[len("https://openml.org/"):]) + suffix + path_suffix) From a35b892522499bfe7a0e5fdfdfbd15752e63fbb0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 26 Apr 2020 14:37:07 -0400 Subject: [PATCH 069/125] Fix Mypy issue in _openml.py (#17047) --- doc/developers/contributing.rst | 4 +++- sklearn/datasets/_openml.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 33ab3fcecb887..c886119e908c1 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -431,7 +431,9 @@ You can check for common programming errors with the following tools: mypy --ignore-missing-import sklearn - must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, + must not produce new errors in your pull request. Using `# type: ignore` + annotation can be a workaround for a few cases that are not supported by + mypy, in particular, - when importing C or Cython modules - on properties with decorators diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 10f40dc8906bf..4040641a17574 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -506,7 +506,7 @@ def _download_data_to_bunch(url, sparse, data_home, *, parse_arff = partial(_convert_arff_data_dataframe, columns=columns, features_dict=features_dict) - def postprocess(frame): + def postprocess(frame): # type:ignore X = frame[data_columns] if len(target_columns) >= 2: y = frame[target_columns] @@ -527,7 +527,7 @@ def parse_arff(arff): k in data_columns + target_columns} return X, y, nominal_attributes - def postprocess(X, y, nominal_attributes): + def postprocess(X, y, nominal_attributes): # type:ignore is_classification = {col_name in nominal_attributes for col_name in target_columns} if not is_classification: From 7ede0285db17bc64719749255027032651388edf Mon Sep 17 00:00:00 2001 From: Alex Liang Date: Sun, 26 Apr 2020 16:04:23 -0400 Subject: [PATCH 070/125] DOC add detail about flip_y parameter in make_classification (#17049) --- sklearn/datasets/_samples_generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index fe0ac680ecd79..7642b6fb7dc59 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -102,7 +102,8 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, flip_y : float, optional (default=0.01) The fraction of samples whose class is assigned randomly. Larger values introduce noise in the labels and make the classification - task harder. + task harder. Note that the default setting flip_y > 0 might lead + to less than n_classes in y in some cases. class_sep : float, optional (default=1.0) The factor multiplying the hypercube size. 
Larger values spread From f624f4e48781927cb3c8d140b1f033fcdb809d67 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 15:21:31 +0200 Subject: [PATCH 071/125] DOC cleaning up to 0.23/whats new (#17015) --- doc/whats_new.rst | 1 + doc/whats_new/v0.23.rst | 225 +++++++++++++++++++++-------------- doc/whats_new/v0.24.rst | 55 +++++++++ maint_tools/whats_missing.sh | 2 +- 4 files changed, 194 insertions(+), 89 deletions(-) create mode 100644 doc/whats_new/v0.24.rst diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 7b84374bd5146..66f2a3818cec8 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -12,6 +12,7 @@ on libraries.io to be notified when new versions are released. .. toctree:: :maxdepth: 1 + Version 0.24 Version 0.23 Version 0.22 Version 0.21 diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 2bd44dcd54486..4357845885e3f 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -22,14 +22,44 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, - and :class:`ensemble.IsolationForest`. |Fix| - -- Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, +- |Fix| :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, + and :class:`ensemble.IsolationForest`. +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` and + ``algorithm="full"``. +- |Fix| :class:`cluster.Birch` +- |Fix| :func:`compose.ColumnTransformer.get_feature_names` +- |Fix| :func:`compose.ColumnTransformer.fit` +- |Fix| :func:`datasets.make_multilabel_classification` +- |Fix| :class:`decomposition.PCA` with `n_components='mle'` +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` with float32 dtype input. +- |Fix| :func:`decomposition.KernelPCA.inverse_transform` +- |API| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegrerssor` +- |Fix| ``estimator_samples_`` in :class:`ensemble.BaggingClassifier`, + :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` +- |Fix| :class:`ensemble.StackingClassifier` and + :class:`ensemble.StackingRegressor` with `sample_weight` +- |Fix| :class:`gaussian_process.GaussianProcessRegressor` +- |Fix| :class:`linear_model.RANSACRegressor` with ``sample_weight``. +- |Fix| :class:`linear_model.RidgeClassifierCV` +- |Fix| :func:`metrics.mean_squared_error` with `squared` and + `multioutput='raw_values'`. +- |Fix| :func:`metrics.mutual_info_score` with negative scores. +- |Fix| :func:`metrics.confusion_matrix` with zero length `y_true` and `y_pred` +- |Fix| :class:`neural_network.MLPClassifier` +- |Fix| :class:`preprocessing.StandardScaler` with `partial_fit` and sparse + input. +- |Fix| :class:`preprocessing.Normalizer` with norm='max' +- |Fix| Any model using the :func:`svm.libsvm` or the :func:`svm.liblinear` solver, including :class:`svm.LinearSVC`, :class:`svm.LinearSVR`, :class:`svm.NuSVC`, :class:`svm.NuSVR`, :class:`svm.OneClassSVM`, :class:`svm.SVC`, :class:`svm.SVR`, :class:`linear_model.LogisticRegression`. 
- |Efficiency| |Fix| +- |Fix| :class:`tree.DecisionTreeClassifier`, :class:`tree.ExtraTreeClassifier` and + :class:`ensemble.GradientBoostingClassifier` as well as ``predict`` method of + :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeRegressor`, and + :class:`ensemble.GradientBoostingRegressor` and read-only float32 input in + ``predict``, ``decision_path`` and ``predict_proba``. Details are listed in the changelog below. @@ -53,19 +83,29 @@ Changelog :mod:`sklearn.cluster` ...................... -- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more - more memory efficient implementation of single linkage clustering. - :pr:`11514` by :user:`Leland McInnes `. -- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with - ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by - :user:`Erich Schubert `. - - |Efficiency| :class:`cluster.Birch` implementation of the predict method avoids high memory footprint by calculating the distances matrix using a chunked scheme. :pr:`16149` by :user:`Jeremie du Boisberranger ` and :user:`Alex Shacked `. +- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more + optimized implementation. Parallelism is now over the data instead of over + initializations allowing better scalability. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.KMeans` now supports sparse data when + `solver = "elkan"`. :pr:`11950` by + :user:`Jeremie du Boisberranger `. + +- |Enhancement| :class:`cluster.AgglomerativeClustering` has a faster and more + memory efficient implementation of single linkage clustering. + :pr:`11514` by :user:`Leland McInnes `. + +- |Fix| :class:`cluster.KMeans` with ``algorithm="elkan"`` now converges with + ``tol=0`` as with the default ``algorithm="full"``. :pr:`16075` by + :user:`Erich Schubert `. + - |Fix| Fixed a bug in :class:`cluster.Birch` where the `n_clusters` parameter could not have a `np.int64` type. :pr:`16484` by :user:`Jeremie du Boisberranger `. @@ -81,47 +121,28 @@ Changelog deprecated. It has no effect. :pr:`11950` by :user:`Jeremie du Boisberranger `. -- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more - optimized implementation. Parallelism is now over the data instead of over - initializations allowing better scalability. :pr:`11950` by - :user:`Jeremie du Boisberranger `. - -- |Enhancement| :class:`cluster.KMeans` now supports sparse data when - `solver = "elkan"`. :pr:`11950` by - :user:`Jeremie du Boisberranger `. - :mod:`sklearn.compose` ...................... -- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now - returns correct results when one of the transformer steps applies on an - empty list of columns :pr:`15963` by `Roman Yurchak`_. - - |Efficiency| :class:`compose.ColumnTransformer` is now faster when working with dataframes and strings are used to specific subsets of data for transformers. :pr:`16431` by `Thomas Fan`_. -- |Fix| :func:`compose.ColumnTransformer.fit` will error when selecting - a column name that is not unique in the dataframe. :pr:`16431` by - `Thomas Fan`_. - - |Enhancement| :class:`compose.ColumnTransformer` method ``get_feature_names`` now supports `'passthrough'` columns, with the feature name being either the column name for a dataframe, or `'xi'` for column index `i`. :pr:`14048` by :user:`Lewis Ball `. -:mod:`sklearn.datasets` -....................... 
+- |Fix| :class:`compose.ColumnTransformer` method ``get_feature_names`` now + returns correct results when one of the transformer steps applies on an + empty list of columns :pr:`15963` by `Roman Yurchak`_. -- |Enhancement| Added ``return_centers`` parameter in - :func:`datasets.make_blobs`, which can be used to return - centers for each cluster. - :pr:`15709` by :user:`` and - :user:`Venkatachalam N `. +- |Fix| :func:`compose.ColumnTransformer.fit` will error when selecting + a column name that is not unique in the dataframe. :pr:`16431` by + `Thomas Fan`_. -- |Enhancement| Functions :func:`datasets.make_circles` and - :func:`datasets.make_moons` now accept two-element tuple. - :pr:`15707` by :user:`Maciej J Mikulski `. +:mod:`sklearn.datasets` +....................... - |Efficiency| :func:`datasets.fetch_openml` has reduced memory usage because it no longer stores the full dataset text stream in memory. :pr:`16084` by @@ -138,6 +159,16 @@ Changelog ``DataFrame`` by setting `as_frame=True`. :pr:`15980` by :user:`wconnell` and :user:`Reshama Shaikh `. +- |Enhancement| Added ``return_centers`` parameter in + :func:`datasets.make_blobs`, which can be used to return + centers for each cluster. + :pr:`15709` by :user:`` and + :user:`Venkatachalam N `. + +- |Enhancement| Functions :func:`datasets.make_circles` and + :func:`datasets.make_moons` now accept two-element tuple. + :pr:`15707` by :user:`Maciej J Mikulski `. + - |Fix| :func:`datasets.make_multilabel_classification` now generates `ValueError` for arguments `n_classes < 1` OR `length < 1`. :pr:`16006` by :user:`Rushabh Vasani `. @@ -145,20 +176,23 @@ Changelog :mod:`sklearn.decomposition` ............................ +- |Enhancement| :class:`decomposition.NMF` and + :func:`decomposition.non_negative_factorization` now preserves float32 dtype. + :pr:`16280` by :user:`Jeremie du Boisberranger `. + +- |Enhancement| :func:`TruncatedSVD.transform` is now faster on given sparse + ``csc`` matrices. :pr:`16837` by :user:`wornbb`. + - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will exclusively choose the components that explain the variance greater than `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` - |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly handles small eigenvalues, and does not infer 0 as the correct number of - components. :pr: `4441` by :user:`Lisa Schwetlick `, and + components. :pr:`16224` by :user:`Lisa Schwetlick `, and :user:`Gelavizh Ahmadi ` and :user:`Marija Vlajic Wheeler ` and :pr:`16841` by `Nicolas Hug`_. -- |Enhancement| :class:`decomposition.NMF` and - :func:`decomposition.non_negative_factorization` now preserves float32 dtype. - :pr:`16280` by :user:`Jeremie du Boisberranger `. - - |Fix| :class:`decomposition.KernelPCA` method ``inverse_transform`` now applies the correct inverse transform to the transformed data. :pr:`16655` by :user:`Lewis Ball `. @@ -174,9 +208,22 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor` now support :term:`sample_weight`. :pr:`14696` by `Adrin Jalali`_ and `Nicolas Hug`_. +- |Feature| Early stopping in + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` is now determined with a + new `early_stopping` parameter instead of `n_iter_no_change`. Default value + is 'auto', which enables early stopping if there are at least 10,000 + samples in the training set. :pr:`14516` by :user:`Johann Faouzi + `. 
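As a rough illustration of the entry above (an aside, not part of the diff itself), the
new ``early_stopping`` parameter can also be forced on or off instead of relying on the
``'auto'`` heuristic. The sketch below assumes the 0.23 API, where the estimator is
still behind the experimental import::

    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    # Always use early stopping: hold out 10% of the training data and stop
    # once the validation score has not improved for 10 iterations.
    clf = HistGradientBoostingClassifier(early_stopping=True,
                                         validation_fraction=0.1,
                                         n_iter_no_change=10)

With ``early_stopping=False`` the model trains for the full ``max_iter`` iterations
regardless of the training set size.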
+ +- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor` now support monotonic + constraints, useful when features are supposed to have a positive/negative + effect on the target. :pr:`15582` by `Nicolas Hug`_. + - |API| Added boolean `verbose` flag to classes: :class:`ensemble.VotingClassifier` and :class:`ensemble.VotingRegressor`. - :pr:`15991` by :user:`Sam Bail `, + :pr:`16069` by :user:`Sam Bail `, :user:`Hanna Bruce MacDonald `, :user:`Reshama Shaikh `, and :user:`Chiara Marmo `. @@ -191,20 +238,7 @@ Changelog :class:`ensemble.HistGradientBoostingRegressor`. The depth now corresponds to the number of edges to go from the root to the deepest leaf. Stumps (trees with one split) are now allowed. - :pr: `16182` by :user:`Santhosh B ` - -- |Feature| Early stopping in - :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` is now determined with a - new `early_stopping` parameter instead of `n_iter_no_change`. Default value - is 'auto', which enables early stopping if there are at least 10,000 - samples in the training set. :pr:`14516` by :user:`Johann Faouzi - `. - -- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and - :class:`ensemble.HistGradientBoostingRegressor` now support monotonic - constraints, useful when features are supposed to have a positive/negative - effect on the target. :pr:`15582` by `Nicolas Hug`_. + :pr:`16182` by :user:`Santhosh B ` - |Fix| Fixed a bug in :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor` and :class:`ensemble.IsolationForest` @@ -278,11 +312,6 @@ Changelog :class:`linear_model:Lasso` for dense feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen `. -- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit - method of :class:`linear_model.RANSACRegressor`, it would not be passed to - the wrapped `base_estimator` during the fitting of the final model. - :pr:`15573` by :user:`Jeremy Alexandre `. - - |Efficiency| :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV` now does not allocate a potentially large array to store dual coefficients for all hyperparameters @@ -290,6 +319,16 @@ Changelog `store_cv_values` is `True`. :pr:`15652` by :user:`Jérôme Dockès `. +- |Enhancement| :class:`linear_model.LassoLars` and + :class:`linear_model.Lars` now support a `jitter` parameter that adds + random noise to the target. This might help with stability in some edge + cases. :pr:`15179` by :user:`angelaambroz`. + +- |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit + method of :class:`linear_model.RANSACRegressor`, it would not be passed to + the wrapped `base_estimator` during the fitting of the final model. + :pr:`15773` by :user:`Jeremy Alexandre `. + - |Fix| add `best_score_` attribute to :class:`linear_model.RidgeCV` and :class:`linear_model.RidgeClassifierCV`. :pr:`15653` by :user:`Jérôme Dockès `. @@ -299,6 +338,11 @@ Changelog instead of predictions. :pr:`14848` by :user:`Venkatachalam N `. +- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary + iteration when `solver='newton-cg'` by checking for inferior or equal instead + of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. + :pr:`16266` by :user:`Rushabh Vasani `. 
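The monotonic constraints feature noted above lends itself to a short sketch as well
(again an aside, not part of the diff); it assumes the 0.23 interface in which
``monotonic_cst`` takes one value per feature, with ``1`` for a positive effect,
``-1`` for a negative effect and ``0`` for no constraint::

    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor

    # Constrain the first feature to push predictions up, the second to push
    # them down, and leave the third feature unconstrained.
    gbdt = HistGradientBoostingRegressor(monotonic_cst=[1, -1, 0])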
+ - |API| Deprecated public attributes `standard_coef_`, `standard_intercept_`, `average_coef_`, and `average_intercept_` in :class:`linear_model.SGDClassifier`, @@ -307,31 +351,15 @@ Changelog :class:`linear_model.PassiveAggressiveRegressor`. :pr:`16261` by :user:`Carlos Brandt `. -- |Fix| :class:`linear_model.LogisticRegression` will now avoid an unnecessary - iteration when `solver='newton-cg'` by checking for inferior or equal instead - of strictly inferior for maximum of `absgrad` and `tol` in `utils.optimize._newton_cg`. - :pr:`16266` by :user:`Rushabh Vasani `. - - |Fix| |Efficiency| :class:`linear_model.ARDRegression` is more stable and much faster when `n_samples > n_features`. It can now scale to hundreds of thousands of samples. The stability fix might imply changes in the number of non-zero coefficients and in the predicted output. :pr:`16849` by `Nicolas Hug`_. -- |Enhancement| :class:`linear_model.LassoLars` and - :class:`linear_model.Lars` now support a `jitter` parameter that adds - random noise to the target. This might help with stability in some edge - cases. :pr:`15179` by :user:`angelaambroz`. - :mod:`sklearn.metrics` ...................... -- |API| Changed the formatting of values in - :meth:`metrics.ConfusionMatrixDisplay.plot` and - :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' - or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and - `Thomas Fan`_. - - |Enhancement| :func:`metrics.pairwise.pairwise_distances_chunked` now allows its ``reduce_func`` to not have a return value, enabling in-place operations. :pr:`16397` by `Joel Nothman`_. @@ -349,6 +377,12 @@ Changelog the `labels` parameter. :pr:`16442` by `Kyle Parsons `. +- |API| Changed the formatting of values in + :meth:`metrics.ConfusionMatrixDisplay.plot` and + :func:`metrics.plot_confusion_matrix` to pick the shorter format (either '2g' + or 'd'). :pr:`16159` by :user:`Rick Mackenbach ` and + `Thomas Fan`_. + :mod:`sklearn.model_selection` .............................. @@ -398,14 +432,14 @@ Changelog :mod:`sklearn.preprocessing` ............................ -- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at - transforming. :pr:`15762` by `Thomas Fan`_. - - |Feature| argument `drop` of :class:`preprocessing.OneHotEncoder` will now accept value 'if_binary' and will drop the first category of each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. +- |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at + transforming. :pr:`15762` by `Thomas Fan`_. + - |Fix| Fix a bug in :class:`preprocessing.StandardScaler` which was incorrectly computing statistics when calling `partial_fit` on sparse inputs. :pr:`16466` by :user:`Guillaume Lemaitre `. @@ -438,16 +472,16 @@ Changelog number of samples (LibSVM) or the number of features (LibLinear) is large. :pr:`13511` by :user:`Sylvain Marié `. -- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and - `probB_`, are now deprecated as they were not useful. :pr:`15558` by - `Thomas Fan`_. - - |Fix| Fix use of custom kernel not taking float entries such as string kernels in :class:`svm.SVC` and :class:`svm.SVR`. Note that custom kennels are now expected to validate their input where they previously received valid numeric arrays. :pr:`11296` by `Alexandre Gramfort`_ and :user:`Georgi Peev `. +- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and + `probB_`, are now deprecated as they were not useful. :pr:`15558` by + `Thomas Fan`_. 
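The new ``drop='if_binary'`` option of :class:`preprocessing.OneHotEncoder` listed
above can be pictured with a small sketch (not part of the diff; it assumes the 0.23
behaviour of dropping the first category only for features with exactly two
categories)::

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    X = np.array([['a', 'x'], ['b', 'y'], ['a', 'z']], dtype=object)
    enc = OneHotEncoder(drop='if_binary', sparse=False).fit(X)
    # The first column is binary and is encoded as a single 0/1 column;
    # the second column keeps one indicator column per category.
    print(enc.transform(X).shape)  # (3, 4)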
+ :mod:`sklearn.tree` ................... @@ -487,14 +521,29 @@ Changelog Miscellaneous ............. +- |Enhancement| ``scikit-learn`` now works with ``mypy`` without errors. + :pr:`16726` by `Roman Yurchak`_. + - |API| Most estimators now expose a `n_features_in_` attribute. This attribute is equal to the number of features passed to the `fit` method. See `SLEP010 `_ - for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_. + for details. :pr:`16112` by `Nicolas Hug`_. - |API| Estimators now have a `requires_y` tags which is False by default except for estimators that inherit from `~sklearn.base.RegressorMixin` or `~sklearn.base.ClassifierMixin`. This tag is used to ensure that a proper error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. + +- |API| Most constructor and function parameters are now expected to be passed + as a keyword and not positional. :issue:`15005` by `Joel Nothman`_, + `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. See `SLEP009 + `_ + for more details. + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst new file mode 100644 index 0000000000000..dd4ab30a7f2ff --- /dev/null +++ b/doc/whats_new/v0.24.rst @@ -0,0 +1,55 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_24: + +Version 0.24.0 +============== + +**In Development** + + +.. include:: changelog_legend.inc + +Put the changes in their relevant module. + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- items +- items + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +.. + Entries should be grouped by module (in alphabetic order) and prefixed with + one of the labels: |MajorFeature|, |Feature|, |Efficiency|, |Enhancement|, + |Fix| or |API| (see whats_new.rst for descriptions). + Entries should be ordered by those labels (e.g. |Fix| after |Efficiency|). + Changes not specific to a module should be listed under *Multiple Modules* + or *Miscellaneous*. + Entries should end with: + :pr:`123456` by :user:`Joe Bloggs `. + where 123456 is the *pull request* number, not the issue number. + +:mod:`sklearn.module` +..................... 
+ + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.20, including: diff --git a/maint_tools/whats_missing.sh b/maint_tools/whats_missing.sh index 54ce06f8bbcf5..5b2d6b8fd8a01 100755 --- a/maint_tools/whats_missing.sh +++ b/maint_tools/whats_missing.sh @@ -19,7 +19,7 @@ logged_prs() { mentioned_issues() { cat doc/whats_new/v$to_file.rst | - grep -o 'issue:`[0-9]\+`' | + grep -o 'issue:`[0-9]\+`\|pr:`[0-9]\+`' | grep -o '[0-9]\+' } From 76ef8b0ef07f9c03b97d29a51e1543be7720e85a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 09:42:01 -0400 Subject: [PATCH 072/125] API kwonly for utils (#17046) * kwonly for utils * More * fixed some * some more * iwannagohomepls * accept_sparse not kwonly anymore --- sklearn/ensemble/_forest.py | 6 +- .../_univariate_selection.py | 5 +- sklearn/feature_selection/tests/test_base.py | 2 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_stochastic_gradient.py | 4 +- sklearn/manifold/_spectral_embedding.py | 3 +- sklearn/neighbors/_nca.py | 8 +- sklearn/neural_network/_rbm.py | 2 +- sklearn/preprocessing/_data.py | 2 +- sklearn/utils/__init__.py | 20 ++-- sklearn/utils/class_weight.py | 5 +- sklearn/utils/multiclass.py | 7 +- sklearn/utils/tests/test_class_weight.py | 22 ++-- sklearn/utils/tests/test_validation.py | 5 +- sklearn/utils/validation.py | 109 ++++++++++-------- 15 files changed, 118 insertions(+), 93 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 40a1c2434316c..98d606961c1e1 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -159,9 +159,11 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, if class_weight == 'subsample': with catch_warnings(): simplefilter('ignore', DeprecationWarning) - curr_sample_weight *= compute_sample_weight('auto', y, indices) + curr_sample_weight *= compute_sample_weight('auto', y, + indices=indices) elif class_weight == 'balanced_subsample': - curr_sample_weight *= compute_sample_weight('balanced', y, indices) + curr_sample_weight *= compute_sample_weight('balanced', y, + indices=indices) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 21a2ddc10a1eb..7e873b3a2b65c 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -146,7 +146,7 @@ def f_classif(X, y): chi2: Chi-squared stats of non-negative features for classification tasks. f_regression: F-value between label/feature for regression tasks. """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo']) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo']) args = [X[safe_mask(X, y == k)] for k in np.unique(y)] return f_oneway(*args) @@ -277,7 +277,8 @@ def f_regression(X, y, center=True): SelectPercentile: Select features based on percentile of the highest scores. 
""" - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64) n_samples = X.shape[0] # compute centered values diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index d1aaccde0efa3..9515bdc32c600 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -15,7 +15,7 @@ def __init__(self, step=2): self.step = step def fit(self, X, y=None): - X = check_array(X, 'csc') + X = check_array(X, accept_sparse='csc') self.n_input_feats = X.shape[1] return self diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index f4430c5bcac55..cd57b9612b362 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -131,7 +131,7 @@ def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, if Xy is None: X_sparse = sparse.isspmatrix(X) sparse_center = X_sparse and (fit_intercept or normalize) - X = check_array(X, 'csc', + X = check_array(X, accept_sparse='csc', copy=(copy_X and fit_intercept and not X_sparse)) if not X_sparse: # X can be touched inplace thanks to the above line @@ -435,10 +435,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, # We expect X and y to be already Fortran ordered when bypassing # checks if check_input: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], + X = check_array(X, accept_sparse='csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) - y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False, - ensure_2d=False) + y = check_array(y, accept_sparse='csc', dtype=X.dtype.type, + order='F', copy=False, ensure_2d=False) if Xy is not None: # Xy should be a 1d contiguous array or a 2D C ordered array Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False, @@ -1095,7 +1095,8 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, # Do the ordering and type casting here, as if it is done in the path, # X is copied and a reference is kept here - X_train = check_array(X_train, 'csc', dtype=dtype, order=X_order) + X_train = check_array(X_train, accept_sparse='csc', dtype=dtype, + order=X_order) alphas, coefs, _ = path(X_train, y_train, **path_params) del X_train, y_train diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 1606a7ff35adb..94428f61f1327 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -487,8 +487,8 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init): - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, + order="C", accept_large_sparse=False) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 0c8bb4902c99a..a42c97bb5d6b4 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -301,7 +301,8 @@ def spectral_embedding(adjacency, *, n_components=8, eigen_solver=None, # matrix to the solver and afterward set it back to the original. 
diag_shift = 1e-5 * sparse.eye(laplacian.shape[0]) laplacian += diag_shift - ml = smoothed_aggregation_solver(check_array(laplacian, 'csr')) + ml = smoothed_aggregation_solver(check_array(laplacian, + accept_sparse='csr')) laplacian -= diag_shift M = ml.aspreconditioner() diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 9705c9050f6c7..8920b2d99ed02 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -308,7 +308,7 @@ def _validate_params(self, X, y): # Check the preferred dimensionality of the projected space if self.n_components is not None: check_scalar( - self.n_components, 'n_components', numbers.Integral, 1) + self.n_components, 'n_components', numbers.Integral, min_val=1) if self.n_components > X.shape[1]: raise ValueError('The preferred dimensionality of the ' @@ -327,9 +327,9 @@ def _validate_params(self, X, y): .format(X.shape[1], self.components_.shape[1])) - check_scalar(self.max_iter, 'max_iter', numbers.Integral, 1) - check_scalar(self.tol, 'tol', numbers.Real, 0.) - check_scalar(self.verbose, 'verbose', numbers.Integral, 0) + check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1) + check_scalar(self.tol, 'tol', numbers.Real, min_val=0.) + check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0) if self.callback is not None: if not callable(self.callback): diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 67e1d68a3607e..fcb4e90772598 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -357,7 +357,7 @@ def fit(self, X, y=None): n_batches = int(np.ceil(float(n_samples) / self.batch_size)) batch_slices = list(gen_even_slices(n_batches * self.batch_size, - n_batches, n_samples)) + n_batches, n_samples=n_samples)) verbose = self.verbose begin = time.time() for iteration in range(1, self.n_iter + 1): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index f9af3dbac6d0d..cc8776951f114 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -1707,7 +1707,7 @@ def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): else: raise ValueError("'%d' is not a supported axis" % axis) - X = check_array(X, sparse_format, copy=copy, + X = check_array(X, accept_sparse=sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES) if axis == 0: X = X.T diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index aac6e292a198a..afde7614070fd 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -29,7 +29,8 @@ assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar) + check_symmetric, check_scalar, + _deprecate_positional_args) from .. import get_config @@ -314,10 +315,10 @@ def safe_indexing(X, indices, axis=0): CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ - return _safe_indexing(X, indices, axis) + return _safe_indexing(X, indices, axis=axis) -def _safe_indexing(X, indices, axis=0): +def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. warning:: @@ -684,7 +685,8 @@ def shuffle(*arrays, **options): return resample(*arrays, **options) -def safe_sqr(X, copy=True): +@_deprecate_positional_args +def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. 
Parameters @@ -723,7 +725,8 @@ def _chunk_generator(gen, chunksize): return -def gen_batches(n, batch_size, min_batch_size=0): +@_deprecate_positional_args +def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size @@ -772,7 +775,8 @@ def gen_batches(n, batch_size, min_batch_size=0): yield slice(start, n) -def gen_even_slices(n, n_packs, n_samples=None): +@_deprecate_positional_args +def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. Parameters @@ -957,8 +961,8 @@ def _print_elapsed_time(source, message=None): timeit.default_timer() - start)) -def get_chunk_n_rows(row_bytes, max_n_rows=None, - working_memory=None): +@_deprecate_positional_args +def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory Parameters diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py index 5f785cb36df45..8c64e33e1d0d4 100644 --- a/sklearn/utils/class_weight.py +++ b/sklearn/utils/class_weight.py @@ -4,6 +4,8 @@ import numpy as np +from .validation import _deprecate_positional_args + def compute_class_weight(class_weight, classes, y): """Estimate class weights for unbalanced datasets. @@ -69,7 +71,8 @@ def compute_class_weight(class_weight, classes, y): return weight -def compute_sample_weight(class_weight, y, indices=None): +@_deprecate_positional_args +def compute_sample_weight(class_weight, y, *, indices=None): """Estimate sample weights by class for unbalanced datasets. Parameters diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 3301ac977b4b9..8e471d5fdf577 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -27,7 +27,9 @@ def _unique_multiclass(y): def _unique_indicator(y): - return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) _FN_UNIQUE_LABELS = { @@ -83,7 +85,8 @@ def unique_labels(*ys): # Check consistency for the indicator format if (label_type == "multilabel-indicator" and - len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py index 487da5b431be0..067b12cc32f28 100644 --- a/sklearn/utils/tests/test_class_weight.py +++ b/sklearn/utils/tests/test_class_weight.py @@ -192,39 +192,41 @@ def test_compute_sample_weight_with_subsample(): # Test compute_sample_weight with subsamples specified. 
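    # Illustrative aside, not part of the patch: with `indices` now
    # keyword-only,
    #     compute_sample_weight("balanced", y, range(6))
    # emits a FutureWarning ("Pass indices=... as keyword args. From version
    # 0.25 passing these as positional arguments will result in an error"),
    # whereas
    #     compute_sample_weight("balanced", y, indices=range(6))
    # keeps working silently, which is what the updated assertions below use.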
# Test with balanced classes and all samples present y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with column vector of balanced classes and all samples present y = np.asarray([[1], [1], [1], [2], [2], [2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1.]) # Test with a subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, range(4)) + sample_weight = compute_sample_weight("balanced", y, indices=range(4)) assert_array_almost_equal(sample_weight, [2. / 3, 2. / 3, 2. / 3, 2., 2., 2.]) # Test with a bootstrap subsample y = np.asarray([1, 1, 1, 2, 2, 2]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) expected_balanced = np.asarray([0.6, 0.6, 0.6, 3., 3., 3.]) assert_array_almost_equal(sample_weight, expected_balanced) # Test with a bootstrap subsample for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]]) - sample_weight = compute_sample_weight("balanced", y, [0, 1, 1, 2, 2, 3]) + sample_weight = compute_sample_weight("balanced", y, + indices=[0, 1, 1, 2, 2, 3]) assert_array_almost_equal(sample_weight, expected_balanced ** 2) # Test with a missing class y = np.asarray([1, 1, 1, 2, 2, 2, 3]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) # Test with a missing class for multi-output y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]]) - sample_weight = compute_sample_weight("balanced", y, range(6)) + sample_weight = compute_sample_weight("balanced", y, indices=range(6)) assert_array_almost_equal(sample_weight, [1., 1., 1., 1., 1., 1., 0.]) @@ -237,15 +239,15 @@ def test_compute_sample_weight_errors(): with pytest.raises(ValueError): compute_sample_weight("ni", y) with pytest.raises(ValueError): - compute_sample_weight("ni", y, range(4)) + compute_sample_weight("ni", y, indices=range(4)) with pytest.raises(ValueError): compute_sample_weight("ni", y_) with pytest.raises(ValueError): - compute_sample_weight("ni", y_, range(4)) + compute_sample_weight("ni", y_, indices=range(4)) # Not "balanced" for subsample with pytest.raises(ValueError): - compute_sample_weight({1: 2, 2: 1}, y, range(4)) + compute_sample_weight({1: 2, 2: 1}, y, indices=range(4)) # Not a list or preset for multi-output with pytest.raises(ValueError): diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index b178ccc148d9d..418f037936c64 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -63,7 +63,7 @@ def test_as_float_array(): X = X.astype(np.int64) X2 = as_float_array(X, copy=True) # Checking that the array wasn't overwritten - assert as_float_array(X, False) is not X + assert as_float_array(X, copy=False) is not X assert X2.dtype == np.float64 # Test int dtypes <= 32bit tested_dtypes = [np.bool, @@ -912,7 +912,8 @@ def test_check_scalar_valid(x, target_type, min_val, max_val): """Test that check_scalar returns no error/warning if 
valid inputs are provided""" with pytest.warns(None) as record: - check_scalar(x, "test_name", target_type, min_val, max_val) + check_scalar(x, "test_name", target_type=target_type, + min_val=min_val, max_val=max_val) assert len(record) == 0 diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 953584fff0f8a..8ee18371a3009 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -36,6 +36,44 @@ warnings.simplefilter('ignore', NonBLASDotWarning) +def _deprecate_positional_args(f): + """Decorator for methods that issues warnings for positional arguments + + Using the keyword-only argument syntax in pep 3102, arguments after the + * will issue a warning when passed as a positional argument. + + Parameters + ---------- + f : function + function to check arguments on + """ + sig = signature(f) + kwonly_args = [] + all_args = [] + + for name, param in sig.parameters.items(): + if param.kind == Parameter.POSITIONAL_OR_KEYWORD: + all_args.append(name) + elif param.kind == Parameter.KEYWORD_ONLY: + kwonly_args.append(name) + + @wraps(f) + def inner_f(*args, **kwargs): + extra_args = len(args) - len(all_args) + if extra_args > 0: + # ignore first 'self' argument for instance methods + args_msg = ['{}={}'.format(name, arg) + for name, arg in zip(kwonly_args[:extra_args], + args[-extra_args:])] + warnings.warn("Pass {} as keyword args. From version 0.25 " + "passing these as positional arguments will " + "result in an error".format(", ".join(args_msg)), + FutureWarning) + kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) + return f(**kwargs) + return inner_f + + def _assert_all_finite(X, allow_nan=False, msg_dtype=None): """Like assert_all_finite, but only for ndarray.""" # validation is also imported in extmath @@ -67,7 +105,8 @@ def _assert_all_finite(X, allow_nan=False, msg_dtype=None): raise ValueError("Input contains NaN") -def assert_all_finite(X, allow_nan=False): +@_deprecate_positional_args +def assert_all_finite(X, *, allow_nan=False): """Throw a ValueError if X contains NaN or infinity. Parameters @@ -79,7 +118,8 @@ def assert_all_finite(X, allow_nan=False): _assert_all_finite(X.data if sp.issparse(X) else X, allow_nan) -def as_float_array(X, copy=True, force_all_finite=True): +@_deprecate_positional_args +def as_float_array(X, *, copy=True, force_all_finite=True): """Converts an array-like to an array of floats. 
The new dtype will be np.float32 or np.float64, depending on the original @@ -113,9 +153,9 @@ def as_float_array(X, copy=True, force_all_finite=True): """ if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) and not sp.issparse(X)): - return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, - copy=copy, force_all_finite=force_all_finite, - ensure_2d=False) + return check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, copy=copy, + force_all_finite=force_all_finite, ensure_2d=False) elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: return X.copy() if copy else X elif X.dtype in [np.float32, np.float64]: # is numpy array @@ -349,7 +389,8 @@ def _ensure_no_complex_data(array): "{}\n".format(array)) -def check_array(array, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_array(array, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, estimator=None): @@ -620,7 +661,8 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) -def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, +@_deprecate_positional_args +def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, @@ -732,8 +774,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, ensure_min_features=ensure_min_features, estimator=estimator) if multi_output: - y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, - dtype=None) + y = check_array(y, accept_sparse='csr', force_all_finite=True, + ensure_2d=False, dtype=None) else: y = column_or_1d(y, warn=True) _assert_all_finite(y) @@ -745,7 +787,8 @@ def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, return X, y -def column_or_1d(y, warn=False): +@_deprecate_positional_args +def column_or_1d(y, *, warn=False): """ Ravel column or 1d numpy array, else raises an error Parameters @@ -825,7 +868,8 @@ def has_fit_parameter(estimator, parameter): return parameter in signature(estimator.fit).parameters -def check_symmetric(array, tol=1E-10, raise_warning=True, +@_deprecate_positional_args +def check_symmetric(array, *, tol=1E-10, raise_warning=True, raise_exception=False): """Make sure that array is 2D, square and symmetric. @@ -881,7 +925,8 @@ def check_symmetric(array, tol=1E-10, raise_warning=True, return array -def check_is_fitted(estimator, attributes=None, msg=None, all_or_any=all): +@_deprecate_positional_args +def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. Checks if the estimator is fitted by verifying the presence of @@ -974,7 +1019,7 @@ def check_non_negative(X, whom): raise ValueError("Negative values in data passed to %s" % whom) -def check_scalar(x, name, target_type, min_val=None, max_val=None): +def check_scalar(x, name, target_type, *, min_val=None, max_val=None): """Validate scalar parameters type and value. 
Parameters @@ -1268,44 +1313,6 @@ def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): "matrix and an array") -def _deprecate_positional_args(f): - """Decorator for methods that issues warnings for positional arguments - - Using the keyword-only argument syntax in pep 3102, arguments after the - * will issue a warning when passed as a positional argument. - - Parameters - ---------- - f : function - function to check arguments on - """ - sig = signature(f) - kwonly_args = [] - all_args = [] - - for name, param in sig.parameters.items(): - if param.kind == Parameter.POSITIONAL_OR_KEYWORD: - all_args.append(name) - elif param.kind == Parameter.KEYWORD_ONLY: - kwonly_args.append(name) - - @wraps(f) - def inner_f(*args, **kwargs): - extra_args = len(args) - len(all_args) - if extra_args > 0: - # ignore first 'self' argument for instance methods - args_msg = ['{}={}'.format(name, arg) - for name, arg in zip(kwonly_args[:extra_args], - args[-extra_args:])] - warnings.warn("Pass {} as keyword args. From version 0.25 " - "passing these as positional arguments will " - "result in an error".format(", ".join(args_msg)), - FutureWarning) - kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) - return f(**kwargs) - return inner_f - - def _check_fit_params(X, fit_params, indices=None): """Check and validate the parameters passed during `fit`. From 839b356f45fac7724eab739dcc129a0c8f650a23 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 09:45:03 -0400 Subject: [PATCH 073/125] DOC Removes examples from exceptions docstrings (#17040) --- sklearn/exceptions.py | 56 ------------------------------------------- 1 file changed, 56 deletions(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index 1b71050813d2b..7140b98e53027 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -49,24 +49,6 @@ class ChangedBehaviorWarning(UserWarning): class ConvergenceWarning(UserWarning): """Custom warning to capture convergence problems - Examples - -------- - - >>> import numpy as np - >>> import warnings - >>> from sklearn.cluster import KMeans - >>> from sklearn.exceptions import ConvergenceWarning - >>> warnings.simplefilter("always", ConvergenceWarning) - >>> X = np.asarray([[0, 0], - ... [0, 1], - ... [1, 0], - ... [1, 0]]) # last point is duplicated - >>> with warnings.catch_warnings(record=True) as w: - ... km = KMeans(n_clusters=4).fit(X) - ... print(w[-1].message) - Number of distinct clusters (3) found smaller than n_clusters (4). - Possibly due to duplicate points in X. - .. versionchanged:: 0.18 Moved from sklearn.utils. """ @@ -85,23 +67,6 @@ class DataConversionWarning(UserWarning): implementation's data-type expectations; - passes an input whose shape can be interpreted ambiguously. - Examples - -------- - >>> from sklearn.utils import validation - >>> Y = [[1],[2],[3]] - >>> import warnings - >>> from sklearn.exceptions import DataConversionWarning - >>> warnings.simplefilter('always', DataConversionWarning) - >>> with warnings.catch_warnings(record=True) as w: - ... try: - ... # will trigger warning as Y is a column-vector - ... Y = validation.column_or_1d(Y,warn=True) - ... except ValueError: - ... pass - ... print(w[-1].message) - A column-vector y was passed when a 1d array was expected. Please change - the shape of y to (n_samples, ), for example using ravel(). - .. versionchanged:: 0.18 Moved from sklearn.utils.validation. 
""" @@ -139,27 +104,6 @@ class FitFailedWarning(RuntimeWarning): and the cross-validation helper function cross_val_score to warn when there is an error while fitting the estimator. - Examples - -------- - >>> from sklearn.model_selection import GridSearchCV - >>> from sklearn.svm import LinearSVC - >>> from sklearn.exceptions import FitFailedWarning - >>> import warnings - >>> warnings.simplefilter('always', FitFailedWarning) - >>> gs = GridSearchCV(LinearSVC(), {'C': [-1, -2]}, error_score=0, cv=2) - >>> X, y = [[1, 2], [3, 4], [5, 6], [7, 8]], [0, 0, 1, 1] - >>> with warnings.catch_warnings(record=True) as w: - ... try: - ... gs.fit(X, y) # This will raise a ValueError since C is < 0 - ... except ValueError: - ... pass - ... print(repr(w[-1].message)) - FitFailedWarning('Estimator fit failed. The score on this train-test - partition for these parameters will be set to 0.000000. - Details:...Traceback (most recent call last):...ValueError: - Penalty term must be positive; got (C=-2)... - - .. versionchanged:: 0.18 Moved from sklearn.cross_validation. """ From 1b119c46937f29b1b29fb8eaaee6910beb7807d0 Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 16:12:29 +0200 Subject: [PATCH 074/125] MNT fix generate_authors_table.py (#17011) --- build_tools/generate_authors_table.py | 4 ++-- doc/authors.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/generate_authors_table.py b/build_tools/generate_authors_table.py index 81e99856c6890..eaad1df75475c 100644 --- a/build_tools/generate_authors_table.py +++ b/build_tools/generate_authors_table.py @@ -11,6 +11,7 @@ import getpass import time from pathlib import Path +from os import path print("user:", file=sys.stderr) user = input() @@ -18,7 +19,7 @@ auth = (user, passwd) LOGO_URL = 'https://avatars2.githubusercontent.com/u/365630?v=4' -REPO_FOLDER = Path(__file__).parent.parent +REPO_FOLDER = Path(path.abspath(__file__)).parent.parent def get(url): @@ -100,7 +101,6 @@ def get_profile(login): 'Duchesnay': 'Edouard Duchesnay', 'Lars': 'Lars Buitinck', 'MechCoder': 'Manoj Kumar', - 'jeremiedbb': 'Jérémie Du Boisberranger', } if profile["name"] in missing_names: profile["name"] = missing_names[profile["name"]] diff --git a/doc/authors.rst b/doc/authors.rst index 6a03871d67e90..7b5426fe3128d 100644 --- a/doc/authors.rst +++ b/doc/authors.rst @@ -7,7 +7,7 @@

-    <p>Jérémie Du Boisberranger</p>
+    <p>Jérémie du Boisberranger</p>
From 91e942759fd7e16554cb5dc80918dce1f810b7f4 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Mon, 27 Apr 2020 11:23:15 -0400 Subject: [PATCH 075/125] DOC Make release highlights the first gallery section (#16952) --- doc/conf.py | 5 +++++ doc/developers/plotting.rst | 2 +- doc/modules/compose.rst | 2 +- doc/modules/ensemble.rst | 2 +- doc/modules/isotonic.rst | 4 ++-- doc/modules/kernel_approximation.rst | 6 +++--- doc/modules/kernel_ridge.rst | 8 ++++---- doc/modules/multiclass.rst | 6 +++--- doc/modules/neighbors.rst | 8 ++++---- doc/modules/outlier_detection.rst | 18 +++++++++--------- doc/modules/random_projection.rst | 10 +++++----- doc/modules/tree.rst | 8 ++++---- doc/modules/unsupervised_reduction.rst | 2 +- doc/visualizations.rst | 14 +++++++------- examples/README.txt | 5 ----- examples/miscellaneous/README.txt | 7 +++++++ .../plot_anomaly_comparison.py | 0 .../plot_changed_only_pprint_parameter.py | 0 .../plot_display_object_visualization.py | 0 .../plot_isotonic_regression.py | 0 .../plot_johnson_lindenstrauss_bound.py | 3 ++- .../plot_kernel_approximation.py | 0 .../plot_kernel_ridge_regression.py | 0 .../{ => miscellaneous}/plot_multilabel.py | 0 .../plot_multioutput_face_completion.py | 0 ...lot_partial_dependence_visualization_api.py | 4 ++-- .../plot_roc_curve_visualization_api.py | 0 sklearn/inspection/_plot/partial_dependence.py | 2 +- 28 files changed, 62 insertions(+), 54 deletions(-) create mode 100644 examples/miscellaneous/README.txt rename examples/{ => miscellaneous}/plot_anomaly_comparison.py (100%) rename examples/{ => miscellaneous}/plot_changed_only_pprint_parameter.py (100%) rename examples/{ => miscellaneous}/plot_display_object_visualization.py (100%) rename examples/{ => miscellaneous}/plot_isotonic_regression.py (100%) rename examples/{ => miscellaneous}/plot_johnson_lindenstrauss_bound.py (99%) rename examples/{ => miscellaneous}/plot_kernel_approximation.py (100%) rename examples/{ => miscellaneous}/plot_kernel_ridge_regression.py (100%) rename examples/{ => miscellaneous}/plot_multilabel.py (100%) rename examples/{ => miscellaneous}/plot_multioutput_face_completion.py (100%) rename examples/{ => miscellaneous}/plot_partial_dependence_visualization_api.py (98%) rename examples/{ => miscellaneous}/plot_roc_curve_visualization_api.py (100%) diff --git a/doc/conf.py b/doc/conf.py index a13ed14216de4..1783a676b6d01 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -281,6 +281,11 @@ def __repr__(self): def __call__(self, directory): src_path = os.path.normpath(os.path.join(self.src_dir, directory)) + + # Forces Release Highlights to the top + if os.path.basename(src_path) == "release_highlights": + return "0" + readme = os.path.join(src_path, "README.txt") try: diff --git a/doc/developers/plotting.rst b/doc/developers/plotting.rst index 98af195b56453..7a2f6ebf69415 100644 --- a/doc/developers/plotting.rst +++ b/doc/developers/plotting.rst @@ -50,7 +50,7 @@ attributes:: estimator.__class__.__name__) return viz.plot(ax=ax, name=name, **kwargs) -Read more in :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` +Read more in :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` and the :ref:`User Guide `. 
Plotting with Multiple Axes diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 51a933dcbee47..cd29b14b1f081 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -144,7 +144,7 @@ or by name:: * :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py` * :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` * :ref:`sphx_glr_auto_examples_compose_plot_digits_pipe.py` - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` * :ref:`sphx_glr_auto_examples_svm_plot_svm_anova.py` * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index bff08f542ce11..3cf8987fcfd5a 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -246,7 +246,7 @@ amount of time (e.g., on large datasets). * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_iris.py` * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References diff --git a/doc/modules/isotonic.rst b/doc/modules/isotonic.rst index 1f54dcfa50bad..8967ef18afcb3 100644 --- a/doc/modules/isotonic.rst +++ b/doc/modules/isotonic.rst @@ -28,6 +28,6 @@ correlation coefficient for predicting to unseen data. The predictions of :class:`IsotonicRegression` thus form a function that is piecewise linear: -.. figure:: ../auto_examples/images/sphx_glr_plot_isotonic_regression_001.png - :target: ../auto_examples/plot_isotonic_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_isotonic_regression_001.png + :target: ../auto_examples/miscellaneous/plot_isotonic_regression.html :align: center diff --git a/doc/modules/kernel_approximation.rst b/doc/modules/kernel_approximation.rst index 77354d5afaf1d..fb3843c6bc045 100644 --- a/doc/modules/kernel_approximation.rst +++ b/doc/modules/kernel_approximation.rst @@ -84,8 +84,8 @@ For a given value of ``n_components`` :class:`RBFSampler` is often less accurate as :class:`Nystroem`. :class:`RBFSampler` is cheaper to compute, though, making use of larger feature spaces more efficient. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_approximation_002.png - :target: ../auto_examples/plot_kernel_approximation.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_approximation_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_approximation.html :scale: 50% :align: center @@ -93,7 +93,7 @@ use of larger feature spaces more efficient. .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_kernel_approximation.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_kernel_approximation.py` .. _additive_chi_kernel_approx: diff --git a/doc/modules/kernel_ridge.rst b/doc/modules/kernel_ridge.rst index a67733b1ca5a5..286e9d4ac5322 100644 --- a/doc/modules/kernel_ridge.rst +++ b/doc/modules/kernel_ridge.rst @@ -35,8 +35,8 @@ However, prediction of 100000 target values is more than three times faster with :class:`~sklearn.svm.SVR` since it has learned a sparse model using only approximately 1/3 of the 100 training datapoints as support vectors. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_001.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. 
figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_001.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center The next figure compares the time for fitting and prediction of @@ -51,8 +51,8 @@ prediction time depends on the parameters :math:`\epsilon` and :math:`C` of the :class:`~sklearn.svm.SVR`; :math:`\epsilon = 0` would correspond to a dense model. -.. figure:: ../auto_examples/images/sphx_glr_plot_kernel_ridge_regression_002.png - :target: ../auto_examples/plot_kernel_ridge_regression.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_kernel_ridge_regression_002.png + :target: ../auto_examples/miscellaneous/plot_kernel_ridge_regression.html :align: center diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 606b4246a0b88..1f6556bfa54f3 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -311,15 +311,15 @@ To use this feature, feed the classifier an indicator matrix, in which cell [i, j] indicates the presence of label j in sample i. -.. figure:: ../auto_examples/images/sphx_glr_plot_multilabel_001.png - :target: ../auto_examples/plot_multilabel.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multilabel_001.png + :target: ../auto_examples/miscellaneous/plot_multilabel.html :align: center :scale: 75% .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_multilabel.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multilabel.py` .. _ovo_classification: diff --git a/doc/modules/neighbors.rst b/doc/modules/neighbors.rst index 9aa27a53501b8..397fdd1dd9e90 100644 --- a/doc/modules/neighbors.rst +++ b/doc/modules/neighbors.rst @@ -230,12 +230,12 @@ which will be used to compute the weights. :scale: 75 The use of multi-output nearest neighbors for regression is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center @@ -245,7 +245,7 @@ the lower half of those faces. * :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`: an example of regression using nearest neighbors. - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`: an example of + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`: an example of multi-output regression using nearest neighbors. diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index c061feb0b1d7c..76bd85f3bb1c8 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -98,8 +98,8 @@ Outlier Factor (LOF) does not show a decision boundary in black as it has no predict method to be applied on new data when it is used for outlier detection. -.. figure:: ../auto_examples/images/sphx_glr_plot_anomaly_comparison_001.png - :target: ../auto_examples/plot_anomaly_comparison.html +.. 
figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_anomaly_comparison_001.png + :target: ../auto_examples/miscellaneous/plot_anomaly_comparison.html :align: center :scale: 50 @@ -109,12 +109,12 @@ The :class:`svm.OneClassSVM` is known to be sensitive to outliers and thus does not perform very well for outlier detection. Finally, :class:`covariance.EllipticEnvelope` assumes the data is Gaussian and learns an ellipse. For more details on the different estimators refer to the example -:ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` and the sections -hereunder. +:ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` and the +sections hereunder. .. topic:: Examples: - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` for a comparison of the :class:`svm.OneClassSVM`, the :class:`ensemble.IsolationForest`, the :class:`neighbors.LocalOutlierFactor` and @@ -270,8 +270,8 @@ allows you to add more trees to an already fitted model:: * See :ref:`sphx_glr_auto_examples_ensemble_plot_isolation_forest.py` for an illustration of the use of IsolationForest. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison of :class:`ensemble.IsolationForest` with + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison of :class:`ensemble.IsolationForest` with :class:`neighbors.LocalOutlierFactor`, :class:`svm.OneClassSVM` (tuned to perform like an outlier detection method) and a covariance-based outlier detection with @@ -339,8 +339,8 @@ This strategy is illustrated below. * See :ref:`sphx_glr_auto_examples_neighbors_plot_lof_outlier_detection.py` for an illustration of the use of :class:`neighbors.LocalOutlierFactor`. - * See :ref:`sphx_glr_auto_examples_plot_anomaly_comparison.py` for a - comparison with other anomaly detection methods. + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_anomaly_comparison.py` + for a comparison with other anomaly detection methods. .. topic:: References: diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst index eb8d6de984985..cd3c129cfad45 100644 --- a/doc/modules/random_projection.rst +++ b/doc/modules/random_projection.rst @@ -64,19 +64,19 @@ bounded distortion introduced by the random projection:: >>> johnson_lindenstrauss_min_dim(n_samples=[1e4, 1e5, 1e6], eps=0.1) array([ 7894, 9868, 11841]) -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_001.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center -.. figure:: ../auto_examples/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png - :target: ../auto_examples/plot_johnson_lindenstrauss_bound.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_johnson_lindenstrauss_bound_002.png + :target: ../auto_examples/miscellaneous/plot_johnson_lindenstrauss_bound.html :scale: 75 :align: center .. topic:: Example: - * See :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` for a theoretical explication on the Johnson-Lindenstrauss lemma and an empirical validation using sparse random matrices. 
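As a quick, self-contained illustration of the workflow that example documents, the following sketch (not part of the patch; the data shape, seed and ``eps`` are arbitrary choices) derives the minimal embedding dimension from the Johnson-Lindenstrauss bound and checks the resulting distortion after a sparse random projection::

    import numpy as np
    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.random_projection import (SparseRandomProjection,
                                           johnson_lindenstrauss_min_dim)

    rng = np.random.RandomState(42)
    X = rng.rand(100, 10000)                  # toy high-dimensional data

    # Smallest embedding dimension guaranteeing ~10% pairwise distortion
    n_components = johnson_lindenstrauss_min_dim(n_samples=X.shape[0], eps=0.1)

    X_new = SparseRandomProjection(n_components=n_components,
                                   random_state=42).fit_transform(X)

    # Pairwise distances should be roughly preserved: ratios close to 1
    d_orig = euclidean_distances(X)
    d_proj = euclidean_distances(X_new)
    mask = d_orig > 0                         # ignore the zero diagonal
    ratios = d_proj[mask] / d_orig[mask]
    print(ratios.min(), ratios.max())         # typically within 1 +/- eps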
diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index af6fc4e1edfe9..e12b63adb48c4 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -280,19 +280,19 @@ X is a single real value and the outputs Y are the sine and cosine of X. :align: center The use of multi-output trees for classification is demonstrated in -:ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py`. In this example, the inputs +:ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py`. In this example, the inputs X are the pixels of the upper half of faces and the outputs Y are the pixels of the lower half of those faces. -.. figure:: ../auto_examples/images/sphx_glr_plot_multioutput_face_completion_001.png - :target: ../auto_examples/plot_multioutput_face_completion.html +.. figure:: ../auto_examples/miscellaneous/images/sphx_glr_plot_multioutput_face_completion_001.png + :target: ../auto_examples/miscellaneous/plot_multioutput_face_completion.html :scale: 75 :align: center .. topic:: Examples: * :ref:`sphx_glr_auto_examples_tree_plot_tree_regression_multioutput.py` - * :ref:`sphx_glr_auto_examples_plot_multioutput_face_completion.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_multioutput_face_completion.py` .. topic:: References: diff --git a/doc/modules/unsupervised_reduction.rst b/doc/modules/unsupervised_reduction.rst index 3a85b8e53b553..6e16886064cfc 100644 --- a/doc/modules/unsupervised_reduction.rst +++ b/doc/modules/unsupervised_reduction.rst @@ -37,7 +37,7 @@ documentation: :ref:`random_projection`. .. topic:: **Examples** - * :ref:`sphx_glr_auto_examples_plot_johnson_lindenstrauss_bound.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_johnson_lindenstrauss_bound.py` Feature agglomeration ------------------------ diff --git a/doc/visualizations.rst b/doc/visualizations.rst index e50a9a90a0b84..ebb98700d9e08 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -24,8 +24,8 @@ ROC curve for a fitted support vector machine: svc_disp = plot_roc_curve(svc, X_test, y_test) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_001.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_001.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -48,8 +48,8 @@ method of the `Display` object. rfc_disp = plot_roc_curve(rfc, X_test, y_test, ax=ax, alpha=0.8) svc_disp.plot(ax=ax, alpha=0.8) -.. figure:: auto_examples/images/sphx_glr_plot_roc_curve_visualization_api_002.png - :target: auto_examples/plot_roc_curve_visualization_api.html +.. figure:: auto_examples/miscellaneous/images/sphx_glr_plot_roc_curve_visualization_api_002.png + :target: auto_examples/miscellaneous/plot_roc_curve_visualization_api.html :align: center :scale: 75% @@ -58,9 +58,9 @@ values of the curves. .. 
topic:: Examples: - * :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` - * :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` - * :ref:`sphx_glr_auto_examples_plot_display_object_visualization.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` + * :ref:`sphx_glr_auto_examples_miscellaneous_plot_display_object_visualization.py` Available Plotting Utilities ============================ diff --git a/examples/README.txt b/examples/README.txt index 4ee6efc46d1dd..958de667a5c69 100644 --- a/examples/README.txt +++ b/examples/README.txt @@ -2,8 +2,3 @@ Examples ======== - -Miscellaneous examples ----------------------- - -Miscellaneous and introductory examples for scikit-learn. diff --git a/examples/miscellaneous/README.txt b/examples/miscellaneous/README.txt new file mode 100644 index 0000000000000..4e44ceee95809 --- /dev/null +++ b/examples/miscellaneous/README.txt @@ -0,0 +1,7 @@ +.. _miscellaneous_examples: + +Miscellaneous +------------- + +Miscellaneous and introductory examples for scikit-learn. + diff --git a/examples/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py similarity index 100% rename from examples/plot_anomaly_comparison.py rename to examples/miscellaneous/plot_anomaly_comparison.py diff --git a/examples/plot_changed_only_pprint_parameter.py b/examples/miscellaneous/plot_changed_only_pprint_parameter.py similarity index 100% rename from examples/plot_changed_only_pprint_parameter.py rename to examples/miscellaneous/plot_changed_only_pprint_parameter.py diff --git a/examples/plot_display_object_visualization.py b/examples/miscellaneous/plot_display_object_visualization.py similarity index 100% rename from examples/plot_display_object_visualization.py rename to examples/miscellaneous/plot_display_object_visualization.py diff --git a/examples/plot_isotonic_regression.py b/examples/miscellaneous/plot_isotonic_regression.py similarity index 100% rename from examples/plot_isotonic_regression.py rename to examples/miscellaneous/plot_isotonic_regression.py diff --git a/examples/plot_johnson_lindenstrauss_bound.py b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py similarity index 99% rename from examples/plot_johnson_lindenstrauss_bound.py rename to examples/miscellaneous/plot_johnson_lindenstrauss_bound.py index b981c14fbf132..9d369c6c6d46d 100644 --- a/examples/plot_johnson_lindenstrauss_bound.py +++ b/examples/miscellaneous/plot_johnson_lindenstrauss_bound.py @@ -8,7 +8,8 @@ dataset can be randomly projected into a lower dimensional Euclidean space while controlling the distortion in the pairwise distances. -.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/Johnson%E2%80%93Lindenstrauss_lemma +.. 
_`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\ + Johnson%E2%80%93Lindenstrauss_lemma """ print(__doc__) diff --git a/examples/plot_kernel_approximation.py b/examples/miscellaneous/plot_kernel_approximation.py similarity index 100% rename from examples/plot_kernel_approximation.py rename to examples/miscellaneous/plot_kernel_approximation.py diff --git a/examples/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py similarity index 100% rename from examples/plot_kernel_ridge_regression.py rename to examples/miscellaneous/plot_kernel_ridge_regression.py diff --git a/examples/plot_multilabel.py b/examples/miscellaneous/plot_multilabel.py similarity index 100% rename from examples/plot_multilabel.py rename to examples/miscellaneous/plot_multilabel.py diff --git a/examples/plot_multioutput_face_completion.py b/examples/miscellaneous/plot_multioutput_face_completion.py similarity index 100% rename from examples/plot_multioutput_face_completion.py rename to examples/miscellaneous/plot_multioutput_face_completion.py diff --git a/examples/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py similarity index 98% rename from examples/plot_partial_dependence_visualization_api.py rename to examples/miscellaneous/plot_partial_dependence_visualization_api.py index 8ccb225afc2d0..761dad8b1e1fa 100644 --- a/examples/plot_partial_dependence_visualization_api.py +++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py @@ -10,9 +10,9 @@ .. note:: - See also :ref:`sphx_glr_auto_examples_plot_roc_curve_visualization_api.py` + See also :ref:`sphx_glr_auto_examples_miscellaneous_plot_roc_curve_visualization_api.py` -""" +""" # noqa print(__doc__) import pandas as pd diff --git a/examples/plot_roc_curve_visualization_api.py b/examples/miscellaneous/plot_roc_curve_visualization_api.py similarity index 100% rename from examples/plot_roc_curve_visualization_api.py rename to examples/miscellaneous/plot_roc_curve_visualization_api.py diff --git a/sklearn/inspection/_plot/partial_dependence.py b/sklearn/inspection/_plot/partial_dependence.py index 812005f5ab2ae..e02717e76dce3 100644 --- a/sklearn/inspection/_plot/partial_dependence.py +++ b/sklearn/inspection/_plot/partial_dependence.py @@ -343,7 +343,7 @@ class PartialDependenceDisplay: stored as attributes. Read more in - :ref:`sphx_glr_auto_examples_plot_partial_dependence_visualization_api.py` + :ref:`sphx_glr_auto_examples_miscellaneous_plot_partial_dependence_visualization_api.py` and the :ref:`User Guide `. .. versionadded:: 0.22 From a1261a7e18c19bb3dfc8d739a6512c6f671d9e79 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 13:58:33 -0400 Subject: [PATCH 076/125] DOC kwonly section in whatsnew (#17059) --- doc/whats_new/v0.23.rst | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 4357845885e3f..6d9fbfeeebc0c 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,6 +14,20 @@ Version 0.23.0 Put the changes in their relevant module. +Enforcing keyword-only arguments +-------------------------------- + +In an effort to promote clear and non-ambiguous use of the library, most +constructor and function parameters are now expected to be passed as keyword +arguments (i.e. using the `param=value` syntax) instead of positional. 
To +ease the transition, a `FutureWarning` is raised if a keyword-only parameter +is used as positional. In version 0.25, these parameters will be strictly +keyword-only, and a `TypeError` will be raised. +:issue:`15005` by `Joel Nothman`_, `Adrin Jalali`_, `Thomas Fan`_, and +`Nicolas Hug`_. See `SLEP009 +`_ +for more details. + Changed models -------------- @@ -536,12 +550,6 @@ Miscellaneous error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. -- |API| Most constructor and function parameters are now expected to be passed - as a keyword and not positional. :issue:`15005` by `Joel Nothman`_, - `Adrin Jalali`_, `Thomas Fan`_, and `Nicolas Hug`_. See `SLEP009 - `_ - for more details. - Code and Documentation Contributors ----------------------------------- From 1ba06518c5c4c7c1865110a8f34c4da64d8e478f Mon Sep 17 00:00:00 2001 From: Adrin Jalali Date: Mon, 27 Apr 2020 20:41:35 +0200 Subject: [PATCH 077/125] API make load_* args in datasets kwarg only (#16719) * API male load_* args in datasets kwarg only * more loaders * pep8 * fix test_omp usage * fix some usages * Update sklearn/datasets/_samples_generator.py Co-Authored-By: Thomas J Fan Co-authored-by: Nicolas Hug Co-authored-by: Thomas J Fan --- sklearn/datasets/_base.py | 25 ++++++--- sklearn/datasets/_california_housing.py | 5 +- sklearn/datasets/_covtype.py | 5 +- sklearn/datasets/_kddcup99.py | 5 +- sklearn/datasets/_lfw.py | 8 ++- sklearn/datasets/_olivetti_faces.py | 4 +- sklearn/datasets/_openml.py | 4 +- sklearn/datasets/_rcv1.py | 4 +- sklearn/datasets/_samples_generator.py | 64 +++++++++++++++------- sklearn/datasets/_species_distributions.py | 4 +- sklearn/datasets/_svmlight_format_io.py | 20 +++++-- sklearn/datasets/_twenty_newsgroups.py | 7 ++- sklearn/datasets/tests/test_base.py | 2 +- sklearn/linear_model/tests/test_omp.py | 7 ++- 14 files changed, 117 insertions(+), 47 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index 9737a5f67891a..2402fc3a069dc 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -17,6 +17,7 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import check_pandas_support +from ..utils.validation import _deprecate_positional_args import numpy as np @@ -80,7 +81,8 @@ def _convert_data_dataframe(caller_name, data, target, return combined_df, X, y -def load_files(container_path, description=None, categories=None, +@_deprecate_positional_args +def load_files(container_path, *, description=None, categories=None, load_content=True, shuffle=True, encoding=None, decode_error='strict', random_state=0): """Load text files with categories as subfolder names. @@ -267,7 +269,8 @@ def load_data(module_path, data_file_name): return data, target, target_names -def load_wine(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_wine(*, return_X_y=False, as_frame=False): """Load and return the wine dataset (classification). .. versionadded:: 0.18 @@ -381,7 +384,8 @@ def load_wine(return_X_y=False, as_frame=False): feature_names=feature_names) -def load_iris(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_iris(*, return_X_y=False, as_frame=False): """Load and return the iris dataset (classification). 
The iris dataset is a classic and very easy multi-class classification @@ -495,7 +499,8 @@ def load_iris(return_X_y=False, as_frame=False): filename=iris_csv_filename) -def load_breast_cancer(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_breast_cancer(*, return_X_y=False, as_frame=False): """Load and return the breast cancer wisconsin dataset (classification). The breast cancer dataset is a classic and very easy binary classification @@ -619,7 +624,8 @@ def load_breast_cancer(return_X_y=False, as_frame=False): filename=csv_filename) -def load_digits(n_class=10, return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_digits(*, n_class=10, return_X_y=False, as_frame=False): """Load and return the digits dataset (classification). Each datapoint is a 8x8 image of a digit. @@ -742,7 +748,8 @@ def load_digits(n_class=10, return_X_y=False, as_frame=False): DESCR=descr) -def load_diabetes(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_diabetes(*, return_X_y=False, as_frame=False): """Load and return the diabetes dataset (regression). ============== ================== @@ -834,7 +841,8 @@ def load_diabetes(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_linnerud(return_X_y=False, as_frame=False): +@_deprecate_positional_args +def load_linnerud(*, return_X_y=False, as_frame=False): """Load and return the physical excercise linnerud dataset. This dataset is suitable for multi-ouput regression tasks. @@ -937,7 +945,8 @@ def load_linnerud(return_X_y=False, as_frame=False): target_filename=target_filename) -def load_boston(return_X_y=False): +@_deprecate_positional_args +def load_boston(*, return_X_y=False): """Load and return the boston house-prices dataset (regression). ============== ============== diff --git a/sklearn/datasets/_california_housing.py b/sklearn/datasets/_california_housing.py index e3df2124aab2b..107458e2d515d 100644 --- a/sklearn/datasets/_california_housing.py +++ b/sklearn/datasets/_california_housing.py @@ -36,6 +36,8 @@ from ._base import _pkl_filepath from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz @@ -48,7 +50,8 @@ logger = logging.getLogger(__name__) -def fetch_california_housing(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_california_housing(*, data_home=None, download_if_missing=True, return_X_y=False, as_frame=False): """Load the California housing dataset (regression). diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 6b23f913e05a7..de93b22ac4f56 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -28,6 +28,8 @@ from ..utils import Bunch from ._base import _pkl_filepath from ..utils import check_random_state +from ..utils.validation import _deprecate_positional_args + # The original data can be found in: # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz @@ -40,7 +42,8 @@ logger = logging.getLogger(__name__) -def fetch_covtype(data_home=None, download_if_missing=True, +@_deprecate_positional_args +def fetch_covtype(*, data_home=None, download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the covertype dataset (classification). 
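For the dataset loaders above, the practical effect of ``_deprecate_positional_args`` is the calling convention described in the changelog earlier in this series. A minimal usage sketch (not part of the patch)::

    from sklearn.datasets import load_digits

    # Recommended: keyword form for everything declared after the bare '*'.
    digits = load_digits(n_class=9)

    # Positional form: still accepted in 0.23 but emits a FutureWarning,
    # and is slated to raise a TypeError once the deprecation period ends.
    digits = load_digits(9)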
diff --git a/sklearn/datasets/_kddcup99.py b/sklearn/datasets/_kddcup99.py index c0ba00fa46f04..4e2f6856d89b1 100644 --- a/sklearn/datasets/_kddcup99.py +++ b/sklearn/datasets/_kddcup99.py @@ -23,6 +23,8 @@ from ..utils import Bunch from ..utils import check_random_state from ..utils import shuffle as shuffle_method +from ..utils.validation import _deprecate_positional_args + # The original data can be found at: # https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz @@ -43,7 +45,8 @@ logger = logging.getLogger(__name__) -def fetch_kddcup99(subset=None, data_home=None, shuffle=False, +@_deprecate_positional_args +def fetch_kddcup99(*, subset=None, data_home=None, shuffle=False, random_state=None, percent10=True, download_if_missing=True, return_X_y=False): """Load the kddcup99 dataset (classification). diff --git a/sklearn/datasets/_lfw.py b/sklearn/datasets/_lfw.py index 3dc3833db3417..b8db75010e8f2 100644 --- a/sklearn/datasets/_lfw.py +++ b/sklearn/datasets/_lfw.py @@ -20,6 +20,7 @@ from ._base import get_data_home, _fetch_remote, RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -215,7 +216,8 @@ def _fetch_lfw_people(data_folder_path, slice_=None, color=False, resize=None, return faces, target, target_names -def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_people(*, data_home=None, funneled=True, resize=0.5, min_faces_per_person=0, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True, return_X_y=False): @@ -385,7 +387,9 @@ def _fetch_lfw_pairs(index_file_path, data_folder_path, slice_=None, return pairs, target, np.array(['Different persons', 'Same person']) -def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, +@_deprecate_positional_args +def fetch_lfw_pairs(*, subset='train', data_home=None, funneled=True, + resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification). diff --git a/sklearn/datasets/_olivetti_faces.py b/sklearn/datasets/_olivetti_faces.py index d5f163d468214..dfa459880a5c4 100644 --- a/sklearn/datasets/_olivetti_faces.py +++ b/sklearn/datasets/_olivetti_faces.py @@ -25,6 +25,7 @@ from ._base import RemoteFileMetadata from ._base import _pkl_filepath from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args # The original data can be found at: # https://cs.nyu.edu/~roweis/data/olivettifaces.mat @@ -35,7 +36,8 @@ 'd5fca46a4b8906c18e454d41af987794')) -def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0, +@_deprecate_positional_args +def fetch_olivetti_faces(*, data_home=None, shuffle=False, random_state=0, download_if_missing=True, return_X_y=False): """Load the Olivetti faces data-set from AT&T (classification). 
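The decorator itself is only imported in the hunks above; its definition is not part of them. To make the mechanism concrete, here is a rough, illustrative sketch of how such a decorator can be written. The name, structure and warning text are assumptions for the example, not scikit-learn's actual ``_deprecate_positional_args`` implementation::

    import warnings
    from functools import wraps
    from inspect import Parameter, signature


    def warn_on_positional(f):
        """Illustrative sketch of a keyword-only deprecation decorator.

        Not scikit-learn's ``_deprecate_positional_args``; it only mimics
        the behaviour described in the changelog above.
        """
        sig = signature(f)
        # Parameters declared before the bare '*' may stay positional.
        n_allowed = sum(p.kind == Parameter.POSITIONAL_OR_KEYWORD
                        for p in sig.parameters.values())
        kwonly = [name for name, p in sig.parameters.items()
                  if p.kind == Parameter.KEYWORD_ONLY]

        @wraps(f)
        def wrapper(*args, **kwargs):
            extra = args[n_allowed:]
            if extra:
                names = kwonly[:len(extra)]
                warnings.warn("Pass %s as keyword arguments; passing them "
                              "positionally is deprecated."
                              % ", ".join(names), FutureWarning)
                kwargs.update(zip(names, extra))
                args = args[:n_allowed]
            return f(*args, **kwargs)

        return wrapper

Applied to a signature like ``def fetch_covtype(*, data_home=None, ...)``, such a decorator would re-map a positional call into the keyword form while emitting the ``FutureWarning``.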
diff --git a/sklearn/datasets/_openml.py b/sklearn/datasets/_openml.py index 4040641a17574..112cd9c0e525e 100644 --- a/sklearn/datasets/_openml.py +++ b/sklearn/datasets/_openml.py @@ -23,6 +23,7 @@ from ..utils import get_chunk_n_rows from ..utils import _chunk_generator from ..utils import check_pandas_support # noqa +from ..utils.validation import _deprecate_positional_args __all__ = ['fetch_openml'] @@ -608,7 +609,8 @@ def _valid_data_column_names(features_list, target_columns): return valid_data_column_names -def fetch_openml(name=None, version='active', data_id=None, data_home=None, +@_deprecate_positional_args +def fetch_openml(name=None, *, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False, as_frame=False): """Fetch dataset from openml by name or dataset id. diff --git a/sklearn/datasets/_rcv1.py b/sklearn/datasets/_rcv1.py index 4f1c5cc4af199..abb9881700614 100644 --- a/sklearn/datasets/_rcv1.py +++ b/sklearn/datasets/_rcv1.py @@ -25,6 +25,7 @@ from ._svmlight_format_io import load_svmlight_files from ..utils import shuffle as shuffle_ from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args # The original vectorized data can be found at: @@ -75,7 +76,8 @@ logger = logging.getLogger(__name__) -def fetch_rcv1(data_home=None, subset='all', download_if_missing=True, +@_deprecate_positional_args +def fetch_rcv1(*, data_home=None, subset='all', download_if_missing=True, random_state=None, shuffle=False, return_X_y=False): """Load the RCV1 multilabel dataset (classification). diff --git a/sklearn/datasets/_samples_generator.py b/sklearn/datasets/_samples_generator.py index 7642b6fb7dc59..ee3ac6ab2827f 100644 --- a/sklearn/datasets/_samples_generator.py +++ b/sklearn/datasets/_samples_generator.py @@ -18,6 +18,7 @@ from ..utils import check_array, check_random_state from ..utils import shuffle as util_shuffle from ..utils.random import sample_without_replacement +from ..utils.validation import _deprecate_positional_args def _generate_hypercube(samples, dimensions, rng): @@ -33,7 +34,8 @@ def _generate_hypercube(samples, dimensions, rng): return out -def make_classification(n_samples=100, n_features=20, n_informative=2, +@_deprecate_positional_args +def make_classification(n_samples=100, n_features=20, *, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, @@ -261,7 +263,9 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, return X, y -def make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, +@_deprecate_positional_args +def make_multilabel_classification(n_samples=100, n_features=20, *, + n_classes=5, n_labels=2, length=50, allow_unlabeled=True, sparse=False, return_indicator='dense', return_distributions=False, @@ -422,7 +426,8 @@ def sample_example(): return X, Y -def make_hastie_10_2(n_samples=12000, random_state=None): +@_deprecate_positional_args +def make_hastie_10_2(n_samples=12000, *, random_state=None): """Generates data for binary classification used in Hastie et al. 2009, Example 10.2. 
@@ -470,7 +475,8 @@ def make_hastie_10_2(n_samples=12000, random_state=None): return X, y -def make_regression(n_samples=100, n_features=100, n_informative=10, +@_deprecate_positional_args +def make_regression(n_samples=100, n_features=100, *, n_informative=10, n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, shuffle=True, coef=False, random_state=None): @@ -592,7 +598,8 @@ def make_regression(n_samples=100, n_features=100, n_informative=10, return X, y -def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, +@_deprecate_positional_args +def make_circles(n_samples=100, *, shuffle=True, noise=None, random_state=None, factor=.8): """Make a large circle containing a smaller circle in 2d. @@ -668,7 +675,8 @@ def make_circles(n_samples=100, shuffle=True, noise=None, random_state=None, return X, y -def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): +@_deprecate_positional_args +def make_moons(n_samples=100, *, shuffle=True, noise=None, random_state=None): """Make two interleaving half circles A simple toy dataset to visualize clustering and classification @@ -731,7 +739,8 @@ def make_moons(n_samples=100, shuffle=True, noise=None, random_state=None): return X, y -def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, +@_deprecate_positional_args +def make_blobs(n_samples=100, n_features=2, *, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False): """Generate isotropic Gaussian blobs for clustering. @@ -889,7 +898,9 @@ def make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, return X, y -def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman1(n_samples=100, n_features=10, *, noise=0.0, + random_state=None): """Generate the "Friedman #1" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -951,7 +962,8 @@ def make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None): return X, y -def make_friedman2(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman2(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #2" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1016,7 +1028,8 @@ def make_friedman2(n_samples=100, noise=0.0, random_state=None): return X, y -def make_friedman3(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_friedman3(n_samples=100, *, noise=0.0, random_state=None): """Generate the "Friedman #3" regression problem This dataset is described in Friedman [1] and Breiman [2]. @@ -1080,7 +1093,8 @@ def make_friedman3(n_samples=100, noise=0.0, random_state=None): return X, y -def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, +@_deprecate_positional_args +def make_low_rank_matrix(n_samples=100, n_features=100, *, effective_rank=10, tail_strength=0.5, random_state=None): """Generate a mostly low rank matrix with bell-shaped singular values @@ -1149,7 +1163,8 @@ def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, return np.dot(np.dot(u, s), v.T) -def make_sparse_coded_signal(n_samples, n_components, n_features, +@_deprecate_positional_args +def make_sparse_coded_signal(n_samples, *, n_components, n_features, n_nonzero_coefs, random_state=None): """Generate a signal as a sparse combination of dictionary elements. 
@@ -1211,7 +1226,9 @@ def make_sparse_coded_signal(n_samples, n_components, n_features, return map(np.squeeze, (Y, D, X)) -def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): +@_deprecate_positional_args +def make_sparse_uncorrelated(n_samples=100, n_features=10, *, + random_state=None): """Generate a random regression problem with sparse uncorrelated design This dataset is described in Celeux et al [1]. as:: @@ -1262,7 +1279,8 @@ def make_sparse_uncorrelated(n_samples=100, n_features=10, random_state=None): return X, y -def make_spd_matrix(n_dim, random_state=None): +@_deprecate_positional_args +def make_spd_matrix(n_dim, *, random_state=None): """Generate a random symmetric, positive-definite matrix. Read more in the :ref:`User Guide `. @@ -1295,7 +1313,8 @@ def make_spd_matrix(n_dim, random_state=None): return X -def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, +@_deprecate_positional_args +def make_sparse_spd_matrix(dim=1, *, alpha=0.95, norm_diag=False, smallest_coef=.1, largest_coef=.9, random_state=None): """Generate a sparse symmetric definite positive matrix. @@ -1369,7 +1388,8 @@ def make_sparse_spd_matrix(dim=1, alpha=0.95, norm_diag=False, return prec -def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_swiss_roll(n_samples=100, *, noise=0.0, random_state=None): """Generate a swiss roll dataset. Read more in the :ref:`User Guide `. @@ -1421,7 +1441,8 @@ def make_swiss_roll(n_samples=100, noise=0.0, random_state=None): return X, t -def make_s_curve(n_samples=100, noise=0.0, random_state=None): +@_deprecate_positional_args +def make_s_curve(n_samples=100, *, noise=0.0, random_state=None): """Generate an S curve dataset. Read more in the :ref:`User Guide `. @@ -1463,7 +1484,8 @@ def make_s_curve(n_samples=100, noise=0.0, random_state=None): return X, t -def make_gaussian_quantiles(mean=None, cov=1., n_samples=100, +@_deprecate_positional_args +def make_gaussian_quantiles(*, mean=None, cov=1., n_samples=100, n_features=2, n_classes=3, shuffle=True, random_state=None): r"""Generate isotropic Gaussian and label samples by quantile @@ -1558,7 +1580,8 @@ def _shuffle(data, random_state=None): return result, row_idx, col_idx -def make_biclusters(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_biclusters(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with constant block diagonal structure for biclustering. @@ -1649,7 +1672,8 @@ def make_biclusters(shape, n_clusters, noise=0.0, minval=10, return result, rows, cols -def make_checkerboard(shape, n_clusters, noise=0.0, minval=10, +@_deprecate_positional_args +def make_checkerboard(shape, n_clusters, *, noise=0.0, minval=10, maxval=100, shuffle=True, random_state=None): """Generate an array with block checkerboard structure for biclustering. 
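Note that for generators such as ``make_sparse_coded_signal``, parameters without a default (``n_components``, ``n_features``, ``n_nonzero_coefs``) become required keyword arguments after this change, while anything declared before the bare '*' can still be passed positionally. A short sketch of the resulting call style (values are arbitrary)::

    from sklearn.datasets import make_blobs, make_sparse_coded_signal

    # Required parameters after the '*' must now be spelled out by name.
    y, D, code = make_sparse_coded_signal(n_samples=5, n_components=30,
                                          n_features=20, n_nonzero_coefs=3,
                                          random_state=0)

    # Leading parameters (n_samples, n_features) may still be positional.
    X, labels = make_blobs(150, 2, centers=3, random_state=0)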
diff --git a/sklearn/datasets/_species_distributions.py b/sklearn/datasets/_species_distributions.py index 7f621d1de74eb..e17ab419c5d7e 100644 --- a/sklearn/datasets/_species_distributions.py +++ b/sklearn/datasets/_species_distributions.py @@ -50,6 +50,7 @@ from ._base import _fetch_remote from ._base import RemoteFileMetadata from ..utils import Bunch +from ..utils.validation import _deprecate_positional_args from ._base import _pkl_filepath # The original data can be found at: @@ -137,7 +138,8 @@ def construct_grids(batch): return (xgrid, ygrid) -def fetch_species_distributions(data_home=None, +@_deprecate_positional_args +def fetch_species_distributions(*, data_home=None, download_if_missing=True): """Loader for species distribution dataset from Phillips et. al. (2006) diff --git a/sklearn/datasets/_svmlight_format_io.py b/sklearn/datasets/_svmlight_format_io.py index 91bb35ff2ec75..8360ee4402b40 100644 --- a/sklearn/datasets/_svmlight_format_io.py +++ b/sklearn/datasets/_svmlight_format_io.py @@ -25,6 +25,7 @@ from .. import __version__ from ..utils import check_array, IS_PYPY +from ..utils.validation import _deprecate_positional_args if not IS_PYPY: from ._svmlight_format_fast import _load_svmlight_file @@ -37,7 +38,8 @@ def _load_svmlight_file(*args, **kwargs): 'for the status updates).') -def load_svmlight_file(f, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_file(f, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load datasets in the svmlight / libsvm format into sparse CSR matrix @@ -151,8 +153,13 @@ def get_data(): X, y = get_data() """ - return tuple(load_svmlight_files([f], n_features, dtype, multilabel, - zero_based, query_id, offset, length)) + return tuple(load_svmlight_files([f], n_features=n_features, + dtype=dtype, + multilabel=multilabel, + zero_based=zero_based, + query_id=query_id, + offset=offset, + length=length)) def _gen_open(f): @@ -196,7 +203,8 @@ def _open_and_load(f, dtype, multilabel, zero_based, query_id, return data, indices, indptr, labels, query -def load_svmlight_files(files, n_features=None, dtype=np.float64, +@_deprecate_positional_args +def load_svmlight_files(files, *, n_features=None, dtype=np.float64, multilabel=False, zero_based="auto", query_id=False, offset=0, length=-1): """Load dataset from multiple files in SVMlight format @@ -380,7 +388,9 @@ def _dump_svmlight(X, y, f, multilabel, one_based, comment, query_id): f.write((line_pattern % feat).encode('ascii')) -def dump_svmlight_file(X, y, f, zero_based=True, comment=None, query_id=None, +@_deprecate_positional_args +def dump_svmlight_file(X, y, f, *, zero_based=True, comment=None, + query_id=None, multilabel=False): """Dump the dataset in svmlight / libsvm file format. diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py index ebbd191069c49..c5d322c88ef0c 100644 --- a/sklearn/datasets/_twenty_newsgroups.py +++ b/sklearn/datasets/_twenty_newsgroups.py @@ -45,6 +45,7 @@ from ..feature_extraction.text import CountVectorizer from .. 
import preprocessing from ..utils import check_random_state, Bunch +from ..utils.validation import _deprecate_positional_args logger = logging.getLogger(__name__) @@ -146,7 +147,8 @@ def strip_newsgroup_footer(text): return text -def fetch_20newsgroups(data_home=None, subset='train', categories=None, +@_deprecate_positional_args +def fetch_20newsgroups(*, data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True, return_X_y=False): @@ -322,7 +324,8 @@ def fetch_20newsgroups(data_home=None, subset='train', categories=None, return data -def fetch_20newsgroups_vectorized(subset="train", remove=(), data_home=None, +@_deprecate_positional_args +def fetch_20newsgroups_vectorized(*, subset="train", remove=(), data_home=None, download_if_missing=True, return_X_y=False, normalize=True): """Load the 20 newsgroups dataset and vectorize it into token counts \ diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index 224538b181696..3ec60074a4015 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -152,7 +152,7 @@ def test_load_digits(): def test_load_digits_n_class_lt_10(): - digits = load_digits(9) + digits = load_digits(n_class=9) assert digits.data.shape == (1617, 64) assert numpy.unique(digits.target).size == 9 diff --git a/sklearn/linear_model/tests/test_omp.py b/sklearn/linear_model/tests/test_omp.py index 791983ba62cc2..f3f3080aebe66 100644 --- a/sklearn/linear_model/tests/test_omp.py +++ b/sklearn/linear_model/tests/test_omp.py @@ -18,8 +18,11 @@ from sklearn.datasets import make_sparse_coded_signal n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3 -y, X, gamma = make_sparse_coded_signal(n_targets, n_features, n_samples, - n_nonzero_coefs, random_state=0) +y, X, gamma = make_sparse_coded_signal(n_samples=n_targets, + n_components=n_features, + n_features=n_samples, + n_nonzero_coefs=n_nonzero_coefs, + random_state=0) # Make X not of norm 1 for testing X *= 10 y *= 10 From 5b2c931a994c9f4e39d202efd3b8a3de44309728 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 28 Apr 2020 05:38:11 +1000 Subject: [PATCH 078/125] API pairwise_distances will require explicit V/VI param if Y is given (#16993) * API pairwise_distances will require explicit V/VI param if Y is given Deprecation until version 0.25. The current approach in `_precompute_metric_params` (https://github.com/scikit-learn/scikit-learn/blob/f82a2cb33871a67b36150647ece1c7e56d3132bb/sklearn/metrics/pairwise.py#L1429-L1444) means that we may be applying a different metric at training and test time. Ideally we'd have a framework for fitting a metric on some specific training data, but in the meantime, this deprecation stops users making mistakes. * DOC update what's new * Update sklearn/metrics/tests/test_pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/pairwise.py Co-Authored-By: Thomas J Fan * Update sklearn/metrics/tests/test_pairwise.py Co-Authored-By: Thomas J Fan Co-authored-by: Thomas J Fan --- doc/whats_new/v0.23.rst | 6 ++++++ sklearn/metrics/pairwise.py | 6 ++++++ sklearn/metrics/tests/test_pairwise.py | 12 ++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 6d9fbfeeebc0c..e0a3927f41405 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -397,6 +397,12 @@ Changelog or 'd'). 
:pr:`16159` by :user:`Rick Mackenbach ` and `Thomas Fan`_. +- |API| From version 0.25, :func:`metrics.pairwise.pairwise_distances` will no + longer automatically compute the ``VI`` parameter for Mahalanobis distance + and the ``V`` parameter for seuclidean distance if ``Y`` is passed. The user + will be expected to compute this parameter on the training data of their + choice and pass it to `pairwise_distances`. :pr:`16993` by `Joel Nothman`_. + :mod:`sklearn.model_selection` .............................. diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 9d4107ebd66d6..20350345f54da 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1441,12 +1441,18 @@ def _precompute_metric_params(X, Y, metric=None, **kwds): if X is Y: V = np.var(X, axis=0, ddof=1) else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='seuclidean' will require V to be " + "specified if Y is passed.", FutureWarning) V = np.var(np.vstack([X, Y]), axis=0, ddof=1) return {'V': V} if metric == "mahalanobis" and 'VI' not in kwds: if X is Y: VI = np.linalg.inv(np.cov(X.T)).T else: + warnings.warn("from version 0.25, pairwise_distances for " + "metric='mahalanobis' will require VI to be " + "specified if Y is passed.", FutureWarning) VI = np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T return {'VI': VI} return {} diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index f2c7a307571bc..d7a96de12c9e3 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -1281,8 +1281,16 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function, params = {'VI': np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} expected_dist_explicit_params = cdist(X, Y, metric=metric, **params) - dist = np.vstack(tuple(dist_function(X, Y, - metric=metric, n_jobs=n_jobs))) + # TODO: Remove warn_checker in 0.25 + if y_is_x: + warn_checker = pytest.warns(None) + else: + warn_checker = pytest.warns(FutureWarning, + match="to be specified if Y is passed") + with warn_checker: + dist = np.vstack(tuple(dist_function(X, Y, + metric=metric, + n_jobs=n_jobs))) assert_allclose(dist, expected_dist_explicit_params) assert_allclose(dist, expected_dist_default_params) From 41b18fea37e59a84bb9219c32c15c36432afc9ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20G=C3=B3rski?= Date: Mon, 27 Apr 2020 23:32:19 +0200 Subject: [PATCH 079/125] ENH Added n_components_ to SparsePCA and MiniBatchSparsePCA (#16981) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add n_components_ attribute to SparcePCA and MiniBatchSparsePCA * Add n_components_ attribute to SparcePCA and MiniBatchSparsePCA * Add whatsnew entry * Fix documentation * Apply suggestions from code review Co-Authored-By: Thomas J Fan * Add suggested changes * Update test_sparse_pca.py Co-authored-by: Mateusz Górski Co-authored-by: Thomas J Fan --- doc/whats_new/v0.23.rst | 3 +++ sklearn/decomposition/_sparse_pca.py | 12 ++++++++++++ sklearn/decomposition/tests/test_sparse_pca.py | 15 +++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index e0a3927f41405..b5b95e745f456 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,6 +215,9 @@ Changelog raise `invalid value encountered in multiply` during `fit`. :pr:`16718` by :user:`Gui Miotto `. 
+- |Feature| Added `n_components_` attribute to :class:'decomposition.SparsePCA' + and :class:'MiniBatchSparsePCA'. :pr:'16981' by :user:'Mateusz Górski ' + :mod:`sklearn.ensemble` ....................... diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index cf1f5a2608e1c..53f3ed3bf23ca 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -103,6 +103,11 @@ class SparsePCA(TransformerMixin, BaseEstimator): error_ : array Vector of errors at each iteration. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. @@ -197,6 +202,7 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) self.error_ = E return self @@ -312,6 +318,11 @@ class MiniBatchSparsePCA(SparsePCA): components_ : array, [n_components, n_features] Sparse components extracted from the data. + n_components_ : int + Estimated number of components. + + .. versionadded:: 0.23 + n_iter_ : int Number of iterations run. @@ -403,5 +414,6 @@ def fit(self, X, y=None): self.components_, axis=1)[:, np.newaxis] components_norm[components_norm == 0] = 1 self.components_ /= components_norm + self.n_components_ = len(self.components_) return self diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py index f3d14e31f3e1b..9ee0339a192b4 100644 --- a/sklearn/decomposition/tests/test_sparse_pca.py +++ b/sklearn/decomposition/tests/test_sparse_pca.py @@ -207,3 +207,18 @@ def test_spca_error_unormalized_components(spca): err_msg = "normalize_components=False is not supported starting " with pytest.raises(NotImplementedError, match=err_msg): spca(normalize_components=False).fit(Y) + + +@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA]) +@pytest.mark.parametrize("n_components", [None, 3]) +def test_spca_n_components_(SPCA, n_components): + rng = np.random.RandomState(0) + n_samples, n_features = 12, 10 + X = rng.randn(n_samples, n_features) + + model = SPCA(n_components=n_components).fit(X) + + if n_components is not None: + assert model.n_components_ == n_components + else: + assert model.n_components_ == n_features From fc0041546a6e186bbacdf96e200fc863d620c44d Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 18:15:52 -0400 Subject: [PATCH 080/125] Deprecate class support for check_estimator (#17032) and parametrize_with_checks --- doc/developers/develop.rst | 8 ++-- doc/whats_new/v0.23.rst | 5 +++ sklearn/tests/test_common.py | 22 +++++++++- sklearn/utils/estimator_checks.py | 45 +++++++++++++++++--- sklearn/utils/tests/test_estimator_checks.py | 4 ++ 5 files changed, 74 insertions(+), 10 deletions(-) diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index f17c58cee0d7f..13d2010ca7319 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -246,13 +246,13 @@ whether it is just for you or for contributing it to scikit-learn, there are several internals of scikit-learn that you should be aware of in addition to the scikit-learn API outlined above. 
You can check whether your estimator adheres to the scikit-learn interface and standards by running -:func:`utils.estimator_checks.check_estimator` on the class or using -:func:`~sklearn.utils.parametrize_with_checks` pytest decorator (see its -docstring for details and possible interactions with `pytest`):: +:func:`~sklearn.utils.estimator_checks.check_estimator` on an instance. The +:func:`~sklearn.utils.parametrize_with_checks` pytest decorator can also be +used (see its docstring for details and possible interactions with `pytest`):: >>> from sklearn.utils.estimator_checks import check_estimator >>> from sklearn.svm import LinearSVC - >>> check_estimator(LinearSVC) # passes + >>> check_estimator(LinearSVC()) # passes The main motivation to make a class compatible to the scikit-learn estimator interface might be that you want to use it together with model evaluation and diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b5b95e745f456..b596e8a1797be 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -534,6 +534,11 @@ Changelog matrix from a pandas DataFrame that contains only `SparseArray`s. :pr:`16728` by `Thomas Fan`_. +- |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and + :func:`utils.estimator_checks.parametrize_with_checks` is now deprecated, + and support for classes will be removed in 0.24. Pass instances instead. + :pr:`17032` by `Nicolas Hug`_. + :mod:`sklearn.cluster` ...................... diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index af98c1bc50a74..73c99b0483de8 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -48,7 +48,9 @@ def test_all_estimator_no_base_class(): assert not name.lower().startswith('base'), msg +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_estimator_cls_parameterize_with_checks(): + # TODO: remove test in 0.24 # Non-regression test for #16707 to ensure that parametrize_with_checks # works with estimator classes param_checks = parametrize_with_checks([LogisticRegression]) @@ -105,7 +107,7 @@ def _tested_estimators(): yield estimator -@parametrize_with_checks(_tested_estimators()) +@parametrize_with_checks(list(_tested_estimators())) def test_estimators(estimator, check, request): # Common tests for estimator instances with ignore_warnings(category=(FutureWarning, @@ -115,7 +117,9 @@ def test_estimators(estimator, check, request): check(estimator) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator_generate_only(): + # TODO in 0.24: remove checks on passing a class estimator_cls_gen_checks = check_estimator(LogisticRegression, generate_only=True) all_instance_gen_checks = check_estimator(LogisticRegression(), @@ -238,3 +242,19 @@ def test_all_tests_are_importable(): '__init__.py or an add_subpackage directive ' 'in the parent ' 'setup.py'.format(missing_tests)) + + +# TODO: remove in 0.24 +def test_class_support_deprecated(): + # Make sure passing classes to check_estimator or parametrize_with_checks + # is deprecated + + msg = "Passing a class is deprecated" + with pytest.warns(FutureWarning, match=msg): + check_estimator(LogisticRegression) + + with pytest.warns(FutureWarning, match=msg): + parametrize_with_checks([LogisticRegression]) + + # Make sure check_parameters_default_constructible accepts instances now + check_parameters_default_constructible('name', LogisticRegression()) diff --git a/sklearn/utils/estimator_checks.py 
b/sklearn/utils/estimator_checks.py index efac2aca2a2df..ec28cb22919f0 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -33,7 +33,7 @@ from ..linear_model import Ridge from ..base import (clone, ClusterMixin, is_classifier, is_regressor, - RegressorMixin, is_outlier_detector) + RegressorMixin, is_outlier_detector, BaseEstimator) from ..metrics import accuracy_score, adjusted_rand_score, f1_score from ..random_projection import BaseRandomProjection @@ -333,12 +333,15 @@ def _construct_instance(Estimator): return estimator +# TODO: probably not needed anymore in 0.24 since _generate_class_checks should +# be removed too. Just put this in check_estimator() def _generate_instance_checks(name, estimator): """Generate instance checks.""" yield from ((estimator, partial(check, name)) for check in _yield_all_checks(name, estimator)) +# TODO: remove this in 0.24 def _generate_class_checks(Estimator): """Generate class checks.""" name = Estimator.__name__ @@ -353,6 +356,8 @@ def _mark_xfail_checks(estimator, check, pytest): if isinstance(estimator, type): # try to construct estimator instance, if it is unable to then # return the estimator class, ignoring the tag + # TODO: remove this if block in 0.24 since passing instances isn't + # supported anymore try: estimator = _construct_instance(estimator) except Exception: @@ -385,6 +390,10 @@ def parametrize_with_checks(estimators): estimators : list of estimators objects or classes Estimators to generated checks for. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + Returns ------- decorator : `pytest.mark.parametrize` @@ -395,13 +404,21 @@ def parametrize_with_checks(estimators): >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.tree import DecisionTreeRegressor - >>> @parametrize_with_checks([LogisticRegression, DecisionTreeRegressor]) + >>> @parametrize_with_checks([LogisticRegression(), + ... DecisionTreeRegressor()]) ... def test_sklearn_compatible_estimator(estimator, check): ... check(estimator) """ import pytest + if any(isinstance(est, type) for est in estimators): + # TODO: remove class support in 0.24 and update docstrings + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = chain.from_iterable( check_estimator(estimator, generate_only=True) for estimator in estimators) @@ -418,7 +435,7 @@ def check_estimator(Estimator, generate_only=False): """Check if estimator adheres to scikit-learn conventions. This estimator will run an extensive test-suite for input validation, - shapes, etc, making sure that the estimator complies with `scikit-leanrn` + shapes, etc, making sure that the estimator complies with `scikit-learn` conventions as detailed in :ref:`rolling_your_own_estimator`. Additional tests for classifiers, regressors, clustering or transformers will be run if the Estimator class inherits from the corresponding mixin @@ -426,7 +443,9 @@ def check_estimator(Estimator, generate_only=False): This test can be applied to classes or instances. Classes currently have some additional tests that related to construction, - while passing instances allows the testing of multiple options. + while passing instances allows the testing of multiple options. 
However, + support for classes is deprecated since version 0.23 and will be removed + in version 0.24 (class checks will still be run on the instances). Setting `generate_only=True` returns a generator that yields (estimator, check) tuples where the check can be called independently from each @@ -439,9 +458,13 @@ def check_estimator(Estimator, generate_only=False): Parameters ---------- - estimator : estimator object or class + estimator : estimator object Estimator to check. Estimator is a class object or instance. + .. deprecated:: 0.23 + Passing a class is deprecated from version 0.23, and won't be + supported in 0.24. Pass an instance instead. + generate_only : bool, optional (default=False) When `False`, checks are evaluated when `check_estimator` is called. When `True`, `check_estimator` returns a generator that yields @@ -456,8 +479,14 @@ def check_estimator(Estimator, generate_only=False): Generator that yields (estimator, check) tuples. Returned when `generate_only=True`. """ + # TODO: remove class support in 0.24 and update docstrings if isinstance(Estimator, type): # got a class + msg = ("Passing a class is deprecated since version 0.23 " + "and won't be supported in 0.24." + "Please pass an instance instead.") + warnings.warn(msg, FutureWarning) + checks_generator = _generate_class_checks(Estimator) else: # got an instance @@ -2570,6 +2599,12 @@ def check_parameters_default_constructible(name, Estimator): # this check works on classes, not instances # test default-constructibility # get rid of deprecation warnings + if isinstance(Estimator, BaseEstimator): + # Convert estimator instance to its class + # TODO: Always convert to class in 0.24, because check_estimator() will + # only accept instances, not classes + Estimator = Estimator.__class__ + with ignore_warnings(category=FutureWarning): estimator = _construct_instance(Estimator) # test cloning diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index a755daa842ef5..594ff65f9e889 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -356,6 +356,7 @@ def fit(self, X, y): check_fit_score_takes_y("test", TestEstimatorWithDeprecatedFitMethod()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator(): # tests that the estimator actually fails on "bad" estimators. # not a complete test of all checks, which are very extensive. 
@@ -579,7 +580,10 @@ def test_check_regressor_data_not_an_array(): EstimatorInconsistentForPandas()) +@ignore_warnings("Passing a class is depr", category=FutureWarning) # 0.24 def test_check_estimator_required_parameters_skip(): + # TODO: remove whole test in 0.24 since passes classes to check_estimator() + # isn't supported anymore class MyEstimator(BaseEstimator): _required_parameters = ["special_parameter"] From 964c830328b2ede02814f276cbbf23d16f0e8914 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 27 Apr 2020 18:22:21 -0400 Subject: [PATCH 081/125] MNT Change print_changed_only default to True (#17061) --- conftest.py | 10 ---------- doc/conf.py | 3 --- doc/whats_new/v0.23.rst | 8 ++++++++ sklearn/_config.py | 11 +++++++---- sklearn/tests/test_base.py | 4 ++-- sklearn/tests/test_config.py | 6 +++--- sklearn/utils/tests/conftest.py | 10 ++++++++++ sklearn/utils/tests/test_pprint.py | 19 ++++++++----------- 8 files changed, 38 insertions(+), 33 deletions(-) create mode 100644 sklearn/utils/tests/conftest.py diff --git a/conftest.py b/conftest.py index 2b9e87bf9f292..874931341e195 100644 --- a/conftest.py +++ b/conftest.py @@ -99,16 +99,6 @@ def pytest_unconfigure(config): del sys._is_pytest_session -def pytest_runtest_setup(item): - if isinstance(item, DoctestItem): - set_config(print_changed_only=True) - - -def pytest_runtest_teardown(item, nextitem): - if isinstance(item, DoctestItem): - set_config(print_changed_only=False) - - # TODO: Remove when modules are deprecated in 0.24 # Configures pytest to ignore deprecated modules. collect_ignore_glob = [ diff --git a/doc/conf.py b/doc/conf.py index 1783a676b6d01..d459cdfd3f1af 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -392,6 +392,3 @@ def setup(app): warnings.filterwarnings("ignore", category=UserWarning, message='Matplotlib is currently using agg, which is a' ' non-GUI backend, so cannot show the figure.') - -# Reduces the output of estimators -sklearn.set_config(print_changed_only=True) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index b596e8a1797be..5fc3922f1d457 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -564,6 +564,14 @@ Miscellaneous error message is raised when y was expected but None was passed. :pr:`16622` by `Nicolas Hug`_. +- |API| The default setting `print_changed_only` has been changed from False + to True. This means that the `repr` of estimators is now more concise and + only shows the parameters whose default value has been changed when + printing an estimator. You can restore the previous behaviour by using + `sklearn.set_config(print_changed_only=False)`. Also, note that it is + always possible to quickly inspect the parameters of any estimator using + `est.get_params(deep=False)`. :pr:`17061` by `Nicolas Hug`_. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/_config.py b/sklearn/_config.py index c7f3934ee1cb3..44eaae1d59012 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -6,7 +6,7 @@ _global_config = { 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)), - 'print_changed_only': False, + 'print_changed_only': True, } @@ -93,9 +93,12 @@ def config_context(**new_config): print_changed_only : bool, optional If True, only the parameters that were set to non-default values will be printed when printing an estimator. 
For example, - ``print(SVC())`` while True will only print 'SVC()' while the default - behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with - all the non-changed parameters. + ``print(SVC())`` while True will only print 'SVC()', but would print + 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters + when False. Default is True. + + .. versionchanged:: 0.23 + Default changed from False to True. Notes ----- diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 95f7b01f27058..52f2e60b4af70 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -211,10 +211,10 @@ def test_repr(): test = T(K(), K()) assert ( repr(test) == - "T(a=K(c=None, d=None), b=K(c=None, d=None))") + "T(a=K(), b=K())") some_est = T(a=["long_params"] * 1000) - assert len(repr(some_est)) == 495 + assert len(repr(some_est)) == 485 def test_str(): diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index dfa944110ad7a..ae13c61838694 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -4,7 +4,7 @@ def test_config_context(): assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} # Not using as a context manager affects nothing config_context(assume_finite=True) @@ -12,7 +12,7 @@ def test_config_context(): with config_context(assume_finite=True): assert get_config() == {'assume_finite': True, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} assert get_config()['assume_finite'] is False with config_context(assume_finite=True): @@ -37,7 +37,7 @@ def test_config_context(): assert get_config()['assume_finite'] is True assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': False} + 'print_changed_only': True} # No positional arguments assert_raises(TypeError, config_context, True) diff --git a/sklearn/utils/tests/conftest.py b/sklearn/utils/tests/conftest.py new file mode 100644 index 0000000000000..148225a481f69 --- /dev/null +++ b/sklearn/utils/tests/conftest.py @@ -0,0 +1,10 @@ +import pytest + +import sklearn + + +@pytest.fixture +def print_changed_only_false(): + sklearn.set_config(print_changed_only=False) + yield + sklearn.set_config(print_changed_only=True) # reset to default diff --git a/sklearn/utils/tests/test_pprint.py b/sklearn/utils/tests/test_pprint.py index 146ccf781ae8a..866d872a9b65c 100644 --- a/sklearn/utils/tests/test_pprint.py +++ b/sklearn/utils/tests/test_pprint.py @@ -174,7 +174,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.copy = copy -def test_basic(): +def test_basic(print_changed_only_false): # Basic pprint test lr = LogisticRegression() expected = """ @@ -189,8 +189,7 @@ def test_basic(): def test_changed_only(): - # Make sure the changed_only param is correctly used - set_config(print_changed_only=True) + # Make sure the changed_only param is correctly used when True (default) lr = LogisticRegression(C=99) expected = """LogisticRegression(C=99)""" assert lr.__repr__() == expected @@ -216,10 +215,8 @@ def test_changed_only(): # make sure array parameters don't throw error (see #13583) repr(LogisticRegressionCV(Cs=np.array([0.1, 1]))) - set_config(print_changed_only=False) - -def test_pipeline(): +def test_pipeline(print_changed_only_false): # Render a pipeline object pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999)) expected = """ @@ -240,7 +237,7 @@ def test_pipeline(): assert 
pipeline.__repr__() == expected -def test_deeply_nested(): +def test_deeply_nested(print_changed_only_false): # Render a deeply nested estimator rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression()))))))) expected = """ @@ -277,7 +274,7 @@ def test_deeply_nested(): assert rfe.__repr__() == expected -def test_gridsearch(): +def test_gridsearch(print_changed_only_false): # render a gridsearch param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, @@ -302,7 +299,7 @@ def test_gridsearch(): assert gs.__repr__() == expected -def test_gridsearch_pipeline(): +def test_gridsearch_pipeline(print_changed_only_false): # render a pipeline inside a gridsearch pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True) @@ -372,7 +369,7 @@ def test_gridsearch_pipeline(): assert repr_ == expected -def test_n_max_elements_to_show(): +def test_n_max_elements_to_show(print_changed_only_false): n_max_elements_to_show = 30 pp = _EstimatorPrettyPrinter( @@ -461,7 +458,7 @@ def test_n_max_elements_to_show(): assert pp.pformat(gs) == expected -def test_bruteforce_ellipsis(): +def test_bruteforce_ellipsis(print_changed_only_false): # Check that the bruteforce ellipsis (used when the number of non-blank # characters exceeds N_CHAR_MAX) renders correctly. From dbc35934a6ebf7d0bbfdfc80e8bf8a9fabc1ba4b Mon Sep 17 00:00:00 2001 From: brigi <58770308+brigitteunger@users.noreply.github.com> Date: Tue, 28 Apr 2020 00:43:47 +0200 Subject: [PATCH 082/125] DOC Version added and changed labels added for v0.18 (#wimlds) (#16222) * Versionlabels added to v0.18 (#wimlds) * documentation issues: label changed and added for version 0.18 * fix intends and shorten description Co-authored-by: Hannah <32333241+hhnnhh@users.noreply.github.com> Co-authored-by: Brigitte@home --- sklearn/cluster/_kmeans.py | 3 +++ sklearn/ensemble/_voting.py | 4 ++++ sklearn/feature_selection/_rfe.py | 2 ++ sklearn/feature_selection/_univariate_selection.py | 4 ++++ sklearn/linear_model/_ransac.py | 4 ++++ sklearn/metrics/_classification.py | 5 +++++ sklearn/metrics/cluster/_supervised.py | 2 ++ sklearn/model_selection/_split.py | 2 ++ sklearn/multioutput.py | 2 ++ sklearn/preprocessing/_function_transformer.py | 4 ++++ sklearn/svm/_classes.py | 4 ++++ 11 files changed, 36 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 21a604bed3eb5..8d24ed497aef3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -834,6 +834,9 @@ class KMeans(TransformerMixin, ClusterMixin, BaseEstimator): For now "auto" (kept for backward compatibiliy) chooses "elkan" but it might change in the future for a better heuristic. + .. versionchanged:: 0.18 + Added Elkan algorithm + Attributes ---------- cluster_centers_ : ndarray of shape (n_clusters, n_features) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index b044cb68e5151..0ac42407f5998 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -141,6 +141,8 @@ class VotingClassifier(ClassifierMixin, _BaseVoting): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionadded:: 0.18 + flatten_transform : bool, default=True Affects shape of transform output only when voting='soft' If voting='soft' and flatten_transform=True, transform method returns @@ -232,6 +234,8 @@ def fit(self, X, y, sample_weight=None): Note that this is supported only if all underlying estimators support sample weights. + .. 
versionadded:: 0.18 + Returns ------- self : object diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index aedcd94943bc4..8dc7aecb7dc3e 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -410,6 +410,8 @@ class RFECV(RFE): ``-1`` means using all processors. See :term:`Glossary ` for more details. + .. versionadded:: 0.18 + Attributes ---------- n_features_ : int diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 7e873b3a2b65c..6911830099844 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -384,6 +384,8 @@ class SelectPercentile(_BaseFilter): Default is f_classif (see below "See also"). The default function only works with classification tasks. + .. versionadded:: 0.18 + percentile : int, optional, default=10 Percent of features to keep. @@ -467,6 +469,8 @@ class SelectKBest(_BaseFilter): Default is f_classif (see below "See also"). The default function only works with classification tasks. + .. versionadded:: 0.18 + k : int or "all", optional, default=10 Number of top features to select. The "all" option bypasses selection, for use in a parameter search. diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 5eac651c76383..fffa29d47d91c 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -150,6 +150,8 @@ class RANSACRegressor(MetaEstimatorMixin, RegressorMixin, If the loss on a sample is greater than the ``residual_threshold``, then this sample is classified as an outlier. + .. versionadded:: 0.18 + random_state : int, RandomState instance, default=None The generator used to initialize the centers. Pass an int for reproducible output across multiple function calls. @@ -239,6 +241,8 @@ def fit(self, X, y, sample_weight=None): raises error if sample_weight is passed and base_estimator fit method does not support it. + .. versionadded:: 0.18 + Raises ------ ValueError diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py index b8a1a8e5e22b4..2ceccca65203e 100644 --- a/sklearn/metrics/_classification.py +++ b/sklearn/metrics/_classification.py @@ -227,6 +227,8 @@ def confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + normalize : {'true', 'pred', 'all'}, default=None Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. If None, confusion matrix will not be @@ -789,6 +791,8 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None): sample_weight : array-like of shape (n_samples,), default=None Sample weights. + .. versionadded:: 0.18 + Returns ------- mcc : float @@ -2156,6 +2160,7 @@ def log_loss(y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, If not provided, labels will be inferred from y_true. If ``labels`` is ``None`` and ``y_pred`` has shape (n_samples,) the labels are assumed to be binary and are inferred from ``y_true``. + .. 
versionadded:: 0.18 Returns diff --git a/sklearn/metrics/cluster/_supervised.py b/sklearn/metrics/cluster/_supervised.py index 8a0fdcacb67f1..d652737bd23c0 100644 --- a/sklearn/metrics/cluster/_supervised.py +++ b/sklearn/metrics/cluster/_supervised.py @@ -881,6 +881,8 @@ def normalized_mutual_info_score(labels_true, labels_pred, *, def fowlkes_mallows_score(labels_true, labels_pred, *, sparse=False): """Measure the similarity of two clusterings of a set of points. + .. versionadded:: 0.18 + The Fowlkes-Mallows index (FMI) is defined as the geometric mean between of the precision and recall:: diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index edcb9b375ae79..9b2087e039f40 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -739,6 +739,8 @@ def split(self, X, y, groups=None): class TimeSeriesSplit(_BaseKFold): """Time Series cross-validator + .. versionadded:: 0.18 + Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets. In each split, test indices must be higher than before, and thus shuffling diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 815c1cbd67757..a5ede43f0fe8c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -215,6 +215,8 @@ class MultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): simple strategy for extending regressors that do not natively support multi-target regression. + .. versionadded:: 0.18 + Parameters ---------- estimator : estimator object diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 21dd40365f5a0..c4e6782b7cb19 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -66,9 +66,13 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): kw_args : dict, optional Dictionary of additional keyword arguments to pass to func. + .. versionadded:: 0.18 + inv_kw_args : dict, optional Dictionary of additional keyword arguments to pass to inverse_func. + .. versionadded:: 0.18 + Examples -------- >>> import numpy as np diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 5ff6e74825e50..d082c22d0a3bc 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -213,6 +213,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. versionadded:: 0.18 + Returns ------- self : object @@ -398,6 +400,8 @@ def fit(self, X, y, sample_weight=None): samples. If not provided, then each sample is given unit weight. + .. 
versionadded:: 0.18 + Returns ------- self : object From fb76de72e7560aaa739872e94bee761777e54c0a Mon Sep 17 00:00:00 2001 From: maikia Date: Tue, 28 Apr 2020 02:25:24 +0200 Subject: [PATCH 083/125] DOC Exchanging Boston for california dataset in plot missing values (#16513) * first few comments * added new california dataset * removed boston dataset from the file * updating the DOCs * adding a DOC for calculating the error * exchanged the order started writing functions on scoring the imputers * finished writing functions for imputers * finished writing functions and started on DOcs * working on the DOCs for imputers * cleaning up * flake8 * cleaning up * cleaning up * restructuring the document * further text restructuring * text restructuring * flake8 * reformatting * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Olivier Grisel * updated the intro * improve bullet point rendering * spelling * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * changed the naming * restructuring text * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * changing missing values from 0 to nan * Update examples/impute/plot_missing_values.py Co-Authored-By: Alexandre Gramfort * REGRESSOR to regressor * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * Update examples/impute/plot_missing_values.py Co-Authored-By: Lucy Liu * flake8 * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * Update examples/impute/plot_missing_values.py Co-Authored-By: Thomas J Fan * flake8 * reducting number of samples used from california dataset * CLN Removes the need for MissingIndicator * FIX Unrelated bug but is stopping the CI from passing Co-authored-by: Olivier Grisel Co-authored-by: Alexandre Gramfort 
Co-authored-by: Lucy Liu Co-authored-by: Thomas J Fan --- doc/developers/contributing.rst | 5 +- examples/impute/plot_missing_values.py | 285 ++++++++++++++++++------- 2 files changed, 214 insertions(+), 76 deletions(-) diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index c886119e908c1..e13b6850d50eb 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -434,8 +434,9 @@ You can check for common programming errors with the following tools: must not produce new errors in your pull request. Using `# type: ignore` annotation can be a workaround for a few cases that are not supported by mypy, in particular, - - when importing C or Cython modules - - on properties with decorators + + - when importing C or Cython modules + - on properties with decorators Bonus points for contributions that include a performance analysis with a benchmark script and profiling output (please report on the mailing diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 5186cf0ba3bac..2ba7dc05d16b6 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -5,122 +5,255 @@ Missing values can be replaced by the mean, the median or the most frequent value using the basic :class:`sklearn.impute.SimpleImputer`. -The median is a more robust estimator for data with high magnitude variables -which could dominate results (otherwise known as a 'long tail'). -With ``KNNImputer``, missing values can be imputed using the weighted -or unweighted mean of the desired number of nearest neighbors. +In this example we will investigate different imputation techniques: -Another option is the :class:`sklearn.impute.IterativeImputer`. This uses -round-robin linear regression, treating every variable as an output in -turn. The version implemented assumes Gaussian (output) variables. If your -features are obviously non-Normal, consider transforming them to look more -Normal so as to potentially improve performance. +- imputation by the constant value 0 +- imputation by the mean value of each feature combined with a missing-ness + indicator auxiliary variable +- k nearest neighbor imputation +- iterative imputation + +We will use two datasets: Diabetes dataset which consists of 10 feature +variables collected from diabetes patients with an aim to predict disease +progression and California Housing dataset for which the target is the median +house value for California districts. + +As neither of these datasets have missing values, we will remove some +values to create new versions with artificially missing data. The performance +of +:class:`~sklearn.ensemble.RandomForestRegressor` on the full original dataset +is then compared the performance on the altered datasets with the artificially +missing values imputed using different techniques. -In addition of using an imputing method, we can also keep an indication of the -missing information using :func:`sklearn.impute.MissingIndicator` which might -carry some information. """ print(__doc__) +# Authors: Maria Telenczuk +# License: BSD 3 clause + +############################################################################### +# Download the data and make missing values sets +################################################ +# +# First we download the two datasets. Diabetes dataset is shipped with +# scikit-learn. It has 442 entries, each with 10 features. California Housing +# dataset is much larger with 20640 entries and 8 features. 
It needs to be +# downloaded. We will only use the first 400 entries for the sake of speeding +# up the calculations but feel free to use the whole dataset. +# + import numpy as np -import matplotlib.pyplot as plt -# To use the experimental IterativeImputer, we need to explicitly ask for it: -from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.datasets import fetch_california_housing from sklearn.datasets import load_diabetes -from sklearn.datasets import load_boston + + +rng = np.random.RandomState(42) + +X_diabetes, y_diabetes = load_diabetes(return_X_y=True) +X_california, y_california = fetch_california_housing(return_X_y=True) +X_california = X_california[:400] +y_california = y_california[:400] + + +def add_missing_values(X_full, y_full): + n_samples, n_features = X_full.shape + + # Add missing values in 75% of the lines + missing_rate = 0.75 + n_missing_samples = int(n_samples * missing_rate) + + missing_samples = np.zeros(n_samples, dtype=np.bool) + missing_samples[: n_missing_samples] = True + + rng.shuffle(missing_samples) + missing_features = rng.randint(0, n_features, n_missing_samples) + X_missing = X_full.copy() + X_missing[missing_samples, missing_features] = np.nan + y_missing = y_full.copy() + + return X_missing, y_missing + + +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california) + +X_miss_diabetes, y_miss_diabetes = add_missing_values( + X_diabetes, y_diabetes) + + +############################################################################### +# Impute the missing data and score +# ################################# +# Now we will write a function which will score the results on the differently +# imputed data. Let's look at each imputer separately: +# + +rng = np.random.RandomState(0) + from sklearn.ensemble import RandomForestRegressor -from sklearn.pipeline import make_pipeline, make_union -from sklearn.impute import ( - SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator) + +# To use the experimental IterativeImputer, we need to explicitly ask for it: +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer from sklearn.model_selection import cross_val_score +from sklearn.pipeline import make_pipeline -rng = np.random.RandomState(0) N_SPLITS = 5 -REGRESSOR = RandomForestRegressor(random_state=0) +regressor = RandomForestRegressor(random_state=0) + +############################################################################### +# Missing information +# ------------------- +# In addition to imputing the missing values, the imputers have an +# `add_indicator` parameter that marks the values that were missing, which +# might carry some information. 
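As a small hedged illustration of the `add_indicator` behaviour mentioned above (not part of the example script itself): the transformed output gains one boolean indicator column per feature that contained missing values during fit.

import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [np.nan, 3.0],
              [4.0, 5.0]])
imputer = SimpleImputer(strategy="mean", add_indicator=True)
# The first two columns hold the imputed values; the last two columns are the
# missing-ness indicators for the two features that had NaNs.
print(imputer.fit_transform(X))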
+# def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline( - make_union(imputer, MissingIndicator(missing_values=0)), - REGRESSOR) + estimator = make_pipeline(imputer, regressor) impute_scores = cross_val_score(estimator, X_missing, y_missing, scoring='neg_mean_squared_error', cv=N_SPLITS) return impute_scores -def get_results(dataset): - X_full, y_full = dataset.data, dataset.target - n_samples = X_full.shape[0] - n_features = X_full.shape[1] +x_labels = ['Full data', + 'Zero imputation', + 'Mean Imputation', + 'KNN Imputation', + 'Iterative Imputation'] + +mses_california = np.zeros(5) +stds_california = np.zeros(5) +mses_diabetes = np.zeros(5) +stds_diabetes = np.zeros(5) + +############################################################################### +# Estimate the score +# ------------------ +# First, we want to estimate the score on the original data: +# - # Estimate the score on the entire dataset, with no missing values - full_scores = cross_val_score(REGRESSOR, X_full, y_full, + +def get_full_score(X_full, y_full): + full_scores = cross_val_score(regressor, X_full, y_full, scoring='neg_mean_squared_error', cv=N_SPLITS) + return full_scores.mean(), full_scores.std() - # Add missing values in 75% of the lines - missing_rate = 0.75 - n_missing_samples = int(np.floor(n_samples * missing_rate)) - missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) - rng.shuffle(missing_samples) - missing_features = rng.randint(0, n_features, n_missing_samples) - X_missing = X_full.copy() - X_missing[np.where(missing_samples)[0], missing_features] = 0 - y_missing = y_full.copy() - # Estimate the score after replacing missing values by 0 - imputer = SimpleImputer(missing_values=0, - strategy='constant', - fill_value=0) +mses_california[0], stds_california[0] = get_full_score(X_california, + y_california) +mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) + + +############################################################################### +# Replace missing values by 0 +# --------------------------- +# +# Now we will estimate the score on the data where the missing values are +# replaced by 0: +# + + +def get_impute_zero_score(X_missing, y_missing): + + imputer = SimpleImputer(missing_values=np.nan, add_indicator=True, + strategy='constant', fill_value=0) zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return zero_impute_scores.mean(), zero_impute_scores.std() - # Estimate the score after imputation (mean strategy) of the missing values - imputer = SimpleImputer(missing_values=0, strategy="mean") - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - # Estimate the score after kNN-imputation of the missing values - imputer = KNNImputer(missing_values=0) +mses_california[1], stds_california[1] = get_impute_zero_score( + X_miss_california, y_miss_california) +mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# kNN-imputation of the missing values +# ------------------------------------ +# +# :class:`sklearn.impute.KNNImputer` imputes missing values using the weighted +# or unweighted mean of the desired number of nearest neighbors. 
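A minimal sketch of the kNN strategy described in the comment above (illustration only, not part of the patch): each missing entry is filled with the mean of that feature over the `n_neighbors` nearest samples.

import numpy as np
from sklearn.impute import KNNImputer

X = [[1, 2, np.nan],
     [3, 4, 3],
     [np.nan, 6, 5],
     [8, 8, 7]]
# With n_neighbors=2, each NaN is replaced by the mean of that feature over the
# two nearest rows, where distances are computed on the non-missing coordinates.
print(KNNImputer(n_neighbors=2).fit_transform(X))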
+ +def get_impute_knn_score(X_missing, y_missing): + imputer = KNNImputer(missing_values=np.nan, add_indicator=True) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return knn_impute_scores.mean(), knn_impute_scores.std() - # Estimate the score after iterative imputation of the missing values - imputer = IterativeImputer(missing_values=0, - random_state=0, - n_nearest_features=5, + +mses_california[2], stds_california[2] = get_impute_knn_score( + X_miss_california, y_miss_california) +mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Impute missing values with mean +# ------------------------------- +# + +def get_impute_mean(X_missing, y_missing): + imputer = SimpleImputer(missing_values=np.nan, strategy="mean", + add_indicator=True) + mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return mean_impute_scores.mean(), mean_impute_scores.std() + + +mses_california[3], stds_california[3] = get_impute_mean(X_miss_california, + y_miss_california) +mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, + y_miss_diabetes) + + +############################################################################### +# Iterative imputation of the missing values +# ------------------------------------------ +# +# Another option is the :class:`sklearn.impute.IterativeImputer`. This uses +# round-robin linear regression, modeling each feature with missing values as a +# function of other features, in turn. +# The version implemented assumes Gaussian (output) variables. If your features +# are obviously non-normal, consider transforming them to look more normal +# to potentially improve performance. 
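A short sketch of the round-robin idea (illustration only, not part of the patch): each feature with missing values is regressed on the remaining features and the predictions fill the gaps, repeated for up to `max_iter` rounds.

import numpy as np
# IterativeImputer is still experimental and must be enabled explicitly:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

X = [[1, 2, np.nan],
     [3, np.nan, 3],
     [np.nan, 6, 5],
     [8, 8, 7]]
print(IterativeImputer(max_iter=10, random_state=0).fit_transform(X))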
+# + +def get_impute_iterative(X_missing, y_missing): + imputer = IterativeImputer(missing_values=np.nan, add_indicator=True, + random_state=0, n_nearest_features=5, sample_posterior=True) iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) + return iterative_impute_scores.mean(), iterative_impute_scores.std() - return ((full_scores.mean(), full_scores.std()), - (zero_impute_scores.mean(), zero_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), - (knn_impute_scores.mean(), knn_impute_scores.std()), - (iterative_impute_scores.mean(), iterative_impute_scores.std())) +mses_california[4], stds_california[4] = get_impute_iterative( + X_miss_california, y_miss_california) +mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(X_miss_diabetes, + y_miss_diabetes) -results_diabetes = np.array(get_results(load_diabetes())) -mses_diabetes = results_diabetes[:, 0] * -1 -stds_diabetes = results_diabetes[:, 1] +mses_diabetes = mses_diabetes * -1 +mses_california = mses_california * -1 + +############################################################################### +# Plot the results +# ################ +# +# Finally we are going to visualize the score: +# + +import matplotlib.pyplot as plt -results_boston = np.array(get_results(load_boston())) -mses_boston = results_boston[:, 0] * -1 -stds_boston = results_boston[:, 1] n_bars = len(mses_diabetes) xval = np.arange(n_bars) -x_labels = ['Full data', - 'Zero imputation', - 'Mean Imputation', - 'KNN Imputation', - 'Iterative Imputation'] colors = ['r', 'g', 'b', 'orange', 'black'] # plot diabetes results @@ -138,16 +271,20 @@ def get_results(dataset): ax1.invert_yaxis() ax1.set_yticklabels(x_labels) -# plot boston results +# plot california dataset results ax2 = plt.subplot(122) for j in xval: - ax2.barh(j, mses_boston[j], xerr=stds_boston[j], + ax2.barh(j, mses_california[j], xerr=stds_california[j], color=colors[j], alpha=0.6, align='center') -ax2.set_title('Imputation Techniques with Boston Data') +ax2.set_title('Imputation Techniques with California Data') ax2.set_yticks(xval) ax2.set_xlabel('MSE') ax2.invert_yaxis() ax2.set_yticklabels([''] * n_bars) plt.show() + +# You can also try different techniques. For instance, the median is a more +# robust estimator for data with high magnitude variables which could dominate +# results (otherwise known as a 'long tail'). From 8b4d4f4aaf10fb0bdcc5829df9f265b363ce7a8e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 01:47:21 -0400 Subject: [PATCH 084/125] DOC Add whats new for missing PRs (#17066) --- doc/whats_new/v0.23.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 5fc3922f1d457..02ef6a9e94408 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -275,6 +275,11 @@ Changelog with log-link useful for modeling count data. :pr:`16692` by :user:`Christian Lorentzen ` +- |Fix| Fixed a bug where :class:`ensemble.HistGradientBoostingRegressor` and + :class:`ensemble.HistGradientBoostingClassifier` would fail with multiple + calls to fit when `warm_start=True`, `early_stopping=True`, and there is no + validation set. :pr:`16663` by `Thomas Fan`_. + :mod:`sklearn.feature_extraction` ................................. @@ -283,11 +288,16 @@ Changelog for datasets with large vocabularies combined with ``min_df`` or ``max_df``. :pr:`15834` by :user:`Santiago M. Mola `. +:mod:`sklearn.feature_selection` +................................ 
- |Enhancement| Added support for multioutput data in :class:`feature_selection.RFE` and :class:`feature_selection.RFECV`. :pr:`16103` by :user:`Divyaprabha M `. +- |API| Adds :class:`feature_selection.SelectorMixin` back to public API. + :pr:`16132` by :user:`trimeta`. + :mod:`sklearn.gaussian_process` ............................... @@ -441,6 +451,11 @@ Changelog :mod:`sklearn.neural_network` ............................. +- |Efficiency| :class:`neural_network.MLPClassifier` and + :class:`neural_network.MLPRegressor` has reduced memory footprint when using + stochastic solvers, `'sgd'` or `'adam'`, and `shuffle=True`. :pr:`14075` by + :user:`meyer89`. + - |Fix| Increases the numerical stability of the logistic loss function in :class:`neural_network.MLPClassifier` by clipping the probabilities. :pr:`16117` by `Thomas Fan`_. @@ -460,6 +475,10 @@ Changelog each feature with two categories. :pr:`16245` by :user:`Rushabh Vasani `. +- |Enhancement| :class:`preprocessing.OneHotEncoder`'s `drop_idx_` ndarray + can now contain `None`, where `drop_idx_[i] = None` means that no category + is dropped for index `i`. :pr:`16585` by :user:`Chiara Marmo `. + - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. @@ -472,6 +491,13 @@ Changelog normalizing the vectors. :pr:`16632` by :user:`Maura Pintor ` and :user:`Battista Biggio `. +:mod:`sklearn.semi_supervised` +.............................. + +- |Fix| :class:`semi_supervised.LabelSpreading` and + :class:`semi_supervised.LabelPropagation` avoids divide by zero warnings + when normalizing `label_distributions_`. :pr:`15946` by :user:`ngshya`. + :mod:`sklearn.svm` .................. @@ -539,6 +565,9 @@ Changelog and support for classes will be removed in 0.24. Pass instances instead. :pr:`17032` by `Nicolas Hug`_. +- |FIX| :func:`utils.all_estimators` now only returns public estimators. + :pr:`15380` by `Thomas Fan`_. + :mod:`sklearn.cluster` ...................... From 3deacb98c755d0b8e46e298ff3ea918b0e0e4204 Mon Sep 17 00:00:00 2001 From: "Gregory R. 
Lee" Date: Tue, 28 Apr 2020 06:19:52 -0400 Subject: [PATCH 085/125] MNT consistently call import_array() after cimport of numpy (#17054) --- sklearn/_isotonic.pyx | 2 ++ sklearn/cluster/_dbscan_inner.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_binning.pyx | 3 +++ .../ensemble/_hist_gradient_boosting/_gradient_boosting.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_loss.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/common.pxd | 2 ++ sklearn/ensemble/_hist_gradient_boosting/histogram.pyx | 2 ++ sklearn/ensemble/_hist_gradient_boosting/splitting.pyx | 4 +++- sklearn/linear_model/_sag_fast.pyx.tp | 1 + sklearn/manifold/_barnes_hut_tsne.pyx | 2 ++ sklearn/manifold/_utils.pyx | 4 ++++ sklearn/neighbors/_typedefs.pyx | 3 +++ sklearn/preprocessing/_csr_polynomial_expansion.pyx | 1 + sklearn/svm/_libsvm_sparse.pyx | 4 +++- sklearn/utils/_logistic_sigmoid.pyx | 1 + 16 files changed, 35 insertions(+), 2 deletions(-) diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index c7486097df854..75c4bbef11379 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -11,6 +11,8 @@ cimport numpy as np cimport cython from cython cimport floating +np.import_array() + def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w): cdef: diff --git a/sklearn/cluster/_dbscan_inner.pyx b/sklearn/cluster/_dbscan_inner.pyx index a348bf59d6717..b9a80686a76f8 100644 --- a/sklearn/cluster/_dbscan_inner.pyx +++ b/sklearn/cluster/_dbscan_inner.pyx @@ -9,6 +9,8 @@ from libcpp.vector cimport vector cimport numpy as np import numpy as np +np.import_array() + # Work around Cython bug: C++ exceptions are not caught unless thrown within # a cdef function with an "except +" declaration. 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx index 1ecee3c9ee27e..4e11abfcabdf8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_binning.pyx @@ -16,6 +16,9 @@ from libc.math cimport isnan from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C +np.import_array() + + def _map_to_bins(const X_DTYPE_C [:, :] data, list binning_thresholds, const unsigned char missing_values_bin_idx, diff --git a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx index 8d307c3806532..18f1b6a365421 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_gradient_boosting.pyx @@ -13,6 +13,8 @@ cimport numpy as np from .common import Y_DTYPE from .common cimport Y_DTYPE_C +np.import_array() + def _update_raw_predictions( Y_DTYPE_C [::1] raw_predictions, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx index 64480911439e5..4114cd24aa8df 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx @@ -15,6 +15,8 @@ from libc.math cimport exp, log from .common cimport Y_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + def _update_gradients_least_squares( G_H_DTYPE_C [::1] gradients, # OUT diff --git a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx index b3234cb5ba945..d346aabdac070 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx @@ -18,6 +18,8 @@ from .common import Y_DTYPE from .common cimport X_BINNED_DTYPE_C from .common cimport node_struct +np.import_array() + def _predict_from_numeric_data( node_struct [:] nodes, diff --git a/sklearn/ensemble/_hist_gradient_boosting/common.pxd b/sklearn/ensemble/_hist_gradient_boosting/common.pxd index 60399c2fbdd70..161ad114829fe 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/common.pxd +++ b/sklearn/ensemble/_hist_gradient_boosting/common.pxd @@ -2,6 +2,8 @@ import numpy as np cimport numpy as np +np.import_array() + ctypedef np.npy_float64 X_DTYPE_C ctypedef np.npy_uint8 X_BINNED_DTYPE_C diff --git a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx index 740e5e002cf4e..8bd7c4ee8b350 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/histogram.pyx @@ -17,6 +17,8 @@ from .common cimport hist_struct from .common cimport X_BINNED_DTYPE_C from .common cimport G_H_DTYPE_C +np.import_array() + # Notes: # - IN views are read-only, OUT views are write-only # - In a lot of functions here, we pass feature_idx and the whole 2d diff --git a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx index 43405551ef357..984cc6767facf 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx +++ b/sklearn/ensemble/_hist_gradient_boosting/splitting.pyx @@ -27,6 +27,8 @@ from .common cimport hist_struct from .common import HISTOGRAM_DTYPE from .common cimport MonotonicConstraint +np.import_array() + cdef struct split_info_struct: # Same as the SplitInfo class, but we need a C struct to use it in the @@ -809,7 +811,7 @@ cpdef 
inline Y_DTYPE_C compute_node_value( """ cdef: - Y_DTYPE_C value + Y_DTYPE_C value value = -sum_gradient / (sum_hessian + l2_regularization + 1e-15) diff --git a/sklearn/linear_model/_sag_fast.pyx.tp b/sklearn/linear_model/_sag_fast.pyx.tp index 5758a8e5ee34c..141890497fcd2 100644 --- a/sklearn/linear_model/_sag_fast.pyx.tp +++ b/sklearn/linear_model/_sag_fast.pyx.tp @@ -58,6 +58,7 @@ from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64 from libc.stdio cimport printf +np.import_array() {{for name, c_type, np_type in get_dispatch(dtypes)}} diff --git a/sklearn/manifold/_barnes_hut_tsne.pyx b/sklearn/manifold/_barnes_hut_tsne.pyx index ec80890fd8a58..b15462e597684 100644 --- a/sklearn/manifold/_barnes_hut_tsne.pyx +++ b/sklearn/manifold/_barnes_hut_tsne.pyx @@ -18,6 +18,8 @@ from cython.parallel cimport prange, parallel from ..neighbors._quad_tree cimport _QuadTree +np.import_array() + cdef char* EMPTY_STRING = "" diff --git a/sklearn/manifold/_utils.pyx b/sklearn/manifold/_utils.pyx index 676d3676fb8c1..0cc2b0af137cc 100644 --- a/sklearn/manifold/_utils.pyx +++ b/sklearn/manifold/_utils.pyx @@ -5,6 +5,10 @@ cimport cython import numpy as np cimport numpy as np from libc.stdio cimport printf + +np.import_array() + + cdef extern from "numpy/npy_math.h": float NPY_INFINITY diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/neighbors/_typedefs.pyx index bbdfd00505b43..789afb4997dd1 100644 --- a/sklearn/neighbors/_typedefs.pyx +++ b/sklearn/neighbors/_typedefs.pyx @@ -4,6 +4,9 @@ import numpy as np cimport numpy as np from libc.math cimport sqrt +np.import_array() + + # use a hack to determine the associated numpy data types # NOTE: the following requires the buffer interface, only available in # numpy 1.5+. We'll choose the DTYPE by hand instead. diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index dd36f8321410f..84fef3f042dc7 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -8,6 +8,7 @@ from scipy.sparse import csr_matrix from numpy cimport ndarray cimport numpy as np +np.import_array() ctypedef np.int32_t INDEX_T ctypedef fused DATA_T: diff --git a/sklearn/svm/_libsvm_sparse.pyx b/sklearn/svm/_libsvm_sparse.pyx index f180560f1d1e7..4b5070a64aad8 100644 --- a/sklearn/svm/_libsvm_sparse.pyx +++ b/sklearn/svm/_libsvm_sparse.pyx @@ -4,6 +4,8 @@ cimport numpy as np from scipy import sparse from ..exceptions import ConvergenceWarning +np.import_array() + cdef extern from *: ctypedef char* const_char_p "const char*" @@ -186,7 +188,7 @@ def libsvm_sparse_train ( int n_features, # copy model.nSV # TODO: do only in classification - cdef np.ndarray n_class_SV + cdef np.ndarray n_class_SV n_class_SV = np.empty(n_class, dtype=np.int32) copy_nSV(n_class_SV.data, model) diff --git a/sklearn/utils/_logistic_sigmoid.pyx b/sklearn/utils/_logistic_sigmoid.pyx index 4ca32193c5ce6..3531d99bc4f44 100644 --- a/sklearn/utils/_logistic_sigmoid.pyx +++ b/sklearn/utils/_logistic_sigmoid.pyx @@ -7,6 +7,7 @@ from libc.math cimport log, exp import numpy as np cimport numpy as np +np.import_array() ctypedef np.float64_t DTYPE_t From 1d3a553b2dfbe5cc8d32b306fe62855671fe9ae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9mie=20du=20Boisberranger?= <34657725+jeremiedbb@users.noreply.github.com> Date: Tue, 28 Apr 2020 12:57:01 +0200 Subject: [PATCH 086/125] [MRG] Fix LinearModelsCV for loky backend. 
(#14264) --- doc/whats_new/v0.23.rst | 6 +++++ sklearn/linear_model/_coordinate_descent.py | 9 ++++++++ .../tests/test_coordinate_descent.py | 23 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 02ef6a9e94408..d3a3de4d7153b 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -384,6 +384,12 @@ Changelog of non-zero coefficients and in the predicted output. :pr:`16849` by `Nicolas Hug`_. +- |Fix| Fixed a bug in :class:`linear_model.ElasticNetCV`, + :class:`linear_model.MultitaskElasticNetCV`, :class:`linear_model.LassoCV` + and :class:`linear_model.MultitaskLassoCV` where fitting would fail when + using joblib loky backend. :pr:`14264` by + :user:`Jérémie du Boisberranger `. + :mod:`sklearn.metrics` ...................... diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index cd57b9612b362..2d8567b04db56 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -1068,6 +1068,15 @@ def _path_residuals(X, y, train, test, path, path_params, alphas=None, y_train = y[train] X_test = X[test] y_test = y[test] + + if not sparse.issparse(X): + for array, array_input in ((X_train, X), (y_train, y), + (X_test, X), (y_test, y)): + if array.base is not array_input and not array.flags['WRITEABLE']: + # fancy indexing should create a writable copy but it doesn't + # for read-only memmaps (cf. numpy#14132). + array.setflags(write=True) + fit_intercept = path_params['fit_intercept'] normalize = path_params['normalize'] diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index fdc49599788fe..142c1e9ac2a47 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -7,8 +7,11 @@ import pytest from scipy import interpolate, sparse from copy import deepcopy +import joblib +from distutils.version import LooseVersion from sklearn.datasets import load_boston +from sklearn.datasets import make_regression from sklearn.exceptions import ConvergenceWarning from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_almost_equal @@ -1020,3 +1023,23 @@ def test_enet_sample_weight_sparse(): with pytest.raises(ValueError, match="Sample weights do not.*support " "sparse matrices"): reg.fit(X, y, sample_weight=sw, check_input=True) + + +@pytest.mark.parametrize("backend", ["loky", "threading"]) +@pytest.mark.parametrize("estimator", + [ElasticNetCV, MultiTaskElasticNetCV, + LassoCV, MultiTaskLassoCV]) +def test_linear_models_cv_fit_for_all_backends(backend, estimator): + # LinearModelsCV.fit performs inplace operations on input data which is + # memmapped when using loky backend, causing an error due to unexpected + # behavior of fancy indexing of read-only memmaps (cf. numpy#14132). + + if joblib.__version__ < LooseVersion('0.12') and backend == 'loky': + pytest.skip('loky backend does not exist in joblib <0.12') + + # Create a problem sufficiently large to cause memmapping (1MB). 
+ n_targets = 1 + (estimator in (MultiTaskElasticNetCV, MultiTaskLassoCV)) + X, y = make_regression(20000, 10, n_targets=n_targets) + + with joblib.parallel_backend(backend=backend): + estimator(n_jobs=2, cv=3).fit(X, y) From 54354083eb0d749391d6b51480216a7f87747049 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 28 Apr 2020 20:59:32 +1000 Subject: [PATCH 087/125] DOC markup fixes for change log --- doc/whats_new/v0.23.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index d3a3de4d7153b..788f9fe6837b4 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -176,7 +176,7 @@ Changelog - |Enhancement| Added ``return_centers`` parameter in :func:`datasets.make_blobs`, which can be used to return centers for each cluster. - :pr:`15709` by :user:`` and + :pr:`15709` by :user:`shivamgargsya` and :user:`Venkatachalam N `. - |Enhancement| Functions :func:`datasets.make_circles` and @@ -198,8 +198,8 @@ Changelog ``csc`` matrices. :pr:`16837` by :user:`wornbb`. - |Fix| :class:`decomposition.PCA` with a float `n_components` parameter, will - exclusively choose the components that explain the variance greater than - `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` + exclusively choose the components that explain the variance greater than + `n_components`. :pr:`15669` by :user:`Krishna Chaitanya ` - |Fix| :class:`decomposition.PCA` with `n_components='mle'` now correctly handles small eigenvalues, and does not infer 0 as the correct number of @@ -302,7 +302,7 @@ Changelog ............................... - |Enhancement| :func:`gaussian_process.kernels.Matern` returns the RBF kernel when ``nu=np.inf``. - :pr:`15503` by :user:`Sam Dixon` . + :pr:`15503` by :user:`Sam Dixon `. - |Fix| Fixed bug in :class:`gaussian_process.GaussianProcessRegressor` that caused predicted standard deviations to only be between 0 and 1 when @@ -336,7 +336,7 @@ Changelog and `Olivier Grisel`_. - |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and - :class:`linear_model:Lasso` for dense feature matrix `X`. + :class:`linear_model.Lasso` for dense feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen `. - |Efficiency| :class:`linear_model.RidgeCV` and @@ -558,12 +558,12 @@ Changelog - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. :pr:`15926` by :user:`Loïc Estève `. -- |Enhancement| add warning in :func:`utils.validation.check_array` for +- |Enhancement| add warning in :func:`utils.check_array` for pandas sparse DataFrame. :pr:`16021` by :user:`Rushabh Vasani `. -- |Enhancement| :func:`utils.validation.check_array` now constructs a sparse - matrix from a pandas DataFrame that contains only `SparseArray`s. +- |Enhancement| :func:`utils.check_array` now constructs a sparse + matrix from a pandas DataFrame that contains only `SparseArray` columns. :pr:`16728` by `Thomas Fan`_. 
- |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and From 9f015c8a14a67d248599dc376d33ec612dd9dbb9 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 11:23:44 -0400 Subject: [PATCH 088/125] FIX Mixed bool dtype in pandas (#17008) --- sklearn/utils/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 8ee18371a3009..1fde1f0d69fb1 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -507,7 +507,7 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # pandas boolean dtype __array__ interface coerces bools to objects for i, dtype_iter in enumerate(dtypes_orig): if dtype_iter.kind == 'b': - dtypes_orig[i] = np.object + dtypes_orig[i] = np.dtype(np.object) if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) From acbe13c07d204e6ba463f41d2fd39b2a1c776f20 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 13:00:12 -0400 Subject: [PATCH 089/125] ENH Adds pandas IntegerArray support to check_array (#16508) --- doc/whats_new/v0.23.rst | 20 +++++++ sklearn/impute/_base.py | 9 ++- sklearn/impute/_iterative.py | 4 +- sklearn/impute/_knn.py | 4 +- sklearn/impute/tests/test_common.py | 29 +++++++++ sklearn/metrics/pairwise.py | 23 ++++--- sklearn/preprocessing/tests/test_common.py | 30 ++++++++++ sklearn/utils/tests/test_validation.py | 31 ++++++++++ sklearn/utils/validation.py | 70 +++++++++++++++------- 9 files changed, 186 insertions(+), 34 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 788f9fe6837b4..aedd5fe804722 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -316,6 +316,10 @@ Changelog ``max_value`` and ``min_value``. Array-like inputs allow a different max and min to be specified for each feature. :pr:`16403` by :user:`Narendra Mukherjee `. +- |Enhancement| :class:`impute.SimpleImputer`, :class:`impute.KNNImputer`, and + :class:`impute.SimpleImputer` accepts pandas' nullable integer dtype with + missing values. :pr:`16508` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -485,6 +489,13 @@ Changelog can now contain `None`, where `drop_idx_[i] = None` means that no category is dropped for index `i`. :pr:`16585` by :user:`Chiara Marmo `. +- |Enhancement| :class:`preprocessing.MaxAbsScaler`, + :class:`preprocessing.MinMaxScaler`, :class:`preprocessing.StandardScaler`, + :class:`preprocessing.PowerTransformer`, + :class:`preprocessing.QuantileTransformer`, + :class:`preprocessing.RobustScaler` now supports pandas' nullable integer + dtype with missing values. :pr:`16508` by `Thomas Fan`_. + - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. @@ -566,6 +577,15 @@ Changelog matrix from a pandas DataFrame that contains only `SparseArray` columns. :pr:`16728` by `Thomas Fan`_. +- |Enhancement| :func:`utils.validation.check_array` supports pandas' + nullable integer dtype with missing values when `force_all_finite` is set to + `False` or `'allow-nan'` in which case the data is converted to floating + point values where `pd.NA` values are replaced by `np.nan`. 
As a consequence, + all :mod:`sklearn.preprocessing` transformers that accept numeric inputs with + missing values represented as `np.nan` now also accepts being directly fed + pandas dataframes with `pd.Int* or `pd.Uint*` typed columns that use `pd.NA` + as a missing value marker. :pr:`16508` by `Thomas Fan`_. + - |API| Passing classes to :func:`utils.estimator_checks.check_estimator` and :func:`utils.estimator_checks.parametrize_with_checks` is now deprecated, and support for classes will be removed in 0.24. Pass instances instead. diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 5f1069708a20e..517de982d8478 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -128,7 +128,9 @@ class SimpleImputer(_BaseImputer): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. strategy : string, default='mean' The imputation strategy. @@ -476,8 +478,9 @@ class MissingIndicator(TransformerMixin, BaseEstimator): ---------- missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be indicated (True in the output array), the - other values will be marked as False. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. features : str, default=None Whether the imputer mask should represent all or a subset of diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 17a3d05507205..8f80c9723eac3 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -54,7 +54,9 @@ class IterativeImputer(_BaseImputer): missing_values : int, np.nan, default=np.nan The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. sample_posterior : boolean, default=False Whether to sample from the (Gaussian) predictive posterior of the diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 44fccf024247e..80a6423bdef79 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -32,7 +32,9 @@ class KNNImputer(_BaseImputer): ---------- missing_values : number, string, np.nan or None, default=`np.nan` The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. + `missing_values` will be imputed. For pandas' dataframes with + nullable integer dtypes with missing values, `missing_values` + should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`. n_neighbors : int, default=5 Number of neighboring samples to use for imputation. 
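A hedged example of the behaviour these docstrings describe (not part of the patch, requires pandas >= 1.0): a DataFrame with pandas' nullable integer dtype is accepted directly, and its `pd.NA` entries are treated as `np.nan` by the imputer.

import pandas as pd
from sklearn.impute import SimpleImputer

X = pd.DataFrame({"a": pd.array([1, pd.NA, 3], dtype="Int64"),
                  "b": pd.array([4, 5, pd.NA], dtype="Int64")})
# missing_values stays at its default of np.nan; pd.NA is converted to np.nan
# when the DataFrame is validated.
print(SimpleImputer(strategy="mean").fit_transform(X))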
diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index a8d2fd9d6b2f7..220a335c15285 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -84,3 +84,32 @@ def test_imputers_add_indicator_sparse(imputer, marker): imputer.set_params(add_indicator=False) X_trans_no_indicator = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", IMPUTERS) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, + missing_values=marker) + + X = np.array([ + [marker, 1, 5, marker, 1], + [2, marker, 1, marker, 2], + [6, 3, marker, marker, 3], + [1, 2, 9, marker, 4] + ]) + # fit on numpy array + X_trans_expected = imputer.fit_transform(X) + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"]) + + # fit on pandas dataframe with IntegerArrays + X_trans = imputer.fit_transform(X_df) + + assert_allclose(X_trans_expected, X_trans) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 20350345f54da..2e1332d18a20c 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -100,17 +100,20 @@ def check_pairwise_arrays(X, Y, *, precomputed=False, dtype=None, raise an error. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + copy : bool Whether a forced copy will be triggered. If copy=False, a copy might be triggered by a conversion. @@ -1691,15 +1694,19 @@ def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None, for more details. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.22 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` **kwds : optional keyword parameters Any further parameters are passed directly to the distance function. 
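For context, a hedged sketch (not part of the patch) of the kind of pairwise computation that the `'allow-nan'` option above makes possible: distances are computed on the coordinates present in both samples.

import numpy as np
from sklearn.metrics.pairwise import nan_euclidean_distances

X = np.array([[0.0, np.nan, 2.0],
              [1.0, 1.0, np.nan]])
# Missing coordinates are ignored and the distance is rescaled to account for
# the number of coordinates actually compared.
print(nan_euclidean_distances(X))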
diff --git a/sklearn/preprocessing/tests/test_common.py b/sklearn/preprocessing/tests/test_common.py index 7938256d482b7..802329fc5ce32 100644 --- a/sklearn/preprocessing/tests/test_common.py +++ b/sklearn/preprocessing/tests/test_common.py @@ -126,3 +126,33 @@ def test_missing_value_handling(est, func, support_sparse, strictly_positive): Xt_inv_sp = est_sparse.inverse_transform(Xt_sp) assert len(records) == 0 assert_allclose(Xt_inv_sp.A, Xt_inv_dense) + + +@pytest.mark.parametrize( + "est, func", + [(MaxAbsScaler(), maxabs_scale), + (MinMaxScaler(), minmax_scale), + (StandardScaler(), scale), + (StandardScaler(with_mean=False), scale), + (PowerTransformer('yeo-johnson'), power_transform), + (PowerTransformer('box-cox'), power_transform,), + (QuantileTransformer(n_quantiles=3), quantile_transform), + (RobustScaler(), robust_scale), + (RobustScaler(with_centering=False), robust_scale)] +) +def test_missing_value_pandas_na_support(est, func): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + + X = np.array([[1, 2, 3, np.nan, np.nan, 4, 5, 1], + [np.nan, np.nan, 8, 4, 6, np.nan, np.nan, 8], + [1, 2, 3, 4, 5, 6, 7, 8]]).T + + # Creates dataframe with IntegerArrays with pd.NA + X_df = pd.DataFrame(X, dtype="Int16", columns=['a', 'b', 'c']) + X_df['c'] = X_df['c'].astype('int') + + X_trans = est.fit_transform(X) + X_df_trans = est.fit_transform(X_df) + + assert_allclose(X_trans, X_df_trans) diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 418f037936c64..bcfd8fcd8d50e 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -349,6 +349,37 @@ def test_check_array(): check_array(X, dtype="numeric") +@pytest.mark.parametrize("pd_dtype", ["Int8", "Int16", "UInt8", "UInt16"]) +@pytest.mark.parametrize("dtype, expected_dtype", [ + ([np.float32, np.float64], np.float32), + (np.float64, np.float64), + ("numeric", np.float64), +]) +def test_check_array_pandas_na_support(pd_dtype, dtype, expected_dtype): + # Test pandas IntegerArray with pd.NA + pd = pytest.importorskip('pandas', minversion="1.0") + + X_np = np.array([[1, 2, 3, np.nan, np.nan], + [np.nan, np.nan, 8, 4, 6], + [1, 2, 3, 4, 5]]).T + + # Creates dataframe with IntegerArrays with pd.NA + X = pd.DataFrame(X_np, dtype=pd_dtype, columns=['a', 'b', 'c']) + # column c has no nans + X['c'] = X['c'].astype('float') + X_checked = check_array(X, force_all_finite='allow-nan', dtype=dtype) + assert_allclose(X_checked, X_np) + assert X_checked.dtype == expected_dtype + + X_checked = check_array(X, force_all_finite=False, dtype=dtype) + assert_allclose(X_checked, X_np) + assert X_checked.dtype == expected_dtype + + msg = "Input contains NaN, infinity" + with pytest.raises(ValueError, match=msg): + check_array(X, force_all_finite=True) + + def test_check_array_pandas_dtype_object_conversion(): # test that data-frame like objects with dtype object # get converted diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1fde1f0d69fb1..7a6ef1e05fdde 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -135,17 +135,20 @@ def as_float_array(X, *, copy=True, force_all_finite=True): returned if X's dtype is not a floating point type. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. The possibilities - are: + Whether to raise an error on np.inf, np.nan, pd.NA in X. 
The + possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + Returns ------- XT : {array, sparse matrix} @@ -317,17 +320,20 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. The possibilities - are: + Whether to raise an error on np.inf, np.nan, pd.NA in X. The + possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + Returns ------- spmatrix_converted : scipy sparse matrix. @@ -438,19 +444,20 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in array. The + Whether to raise an error on np.inf, np.nan, pd.NA in array. The possibilities are: - True: Force all values of array to be finite. - - False: accept both np.inf and np.nan in array. - - 'allow-nan': accept only np.nan values in array. Values cannot - be infinite. - - For object dtyped data, only np.nan is checked and not np.inf. + - False: accepts np.inf, np.nan, pd.NA in array. + - 'allow-nan': accepts only np.nan and pd.NA values in array. Values + cannot be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + ensure_2d : boolean (default=True) Whether to raise a value error if array is not 2D. @@ -491,6 +498,7 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # check if the object contains several dtypes (typically a pandas # DataFrame), and store them. If not, store None. dtypes_orig = None + has_pd_integer_array = False if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'): # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be perserved (later). 
@@ -508,6 +516,19 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, for i, dtype_iter in enumerate(dtypes_orig): if dtype_iter.kind == 'b': dtypes_orig[i] = np.dtype(np.object) + elif dtype_iter.name.startswith(("Int", "UInt")): + # name looks like an Integer Extension Array, now check for + # the dtype + with suppress(ImportError): + from pandas import (Int8Dtype, Int16Dtype, + Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype) + if isinstance(dtype_iter, (Int8Dtype, Int16Dtype, + Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype)): + has_pd_integer_array = True if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig): dtype_orig = np.result_type(*dtypes_orig) @@ -528,6 +549,10 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True, # list of accepted types. dtype = dtype[0] + if has_pd_integer_array: + # If there are any pandas integer extension arrays, + array = array.astype(dtype) + if force_all_finite not in (True, False, 'allow-nan'): raise ValueError('force_all_finite should be a bool or "allow-nan"' '. Got {!r} instead'.format(force_all_finite)) @@ -712,18 +737,21 @@ def check_X_y(X, y, accept_sparse=False, *, accept_large_sparse=True, be triggered by a conversion. force_all_finite : boolean or 'allow-nan', (default=True) - Whether to raise an error on np.inf and np.nan in X. This parameter - does not influence whether y can have np.inf or np.nan values. + Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter + does not influence whether y can have np.inf, np.nan, pd.NA values. The possibilities are: - True: Force all values of X to be finite. - - False: accept both np.inf and np.nan in X. - - 'allow-nan': accept only np.nan values in X. Values cannot be - infinite. + - False: accepts np.inf, np.nan, pd.NA in X. + - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot + be infinite. .. versionadded:: 0.20 ``force_all_finite`` accepts the string ``'allow-nan'``. + .. versionchanged:: 0.23 + Accepts `pd.NA` and converts it into `np.nan` + ensure_2d : boolean (default=True) Whether to raise a value error if X is not 2D. From 1bd740436ddeadd00567cd2ab7ec15e20c2f5a57 Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 28 Apr 2020 17:16:20 -0400 Subject: [PATCH 090/125] DOC Fixes formating in whats new (#17076) --- doc/whats_new/v0.23.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index aedd5fe804722..0e149ed03a9fa 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -215,8 +215,9 @@ Changelog raise `invalid value encountered in multiply` during `fit`. :pr:`16718` by :user:`Gui Miotto `. -- |Feature| Added `n_components_` attribute to :class:'decomposition.SparsePCA' - and :class:'MiniBatchSparsePCA'. :pr:'16981' by :user:'Mateusz Górski ' +- |Feature| Added `n_components_` attribute to :class:`decomposition.SparsePCA` + and :class:`decomposition.MiniBatchSparsePCA`. :pr:`16981` by + :user:`Mateusz Górski `. :mod:`sklearn.ensemble` ....................... 
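A hedged illustration of the conversion implemented in ``check_array`` in the
patch above (toy data, assuming pandas >= 1.0): a nullable integer column is
cast to a floating point array in which ``pd.NA`` becomes ``np.nan``::

    import numpy as np
    import pandas as pd
    from sklearn.utils import check_array

    X = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int16"),
                      "b": pd.array([4, None, 6], dtype="Int16")})

    X_checked = check_array(X, dtype="numeric", force_all_finite="allow-nan")
    print(X_checked.dtype)      # float64
    print(np.isnan(X_checked))  # True exactly where pd.NA was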
From 2dd12af9d687f4349a37ee0df90e61c9f992b092 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 28 Apr 2020 23:26:26 +0200 Subject: [PATCH 091/125] DOC Improve claim prediction example (#16648) Co-Authored-By: Christian Lorentzen Co-Authored-By: Nicolas Hug --- ...plot_poisson_regression_non_normal_loss.py | 537 +++++++++++------- 1 file changed, 325 insertions(+), 212 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4b0386edfcdf6..4fc3bea7bda51 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,87 +3,87 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of log-linear Poisson regression -on the `French Motor Third-Party Liability Claims dataset -`_ from [1]_ and compares -it with models learned with least squared error. In this dataset, each sample -corresponds to an insurance policy, i.e. a contract within an insurance -company and an individual (policiholder). Available features include driver -age, vehicle age, vehicle power, etc. - -A few definitions: a *claim* is the request made by a policyholder to the -insurer to compensate for a loss covered by the insurance. The *exposure* is -the duration of the insurance coverage of a given policy, in years. - -Our goal is to predict the expected number of insurance claims (or frequency) -following car accidents for a policyholder given the historical data over a -population of policyholders. +This example illustrates the use of log-linear Poisson regression on the +`French Motor Third-Party Liability Claims dataset +`_ from [1]_ and compares it with a linear +model fitted with the usual least squared error and a non-linear GBRT model +fitted with the Poisson loss (and a log-link). + +A few definitions: + +- A **policy** is a contract between an insurance company and an individual: + the **policyholder**, that is, the vehicle driver in this case. + +- A **claim** is the request made by a policyholder to the insurer to + compensate for a loss covered by the insurance. + +- The **exposure** is the duration of the insurance coverage of a given policy, + in years. + +- The claim **frequency** is the number of claims divided by the exposure, + typically measured in number of claims per year. + +In this dataset, each sample corresponds to an insurance policy. Available +features include driver age, vehicle age, vehicle power, etc. + +Our goal is to predict the expected frequency of claims following car accidents +for a new policyholder given the historical data over a population of +policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor - Third-Party Liability Claims (November 8, 2018). - `doi:10.2139/ssrn.3164764 `_ + Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 + `_ """ print(__doc__) - # Authors: Christian Lorentzen # Roman Yurchak +# Olivier Grisel # License: BSD 3 clause -import warnings - import numpy as np import matplotlib.pyplot as plt import pandas as pd -from sklearn.datasets import fetch_openml -from sklearn.dummy import DummyRegressor -from sklearn.compose import ColumnTransformer -from sklearn.linear_model import Ridge, PoissonRegressor -from sklearn.model_selection import train_test_split -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import FunctionTransformer, OneHotEncoder -from sklearn.preprocessing import OrdinalEncoder -from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_even_slices -from sklearn.metrics import auc - -from sklearn.metrics import mean_squared_error, mean_absolute_error -from sklearn.metrics import mean_poisson_deviance - -def load_mtpl2(n_samples=100000): - """Fetch the French Motor Third-Party Liability Claims dataset. - - Parameters - ---------- - n_samples: int or None, default=100000 - Number of samples to select (for faster run time). If None, the full - dataset with 678013 samples is returned. - """ +############################################################################## +# The French Motor Third-Party Liability Claims dataset +# ----------------------------------------------------- +# +# Let's load the motor claim dataset from OpenML: +# https://www.openml.org/d/41214 - # freMTPL2freq dataset from https://www.openml.org/d/41214 - df = fetch_openml(data_id=41214, as_frame=True)['data'] +from sklearn.datasets import fetch_openml - # unquote string fields - for column_name in df.columns[df.dtypes.values == np.object]: - df[column_name] = df[column_name].str.strip("'") - if n_samples is not None: - return df.iloc[:n_samples] - return df +df = fetch_openml(data_id=41214, as_frame=True).frame +df ############################################################################## -# Let's load the motor claim dataset. We ignore the severity data for this -# study for the sake of simplicitly. +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval (``Exposure``, +# in units of years). # -# We also subsample the data for the sake of computational cost and running -# time. Using the full dataset would lead to similar conclusions. +# Here we want to model the frequency ``y = ClaimNb / Exposure`` conditionally +# on ``X`` via a (scaled) Poisson distribution, and use ``Exposure`` as +# ``sample_weight``. 
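The reason ``Exposure`` can be used as ``sample_weight`` here is that, for a
log-link Poisson model, the exposure-weighted Poisson deviance of the
frequencies differs from the Poisson negative log-likelihood of the raw claim
counts (with ``log(Exposure)`` as an offset) only by a constant, so both
objectives are minimized by the same model. A small numerical sanity check of
this claim on made-up numbers (an editorial sketch, not part of the example)::

    import numpy as np

    rng = np.random.RandomState(0)
    exposure = rng.uniform(0.1, 1.0, size=5)
    counts = rng.poisson(0.3 * exposure)
    freq = counts / exposure

    def count_nll(pred_freq):
        # Poisson negative log-likelihood of the counts (dropping log(y!)),
        # with expected count = exposure * predicted frequency.
        mu = exposure * pred_freq
        return np.sum(mu - counts * np.log(mu))

    def weighted_half_deviance(pred_freq):
        # Exposure-weighted half Poisson deviance of the frequencies,
        # dropping the terms that do not depend on the prediction.
        return np.sum(exposure * (pred_freq - freq * np.log(pred_freq)))

    p1, p2 = np.full(5, 0.2), np.full(5, 0.4)
    # The difference between the two objectives is the same constant for any
    # prediction, hence the same minimizer:
    assert np.isclose(count_nll(p1) - weighted_half_deviance(p1),
                      count_nll(p2) - weighted_half_deviance(p2))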
+ +df["Frequency"] = df["ClaimNb"] / df["Exposure"] + +print("Average Frequency = {}" + .format(np.average(df["Frequency"], weights=df["Exposure"]))) -df = load_mtpl2(n_samples=300000) +print("Fraction of exposure with zero claims = {0:.1%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) -# Correct for unreasonable observations (that might be data error) -df["Exposure"] = df["Exposure"].clip(upper=1) +fig, (ax0, ax1, ax2) = plt.subplots(ncols=3, figsize=(16, 4)) +ax0.set_title("Number of claims") +_ = df["ClaimNb"].hist(bins=30, log=True, ax=ax0) +ax1.set_title("Exposure in years") +_ = df["Exposure"].hist(bins=30, log=True, ax=ax1) +ax2.set_title("Frequency (number of claims per year)") +_ = df["Frequency"].hist(bins=30, log=True, ax=ax2) ############################################################################## # The remaining columns can be used to predict the frequency of claim events. @@ -93,6 +93,12 @@ def load_mtpl2(n_samples=100000): # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.compose import ColumnTransformer + + log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() @@ -112,123 +118,144 @@ def load_mtpl2(n_samples=100000): remainder="drop", ) -############################################################################## -# The number of claims (``ClaimNb``) is a positive integer that can be modeled -# as a Poisson distribution. It is then assumed to be the number of discrete -# events occurring with a constant rate in a given time interval -# (``Exposure``, in units of years). Here we model the frequency -# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as ``sample_weight``. - -df["Frequency"] = df["ClaimNb"] / df["Exposure"] - -print( - pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() -) - -print("Average Frequency = {}" - .format(np.average(df["Frequency"], weights=df["Exposure"]))) - -print("Percentage of zero claims = {0:%}" - .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / - df["Exposure"].sum())) ############################################################################## -# It is worth noting that 92 % of policyholders have zero claims, and if we -# were to convert this problem into a binary classification task, it would be -# significantly imbalanced. +# A constant prediction baseline +# ------------------------------ +# +# It is worth noting that more than 93% of policyholders have zero claims. If +# we were to convert this problem into a binary classification task, it would +# be significantly imbalanced, and even a simplistic model that would only +# predict mean can achieve an accuracy of 93%. # # To evaluate the pertinence of the used metrics, we will consider as a # baseline a "dummy" estimator that constantly predicts the mean frequency of # the training sample. 
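As a toy reminder (made-up numbers, not from the dataset) of what this baseline
does, ``DummyRegressor(strategy='mean')`` memorizes the, possibly weighted,
mean of the training target and predicts that constant for every sample::

    import numpy as np
    from sklearn.dummy import DummyRegressor

    y = np.array([0.0, 0.0, 1.0, 3.0])
    X = np.zeros((4, 1))                      # features are ignored
    dummy = DummyRegressor(strategy="mean").fit(X, y)
    print(dummy.predict(np.zeros((1, 1))))    # [1.]

    w = np.array([3.0, 1.0, 1.0, 1.0])
    dummy_w = DummyRegressor(strategy="mean").fit(X, y, sample_weight=w)
    print(dummy_w.predict(np.zeros((1, 1))))  # [0.666...], the weighted mean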
-df_train, df_test = train_test_split(df, random_state=0) +from sklearn.dummy import DummyRegressor +from sklearn.pipeline import Pipeline +from sklearn.model_selection import train_test_split -dummy = make_pipeline( - linear_model_preprocessor, - DummyRegressor(strategy='mean') -) -dummy.fit(df_train, df_train["Frequency"], - dummyregressor__sample_weight=df_train["Exposure"]) +df_train, df_test = train_test_split(df, test_size=0.33, random_state=0) + +dummy = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", DummyRegressor(strategy='mean')), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) + + +############################################################################## +# Let's compute the performance of this constant prediction baseline with 3 +# different regression metrics: + +from sklearn.metrics import mean_squared_error +from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def score_estimator(estimator, df_test): """Score an estimator on the test set.""" - y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error(df_test["Frequency"], y_pred, - df_test["Exposure"])) + sample_weight=df_test["Exposure"])) - # ignore non-positive predictions, as they are invalid for - # the Poisson deviance + # Ignore non-positive predictions, as they are invalid for + # the Poisson deviance. mask = y_pred > 0 if (~mask).any(): - warnings.warn("Estimator yields non-positive predictions for {} " - "samples out of {}. These will be ignored while " - "computing the Poisson deviance" - .format((~mask).sum(), mask.shape[0])) + n_masked, n_samples = (~mask).sum(), mask.shape[0] + print(f"WARNING: Estimator yields invalid, non-positive predictions " + f" for {n_masked} samples out of {n_samples}. These predictions " + f"are ignored when computing the Poisson deviance.") print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], y_pred[mask], - df_test["Exposure"][mask])) + sample_weight=df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## -# We start by modeling the target variable with the least squares linear -# regression model, +# (Generalized) Linear models +# --------------------------- +# +# We start by modeling the target variable with the (l2 penalized) least +# squares linear regression model, more comonly known as Ridge regression. We +# use a low penalization `alpha`, as we expect such a linear model to under-fit +# on such a large dataset. -ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) -ridge.fit(df_train, df_train["Frequency"], - ridge__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import Ridge + + +ridge_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", Ridge(alpha=1e-6)), +]).fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) ############################################################################## # The Poisson deviance cannot be computed on non-positive values predicted by -# the model. For models that do return a few non-positive predictions -# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# the model. 
For models that do return a few non-positive predictions (e.g. +# :class:`~sklearn.linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# approach could be to use :class:`compose.TransformedTargetRegressor` +# approach could be to use :class:`~sklearn.compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") -score_estimator(ridge, df_test) +score_estimator(ridge_glm, df_test) ############################################################################## # Next we fit the Poisson regressor on the target variable. We set the -# regularization strength ``alpha`` to 1 over number of samples in oder to -# mimic the Ridge regressor whose L2 penalty term scales differently with the -# number of samples. +# regularization strength ``alpha`` to approximately 1e-6 over number of +# samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty +# term scales differently with the number of samples. -poisson = make_pipeline( - linear_model_preprocessor, - PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) -) -poisson.fit(df_train, df_train["Frequency"], - poissonregressor__sample_weight=df_train["Exposure"]) +from sklearn.linear_model import PoissonRegressor + +n_samples = df_train.shape[0] + +poisson_glm = Pipeline([ + ("preprocessor", linear_model_preprocessor), + ("regressor", PoissonRegressor(alpha=1e-12, max_iter=300)) +]) +poisson_glm.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) print("PoissonRegressor evaluation:") -score_estimator(poisson, df_test) +score_estimator(poisson_glm, df_test) ############################################################################## -# Finally, we will consider a non-linear model, namely a random forest. Random -# forests do not require the categorical data to be one-hot encoded: instead, -# we can encode each category label with an arbitrary integer using -# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will -# treat the categorical features as ordered features, which might not be always -# a desired behavior. However this effect is limited for deep enough trees -# which are able to recover the categorical nature of the features. The main -# advantage of the :class:`preprocessing.OrdinalEncoder` over the -# :class:`preprocessing.OneHotEncoder` is that it will make training faster. - -rf_preprocessor = ColumnTransformer( +# Finally, we will consider a non-linear model, namely Gradient Boosting +# Regression Trees. Tree-based models do not require the categorical data to be +# one-hot encoded: instead, we can encode each category label with an arbitrary +# integer using :class:`~sklearn.preprocessing.OrdinalEncoder`. With this +# encoding, the trees will treat the categorical features as ordered features, +# which might not be always a desired behavior. However this effect is limited +# for deep enough trees which are able to recover the categorical nature of the +# features. The main advantage of the +# :class:`~sklearn.preprocessing.OrdinalEncoder` over the +# :class:`~sklearn.preprocessing.OneHotEncoder` is that it will make training +# faster. +# +# Gradient Boosting also gives the possibility to fit the trees with a Poisson +# loss (with an implicit log-link function) instead of the default +# least-squares loss. Here we only fit trees with the Poisson loss to keep this +# example concise. 
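A tiny sketch of the ordinal encoding mentioned above, on made-up labels::

    from sklearn.preprocessing import OrdinalEncoder

    enc = OrdinalEncoder()
    print(enc.fit_transform([["Diesel"], ["Regular"], ["Diesel"]]))
    # [[0.]
    #  [1.]
    #  [0.]]  -- the integer codes carry no meaningful order for the trees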
+ +from sklearn.experimental import enable_hist_gradient_boosting # noqa +from sklearn.ensemble import HistGradientBoostingRegressor +from sklearn.preprocessing import OrdinalEncoder + + +tree_preprocessor = ColumnTransformer( [ ("categorical", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), @@ -237,22 +264,22 @@ def score_estimator(estimator, df_test): ], remainder="drop", ) -rf = make_pipeline( - rf_preprocessor, - RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) -) -rf.fit(df_train, df_train["Frequency"].values, - randomforestregressor__sample_weight=df_train["Exposure"].values) - +poisson_gbrt = Pipeline([ + ("preprocessor", tree_preprocessor), + ("regressor", HistGradientBoostingRegressor(loss="poisson", + max_leaf_nodes=128)), +]) +poisson_gbrt.fit(df_train, df_train["Frequency"], + regressor__sample_weight=df_train["Exposure"]) -print("RandomForestRegressor evaluation:") -score_estimator(rf, df_test) +print("Poisson Gradient Boosted Trees evaluation:") +score_estimator(poisson_gbrt, df_test) ############################################################################## -# Like the Ridge regression above, the random forest model minimizes the -# conditional squared error, too. However, because of a higher predictive -# power, it also results in a smaller Poisson deviance than the Poisson +# Like the Ridge regression above, the gradient boosted trees model minimizes +# the conditional squared error. However, because of a higher predictive power, +# it also results in a smaller Poisson deviance than the linear Poisson # regression model. # # Evaluating models with a single train / test split is prone to random @@ -263,7 +290,7 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: -fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) +fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 for row_idx, label, df in zip(range(2), @@ -278,7 +305,7 @@ def score_estimator(estimator, df_test): axes[row_idx, 0].set_ylim([1e1, 5e5]) axes[row_idx, 0].set_ylabel(label + " samples") - for idx, model in enumerate([ridge, poisson, rf]): + for idx, model in enumerate([ridge_glm, poisson_glm, poisson_gbrt]): y_pred = model.predict(df) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), @@ -292,21 +319,42 @@ def score_estimator(estimator, df_test): ############################################################################## # The experimental data presents a long tail distribution for ``y``. In all -# models we predict a mean expected value, so we will have necessarily fewer -# extreme values. Additionally, the normal distribution used in ``Ridge`` and -# ``RandomForestRegressor`` has a constant variance, while for the Poisson -# distribution used in ``PoissonRegressor``, the variance is proportional to -# the mean predicted value. +# models, we predict the expected frequency of a random variable, so we will +# have necessarily fewer extreme values than for the observed realizations of +# that random variable. This explains that the mode of the histograms of model +# predictions doesn't necessarily correspond to the smallest value. +# Additionally, the normal distribution used in ``Ridge`` has a constant +# variance, while for the Poisson distribution used in ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor``, the variance is proportional to the +# predicted expected value. 
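To make the variance claim above concrete, a quick editorial check (simulated
draws, not part of the example) that the variance of a Poisson distributed
variable equals, and therefore grows with, its mean, unlike the
constant-variance normal assumption::

    import numpy as np

    rng = np.random.RandomState(0)
    for lam in [0.1, 1.0, 10.0]:
        draws = rng.poisson(lam, size=100000)
        print(lam, draws.mean().round(3), draws.var().round(3))
    # mean and variance are both close to lam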
+# +# Thus, among the considered estimators, ``PoissonRegressor`` and +# ``HistGradientBoostingRegressor`` are a-priori better suited for modeling the +# long tail distribution of the non-negative data as compared to the ``Ridge`` +# model which makes a wrong assumption on the distribution of the target +# variable. +# +# The ``HistGradientBoostingRegressor`` estimator has the most flexibility and +# is able to predict higher expected values. # -# Thus, among the considered estimators, ``PoissonRegressor`` is better suited -# for modeling the long tail distribution of the data as compared to the -# ``Ridge`` and ``RandomForestRegressor`` estimators. +# Note that we could have used the least squares loss for the +# ``HistGradientBoostingRegressor`` model. This would wrongly assume a normal +# distribution the response variable as for the `Ridge` model, and possibly +# also lead to slightly negative predictions. However the gradient boosted +# trees would still perform relatively well and in particular better than +# ``PoissonRegressor`` thanks to the flexibility of the trees combined with the +# large number of training samples. +# +# Evaluation of the calibration of predictions +# -------------------------------------------- # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to ``y_pred`` returned # by each model. Then for each bin, we compare the mean predicted ``y_pred``, # with the mean observed target: +from sklearn.utils import gen_even_slices + def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): @@ -352,104 +400,169 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) +print(f"Actual number of claims: {df_test['ClaimNb'].sum()}") +fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax.ravel(), [ridge_glm, poisson_glm, poisson_gbrt, + dummy]): y_pred = model.predict(df_test) - + y_true = df_test["Frequency"].values + exposure = df_test["Exposure"].values q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( - df_test["Frequency"].values, - y_pred, - sample_weight=df_test["Exposure"].values, - n_bins=10) + y_true, y_pred, sample_weight=exposure, n_bins=10) + + # Name of the model after the estimator used in the last step of the + # pipeline. + print(f"Predicted number of claims by {model[-1]}: " + f"{np.sum(y_pred * exposure):.1f}") - axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") - axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") + axi.plot(q, y_pred_seg, marker='x', linestyle="--", label="predictions") + axi.plot(q, y_true_seg, marker='o', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.6) + axi.set_ylim(0, 0.5) axi.set( - title=model[-1].__class__.__name__, + title=model[-1], xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) axi.legend() plt.tight_layout() -############################################################################## -# The ``Ridge`` regression model can predict very low expected frequencies -# that do not match the data. It can therefore severly under-estimate the risk -# for some policyholders. 
-# -# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency -# between predicted and observed targets, especially for low predicted target -# values. -# -# However, for some business applications, we are not necessarily interested -# in the ability of the model to predict the expected frequency value, but -# instead to predict which policyholder groups are the riskiest and which are -# the safest. In this case, the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. -# -# To compare the 3 models within this perspective, one can plot the fraction of -# the number of claims vs the fraction of exposure for test samples ordered by -# the model predictions, from safest to riskiest according to each model: - - -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred) # from safest to riskiest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) +############################################################################### +# The dummy regression model predicts a constant frequency. This model does not +# attribute the same tied rank to all samples but is none-the-less globally +# well calibrated (to estimate the mean frequency of the entire population). +# +# The ``Ridge`` regression model can predict very low expected frequencies that +# do not match the data. It can therefore severly under-estimate the risk for +# some policyholders. +# +# ``PoissonRegressor`` and ``HistGradientBoostingRegressor`` show better +# consistency between predicted and observed targets, especially for low +# predicted target values. +# +# The sum of all predictions also confirms the calibration issue of the +# ``Ridge`` model: it under-estimates by more than 3% the total number of +# claims in the test set while the other three models can approximately recover +# the total number of claims of the test portfolio. +# +# Evaluation of the ranking power +# ------------------------------- +# +# For some business applications, we are interested in the ability of the model +# to rank the riskiest from the safest policyholders, irrespective of the +# absolute value of the prediction. In this case, the model evaluation would +# cast the problem as a ranking problem rather than a regression problem. +# +# To compare the 3 models from this perspective, one can plot the cumulative +# proportion of claims vs the cumulative proportion of exposure for the test +# samples order by the model predictions, from safest to riskiest according to +# each model. 
+# +# This plot is called a Lorenz curve and can be summarized by the Gini index: + +from sklearn.metrics import auc + + +def lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) + + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) + ranked_exposure = exposure[ranking] + ranked_frequencies = y_true[ranking] + ranked_exposure = exposure[ranking] + cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure) cumulated_claims /= cumulated_claims[-1] + cumulated_exposure = np.cumsum(ranked_exposure) + cumulated_exposure /= cumulated_exposure[-1] return cumulated_exposure, cumulated_claims fig, ax = plt.subplots(figsize=(8, 8)) -for model in [ridge, poisson, rf]: +for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]: y_pred = model.predict(df_test) - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label = "{} (area under curve: {:.3f})".format( - model[-1].__class__.__name__, area) + cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, + df_test["Exposure"]) + gini = 1 - 2 * auc(cum_exposure, cum_claims) + label = "{} (Gini: {:.2f})".format(model[-1], gini) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) +cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], + df_test["Frequency"], + df_test["Exposure"]) +gini = 1 - 2 * auc(cum_exposure, cum_claims) +label = "Oracle (Gini: {:.2f})".format(gini) ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( - title="Cumulated number of claims by model", - xlabel='Fraction of exposure (from safest to riskiest)', - ylabel='Fraction of number of claims' + title="Lorenz curves by model", + xlabel='Cumulative proportion of exposure (from safest to riskiest)', + ylabel='Cumulative proportion of claims' ) ax.legend(loc="upper left") ############################################################################## -# This plot reveals that the random forest model is slightly better at ranking -# policyholders by risk profiles even if the absolute value of the predicted -# expected frequencies are less well calibrated than for the linear Poisson -# model. +# As expected, the dummy regressor is unable to correctly rank the samples and +# therefore performs the worst on this plot. +# +# The tree-based model is significantly better at ranking policyholders by risk +# while the two linear models perform similarly. # # All three models are significantly better than chance but also very far from # making perfect predictions. # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset or that are indeed random. +# captured in the columns of the dataset and can indeed be considered as purely +# random. +# +# The linear models assume no interactions between the input variables which +# likely causes under-fitting. 
Inserting a polynomial feature extractor +# (:func:`~sklearn.preprocessing.PolynomialFeatures`) indeed increases their +# discrimative power by 2 points of Gini index. In particular it improves the +# ability of the models to identify the top 5% riskiest profiles. +# +# Main takeaways +# -------------- +# +# - The performance of the models can be evaluted by their ability to yield +# well-calibrated predictions and a good ranking. +# +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# ranking power. +# +# - The calibration of the model can be assessed by plotting the mean observed +# value vs the mean predicted value on groups of test samples binned by +# predicted risk. +# +# - The least squares loss (along with the implicit use of the identity link +# function) of the Ridge regression model seems to cause this model to be +# badly calibrated. In particular, it tends to underestimate the risk and can +# even predict invalid negative frequencies. +# +# - Using the Poisson loss with a log-link can correct these problems and lead +# to a well-calibrated linear model. +# +# - Despite the improvement in calibration, the ranking power of both linear +# models are comparable and well below the ranking power of the Gradient +# Boosting Regression Trees. +# +# - The Poisson deviance computed as an evaluation metric reflects both the +# calibration and the ranking power of the model. It also makes a linear +# assumption on the ideal relationship between the expected value and the +# variance of the response variable. For the sake of conciseness we did not +# check whether this assumption holds. +# +# - Traditional regression metrics such as Mean Squared Error and Mean Absolute +# Error are hard to meaningfully interpret on count values with many zeros. plt.show() From 1c69a8a55cc18b461b1befbd68c99a5020140363 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 29 Apr 2020 02:26:51 +0200 Subject: [PATCH 092/125] DOC small typos and fixes for poisson example (#17078) --- ...plot_poisson_regression_non_normal_loss.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4fc3bea7bda51..3a24b55848013 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -277,10 +277,9 @@ def score_estimator(estimator, df_test): ############################################################################## -# Like the Ridge regression above, the gradient boosted trees model minimizes -# the conditional squared error. However, because of a higher predictive power, -# it also results in a smaller Poisson deviance than the linear Poisson -# regression model. +# Like the Poisson GLM above, the gradient boosted trees model minimizes +# the Poisson deviance. However, because of a higher predictive power, +# it reaches lower values of Poisson deviance. # # Evaluating models with a single train / test split is prone to random # fluctuations. If computing resources allow, it should be verified that @@ -339,7 +338,7 @@ def score_estimator(estimator, df_test): # # Note that we could have used the least squares loss for the # ``HistGradientBoostingRegressor`` model. 
This would wrongly assume a normal -# distribution the response variable as for the `Ridge` model, and possibly +# distributed response variable as does the `Ridge` model, and possibly # also lead to slightly negative predictions. However the gradient boosted # trees would still perform relatively well and in particular better than # ``PoissonRegressor`` thanks to the flexibility of the trees combined with the @@ -533,13 +532,9 @@ def lorenz_curve(y_true, y_pred, exposure): # Main takeaways # -------------- # -# - The performance of the models can be evaluted by their ability to yield +# - The performance of the models can be evaluated by their ability to yield # well-calibrated predictions and a good ranking. # -# - The Gini index reflects the ability of a model to rank predictions -# irrespective of their absolute values, and therefore only assess their -# ranking power. -# # - The calibration of the model can be assessed by plotting the mean observed # value vs the mean predicted value on groups of test samples binned by # predicted risk. @@ -552,6 +547,10 @@ def lorenz_curve(y_true, y_pred, exposure): # - Using the Poisson loss with a log-link can correct these problems and lead # to a well-calibrated linear model. # +# - The Gini index reflects the ability of a model to rank predictions +# irrespective of their absolute values, and therefore only assess their +# ranking power. +# # - Despite the improvement in calibration, the ranking power of both linear # models are comparable and well below the ranking power of the Gradient # Boosting Regression Trees. From ad6a9f977e374842578d3da70c873a451776891f Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Apr 2020 04:05:37 -0400 Subject: [PATCH 093/125] TST Skips derivative check on 32bit platforms (#17073) * TST Checks type for derivative check * TST Skips test for 32bit linux * REV Less diffs --- sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py index 7fc6ab9097873..c3f6ded7be39a 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py @@ -9,6 +9,7 @@ from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE +from sklearn.utils._testing import skip_if_32bit def get_derivatives_helper(loss): @@ -58,8 +59,7 @@ def get_hessians(y_true, raw_predictions): ]) @pytest.mark.skipif(sp_version == (1, 2, 0), reason='bug in scipy 1.2.0, see scipy issue #9608') -@pytest.mark.skipif(Y_DTYPE != np.float64, - reason='Newton internally uses float64 != Y_DTYPE') +@skip_if_32bit def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on 1D array # using Halley's method with the first and second order derivatives From 9f04837ea991b1a063c48e48175fd179be1885ad Mon Sep 17 00:00:00 2001 From: Chiara Marmo Date: Wed, 29 Apr 2020 14:30:44 +0200 Subject: [PATCH 094/125] DOC Update funders. 
(#17079) --- doc/about.rst | 42 +++++++++++++++++----------------- doc/images/anaconda-small.png | Bin 11313 -> 0 bytes doc/templates/index.html | 1 - 3 files changed, 21 insertions(+), 22 deletions(-) delete mode 100644 doc/images/anaconda-small.png diff --git a/doc/about.rst b/doc/about.rst index a6cdd54eb9201..814a4724d9579 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -271,14 +271,18 @@ July 2017.
-............ +Past Sponsors +............. .. raw:: html
-`Anaconda, Inc `_ funds Adrin Jalali since 2019. +`INRIA `_ actively supports this project. It has +provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler +(2012-2013) and Olivier Grisel (2013-2017) to work on this project +full-time. It also hosts coding sprints and other events. .. raw:: html @@ -286,67 +290,63 @@ July 2017.
-.. image:: images/anaconda.png +.. image:: images/inria-logo.jpg :width: 100pt :align: center - :target: https://sydney.edu.au/ + :target: https://www.inria.fr .. raw:: html
-Past Sponsors -............. +..................... .. raw:: html
-`INRIA `_ actively supports this project. It has -provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler -(2012-2013) and Olivier Grisel (2013-2017) to work on this project -full-time. It also hosts coding sprints and other events. +`Paris-Saclay Center for Data Science +`_ +funded one year for a developer to work on the project full-time +(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the +time of Joris van den Bossche (2017-2018). .. raw:: html
-
-.. image:: images/inria-logo.jpg +.. image:: images/cds-logo.png :width: 100pt :align: center - :target: https://www.inria.fr + :target: https://www.datascience-paris-saclay.fr/ .. raw:: html
-..................... +............ .. raw:: html
-`Paris-Saclay Center for Data Science -`_ -funded one year for a developer to work on the project full-time -(2014-2015), 50% of the time of Guillaume Lemaitre (2016-2017) and 50% of the -time of Joris van den Bossche (2017-2018). +`Anaconda, Inc `_ funded Adrin Jalali in 2019. .. raw:: html
+
-.. image:: images/cds-logo.png +.. image:: images/anaconda.png :width: 100pt :align: center - :target: https://www.datascience-paris-saclay.fr/ + :target: https://www.anaconda.com/ .. raw:: html diff --git a/doc/images/anaconda-small.png b/doc/images/anaconda-small.png deleted file mode 100644 index ccb8bb8b707deca78f49e2423dbd380b48ba4052..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11313 zcmV-1EY8!3P) zaB^>EX>4U6ba`-PAZ2)IW&i+q+O3;eawNHOMgOr1Edlee9E@h#ftIgx5y9fntS5Vt zs=AA0W@H2acfW%JH2c5)^O*nOPl>^rm`cqpXUm`1V)LDEs(t=EU!9Hj-}@(iyyw2Z zZhU;-@VpfG8lE5Xey{Jmo<2WN-s|1?{<vjI+6|Yx3--r83 z2*0eK`LlGtEa1nw{#-vh-z&oBay&Qsm{$I=PVeh3{l0#8mA;P1`S%k4`G)@N<%a$G z=gaz?v+6nfnO%?Buw40_)W;O>cU;I#Vz?j6{4M;S_`ck~jlb1)JBiw2s>2Y6>6-Jb z>~Ya8*WGdZew}VI#OSv#eD!|%a9^vT_~z#gO4MI>d@;mlg-T2&kqf-+|BS`m=Qcf$ zhgF`u99MeB%@oCNf6dPq{-RK;pxpP{~`m7b>^2#rUp~&f9t|B1ryla~7`a1J` zu|NMPuz?7syUdjdb~~P1jO0FZE8I8-n+%sm`_zjo>%9R>#IqBFae)Zf>_T$c+2Vb1 zj&-cWr!#UNq8~)SC6oMOls<-#2~PETHlHc(Uh9*8UItzWL<%_+LPG%&tQb?w)LSKm zx=A6$lu}M5)znhYA;+9@&Ls<{dI=?#RB|b$mR5QVHP%#fEw$EGd-E*-W68AKN~^84 z-g#(`oqKhz@4PVl2qTU(@+hN@Hu@xeW}Io}S!SJW_T^VtK!90!l~q?;eY;I6?YPs< zyX?B#?uT4E;lz_pKIPQYPX80NH>%&G_A_$-I%@8Xn!kb4b>%Z^yj;r1Eu7#aDP}~> zM@PhqB0xfW#mrV0qgUhNl`k0Bed+)*6)5oXGy^rpq7e4QykKKD%+ro3pA9wjN6Z7wm zq|_D04yCL?Hn#GtX&ELL2@qnnMwqrzwF4vE@H z)fGP=vEre}x=YxD^;$Dif`M_=QsQpj4F<_8!Bbgr_K12haLt~U^y)kIRjK3Di75r{ zLZy@vpb!f4>|yYfKH@5=fe<}2#d>-vXZjA*mXs(;8!9oVH7}*`JfSDcy0}$bW9ehr z1Ivi6DF ztW>(!oQOlq=gy{rnhPl9hCnH)%>BWxQ_`ws_jO!4iTN#%iwK12PBGx3^IEsyBiWq> zppKw)?W9>ij8p+2x7QXzgRsCF@dW%pJU~ih3ucCCMrTZs8nGp$U4W%}Y)X?K>}U&d z)BxF02~_jh9!ZKGn~gbyN_2??8RM(jel(ylmyQFN#Chxl#FBR_D@~{WC~8vRmWh9g zWh4^uh@%8TMD60ry#Jm}j%=0(s3#yyD15o}=ao?j^b{4(Fx=ih6~iM+h5e#^?_lkC zK6LM)J6wGHxWo5xn$Lm$xeL)7$92%e<=TQiM#&Wjsh$B#k!Fsh?S#4PunzggV(UJ2 z!C}LZs$>Y|&cNF$BxPe1-gd>j9M}pq$sp?;-3A8>6(G0>7e+pD_x}RB=C2HL$WT1W zv($))jlcy+GD!;r693>gHA%*+rh}<-Je8QnHx_3-80`ySzX-bLDWU-|DJUxCdJw0^+a{2i@FJoCZk_u9Y1IukG@ zhYbvezWS3*5khg~z-l7r)4*K`kzPd2+s#$F3KdNQO5}!0V0C;pBuFHE8W2V4wkcdh zMwERD@AL^!LigyIHd_U}JJ+(Bz!B_a(L;9S&C(g@F1EHw6zYQX%X`ZXp-3vB1)c_B z1~OL_k)di9a)Nud9mRyXSc(^{G!Bkw+hd8qPO(V!W9K!98hj*=9UQU?{T&c^_}X5w zKog~oYO5>(h%m5O3_3v)r+Z`_*y_+w2aR~*1U9N%q$mgGoI%j5g+-gJ`c&ta_oDH#)89>TtxnY z@2en|?wj{5^Snm_Kkup`^p51mSJJ26E5UmluD&a_9G^Q=izqf=;ljs2A1(k`SASgl z^~&eM=Q7~E;EO2C2ukCr?KF%KL?HmkvBM?E98X+4w_pgAG!=;&1QbYROf;$xHs_|w z$JJc|ngo~zl@-g?rbQV6qf1QjC&5=*Xb2GrMCDL?h%yqJ-0|db^@T#i(dHMkQ5bRL z#fNf*1`!Q;PEwko;6BPhq)$1QnRg$zJw{q;v~LzcirqU#bVPLI&rVOaunIP1{yRz> z2vYhUI@G&SRLs=@5TD3M>cWHCOaNd@D5p&BZO8KYN+zF9KG#ID3?@qn&Wf#V__FFD zKSU88fjlV^$Se}M=!~P#`vUNKugy}Tc4qRdIkJSHu>dPk!wY4t>Of7|!8&K3>XLX= zTLx%rD7FT!$uq*5_)F|ZXoPPfBD{o~)W~J9w8zyTV#|SM6&mveZB4wB!Iz&zAj*?s z(_tSh`F~e60EMs>RrM?@^Ui-MPV-rv%Gq0=->dU`ZK^b%!qgbQ6sGw{WxjREZQshY z_DgMkFU_|wRhQ;lZGNfDKh^iAHh(MJ&r&phsm=eX+TNo4EMfD#hQF6Et3bMt05+$D zHiH1*HWXq-^F*gYY-u&HdnAKaSUwCmllYU}D7YCjlW%;nB5;ywmMq$9!*L^DHw%or zD9Q6*TU29M2F}D>_%s6K#o zLZ|w{U?)Tb57p)*KRn5ayurj`^BH#a=rh23b_6vM7e-8_Tt@<~i7Jf8OmZONYN;y{ z$}*DyHp^(r+>Hoc1>1w%d*}^|x~UEd#peUaP>UN?F>){z8YPUl!T3(@SLC2E9gJv$ zi&L5p2?@xx*bu+*3goAtP$}^N?ntd+65I7)|8y^8`FZ@=hl3#<)#w6v)h4+us-U(5 zYJ_E!Qz6X(duYem^*jO&{D~Uh2))c&zJQPU6@BE|QsEgaMot$(1~= zGBz@6WU`uEOvV5%%aJ4AwIN zLfLGm?jjDP;3`&t0I4$^9c%ASg2{F)AUcdTQzrNj>2L$tO2V^P9%@K!_kHZu{WkOjE{7sTnQvL`jD5kP!Xv^!#X(cgtYUKt0H_tsZ97yZuX zwfEAFOh-bfSGLqKm0xuj3gdQ|QXdwt12CeaCm&Q4k=`xwFBdGoDzpk=L;I4b;2NPV zPiQxZlF1|Pop_QEOMDY3=aR7R*Ha+lJ;iJYNK zXUWh<(~`;oyHJ&kh>xv~6pJm1#~lWf0&qheBjil-rsS)-PVgf}X7?Lw!<<+=D6A&! 
zs<2o{nx}ZLRr+!w!zLto!HJ-9l^B=&{m?XaI?NSgwv@e78_*pnJ>TXOq zM}ATxy$Dwkf5BsGyP2gDxbSp~D`UoYB3*c(8g_U2@xuY0Kssr6sRuBmG&mAXt;c)n zIt&9?2K;0ZN?6I$A~m*f<=jtNCMR`B1xihEv8IcW*3+xiGVp41-Z1S)4<0*!Sy;SbwTUcdJQF1kw`o zZSKONdMUw!<%R9F)kIYcirLFjW?};-Ua9YR95YU#K%TIkfMJjq>6rhar89xyjwjC8 zXSrbkTw4t41-NDLC%9)Qd&orBSQ!Gi!Hh}&!C(>IYC{o!C>;d>lVU)o;-&sIMNR3< zrqonl!u-B{s&+l=SZ|x73QuKkp7~z+DswPocb=VqMfERac?S;~iVcm$w`v*U=bcw? zxULFvs;lhf!UGp*Yl+XpIz%Nwri}_TINmTRkf!bewQsROLCui>h&LvxF6Ri1$)N*0 z4nTco6c=&da-9*uAu9@w5fukQBJDf_iB>I+jm(C7a0OFvTeQs|Ae4^$3Od8FuBRn4 zL~AR0!@6ckqtR~%wGTAe#0e)vQq5Ep)HP>N|FMmVEOL2GY@^dKAq4JN1kX$@1=11M zGSqvR>Qf+6Q4XxKOy0RLu7*9N8L;E%Sdxf20-R`z8im@^?fx1HCxzeyF5M(V~1WEI>H`78JO`9R3<$92w3L;|4YKhK9i^ zF9yy4paBvWgw$RfdEiaRR!2Ip=^DShw#W$tLlSRo5AE0*auIbnbbNx506Q=(D;9C! z>iyxFqtWpR>L2^9D$INuwRv}AeIbMmZYoo9V(`Gb5pgPku1!(c6fG``MDZrO8aDJ( z08$(}p3Zr^sKSr?1W6Vol{ebAwvVfv)`U+802&ZdjYy*cQ^VhIYc1Wqjh{U7pxEj=vo6$Lnx+Db> z`KL2kAT?na*gK4$4}8&VVzQ|jGEN8f5GT}QN9RpB(@x5PvXDpw6cPXg4}gafvI}0t z*u%9IUx@*0+6^(GW$X=g9B(^#X8Y{2T|OcvR4Wl<_DefUTIvqG;C7E1*5EN7TkvfV z`p|>6UP-I$wD+A@&j@-pwNj~gaD|*iHZvKW_Yk3LX|$KFeYqkK)>%Jka*33_FQo;EtO$VvMFROK~2E|e8QhgvJth`iw zQ!JKsPC(MzkUH9uvXCU+7+ZrlW7WY?uWbNI2!7%* zI>%_9MnID89KIx{HW!XHNd0R-Uz6_5IlV(X$!VQU!eStt$ZYUkjT78LN8GWPj;SIU z1X_f4D(VOo!jFh-l5{)jm=ASvr zDEBK|k(K`kGC68$0~5gVmcnL;6YjZ(Jf&`6$|~(vSpsj>R}vDMBP%!X;YF?2derbt zNb=#U=$G(RuwR`x!J;}$1a~SOq7De{iI1QrvFqwAvxq`J@S;jTjqI@V3uIXdp@trD zIueEOOu1zilI&Ozf7Icr8KI#O%msX&iw%$PGj&v4ReL}U{{UHCZPW)S5_D1kbq9VX zTs#zqAP&^#wksIXm(Dg|vl#B{e%J;Q2_RMR?A_t zU>^vIwGh3s!~_fmNID$_Xj(E+CzldWyOA6^PRazKcvVD~lSSH?OYurV;ID7im`cHT z%i-;u_X*Ve+~^UQ+Lom=EgjKJ5Mi~YLvFX`rx{o!NElB%)YvVQw~P9Tlt79i^7707SaLuel#lZiS|hrp2zsWmK`f;5opvlc(~NVf%p#$r6>jI zgM&4bYmGuRklUa|4;5K^AQZy5&3)U}9^}TtKnu}JB!u@?YM_uu?~zadqDPGQ88S7P z#f-LQVh+aFxf^hs)h}AngCh!VAt4*C3?Z2$iFFPG=ZAw3UFrm%kywSL2Qq%@Sb!u$ zvFbv)4dJhnI)Q>>di9ptRBNA0GJw-4}J{sfz-l8fJCJ5RLr#J5lJ&< z##-y}5uP^63iri;@p8f$Ayn{2D_ssHpr%t&gRMJKWO|*C=uaa7OmGPI^5h7@c>Ou> zuQK&SrZD)-ik~XfJSs@U=T-zC&02fDZR}bh&rU5erd$={j6EZuiVrQFiaA3mWx4bXP}qN+H7c{C1*W&uG8=U z$U)X^9Ra3F8yW{&6j-O^n1GH*E3PgHUf0C;7qKFU4XjP4-ETm_GIPkLZgQX1#`{^u zg2p6`u`BW`yJ-Uv9g6eArLJgjFjCMXx{xPiMS_a|2!u`p0ZX9*GY!A3_79T!3@c|D z(Dc%`y`l#|TN09S@{rhduv4oCv{enVf@!kIq%ZtEGOW4CRQ)0id`DeG*jVM#onVC2tbnr|(opl;Y$cHRP>sq2YDV(NFT7*ArR^SLMtbiQ7wpet) zpCTUdK+l+X@4x}D`1Zg$cp#e9nrhdFHNdT$GIXrjGOE=B1%r79HcXNN_uNC>3t+3N zucNVAv7gTAI!tTKXsCsU(#CH9|0e~?P~}Xd*ip5k@nI4IoDDi#);@%?@}q^Pt4#@# zuZ^t7IQi~%e1av@;X5=SqI+#~4KHGN7NsZ^cY0{xI@B{*Iq6GmNl?M)tc5LTVznW8 zrZf_Apr~6H4Il`!cG*n*`T@#Pd;ior42(7IY69CkuCYO9aMKYvU<6t%mt9G5wV}EH z!JtiRV9P)21FuxH==&+Y?7>4Hwr_>%CYh~Qt&LfdT!6U zt5rSJ`7GEaP$_pD@!Wl48waw}8@2BMDiqnZn7k_!=6Ca~DEj2ta0SxAb7-dHj-hId zkC$Sn7OrC7-AI8Gl!<3?acJxQR<@efBctDCnz`X5?P3USBaXTb5>!M6u24H879V8+ z|8Bwc4b#E2gijr+IZ@G|Ow#Ff+{#cRov%T^GJFkrpSnwzlsuP-GPz`cwmmcIo26rle7o$1_ z_%VkR$DGLgH>?F|3Yox&Bv|2D@&JYs>Qp9>g*V6g9|D0EiIwyr1fU8k7-tC_P3cjc zLM>{4Etym}*)gYB8XB^)cHkr!!Nplu2UkH5`~h)sby9SZ67Ne2En<9dc^~J! 
zbGYw5K&Y3QW_64Mnr@q^L|n{dSH-SZgdk82qeo_zF(*k$_^z*e1o(az<5~Xq{#?Ck z&SF47B%Wo4X%lY{PjA`==Y8S`E6OVIIq{fD4H7?cUGeyhbHQbSXGY9)YMwYkEEd{W zX=7G2HR37asH*9dFJwGcId5^+%2n3vlfN*W(^r0TpPFT|f9A{GP3qpPcZL!f~MU#c@7HfUaGjQFEN{W5;Qn z0KsSAN^kiqbztU`^jb@c9szyZz{Pb-Q}%$%9bn+ekWJZ@f>c6254@kzH)VnTTOhRN z&8>Nk(+40;vr66o2Zz9DfwI?q-re0kw}0<6=l25uA#!;(q=Ie$000JJOGiWi{{a60 z|De66lK=n!32;bRa{vG>`~Uzg`~j}R?3e%m00(qQO+^Re3IYlqAt?g6@c;k|+et)0 zRA}DqT6uI;)w%zD-#+);n>i#RQo!>X^OO=(CIC}Ibz7J7KL2C^Wxu)k=tXP4v(lR6_CF1jLJK7qNW|D1N{xc`f z9L7%nzOO#`(kd!j1Ix0$m-8_*MC6*z*Q!-3#KgpacI?=JzJ2>%^GjV#GBY!=Z{I!~ zJ9g}<`pChEn~@w8+%Du1?-_5QGuoBwtEijjigrzeqF)mciD(k`2RG zO%q!_5fOk83C37vugdp@So#Iq zYfl)k{I1`ygNHRgBPkm1Y~?CTo}UrZVgYL9ZjF>hS&u~Yjvhz`XOZIPYfn_e8RdGn z?wt3nzrVd}@9|Yf-aNPMbd2`vVPeZ10KUt1iJnHVqbPCi;D{Ruka43nx@CN?wosHr zTJp4CKP9&I8=;P>;~yW%J(R!C27ogqCo2F{060!eAG_yePUj$107Bn-kYXL7=34;9 zwKxVTmkI;0M}SU5lnKDGeNw5f0i197GA$?{w|<|lwgy3Ddn1t)J>V|LOl)xg8KI6m zHh6|I)z9oYtDtm@=QGh)cjj^Z<8Wd6H|MPyYb1cW28`HLPq!F=hzTi$+2iaQYdU~a zV$1qX53k#Gt?T`MKep$6ez4vrHR!yn_p1r676FiG0Vx{6PSxF4;Y|{ej^e+*w4!!< z6-3LD0EZnK(kZN0i!m|nvp0&ZkCKCS^$udXd~k!hwumSe+BXtpa5+2p zQzbZZXyN`o|B9advv$_B9o!@ha<&KtUHrdFSQmXKMN7G8DUBK0?an|ZT&u+Y@1XWg zGIXMf7QrB5f+NhFsQr45@2uD3Y8$*vv?KwzE=OcWI#UBz_|Naq>Ce}^!6q*cj9A*QpV&NcL&c3c_YEsnkAHO7?ws3WmJ#gOAhzDYrh2V@YHI58 zoP)5jUPO)o2qLCFq-Y?3peFg0rC%!^xw@c5R`UQ*I>vK2b!qk>?bm&j696@MVCn$_ zJqZ6a{M*{nQ;Y9kJ-x*@%r*{R{bkMZ>Q6+=O8}~_Aqtk-b+)@?RMGFRH#PXtF{nXy zJh#cmA^e*=XQa$@MFkB9AstLVh2W<8CfOl~Y+{=~+%Rg@Cj5Y6{7@U~YK_g0Z#WNN zHZjeCvwK5w zFp@t=k1rX$rkq@ies8$DzFR%Fc09(PY|bDiMf0MQ+vYYIbns1V@MlSHq-V7m8WTAF z2pRmD3NczYI$y0391}W{nHF??_%;=&C-%9%oVqk;T2Q-?qim`rZ18+XxVblVSx!O! zcLq2y8c29^;N7hg+a6L*wVD*YuR@Hnc7r=^aWBfcJwVljpbjBR`n;ClXc*{_(SYpc1vUrFHl5)01-fd_Sr94)&5lN*JA*jC4(2FEY2Pt*p6|v z33>=X6q{-i0I-wiA3c41r6SeT#K_>u5L6MCc~MiqT1T22sA3gi+;L`$Xi>&c|(vD9TbuUl2$ z{ST14PNr3gt(UXnZg=D*cLIR0t`Q?Z)Y+=>kA>i(5$t&SicyRm{44|~POLmKU)%ak zVjAdPklC-<9%n7Doee>qa;hQ`bP1f?-fLffQk*7QX4M?O_yGi`l&PMnrx&BC5j1sv zCO6F4|EwNsww5+OykVAD`gg=Msr!HR{N;DI-2i|Zjj_wPTZs&sB3d@@e|zUgqUAkO zG$v_odWVMK!bF7Rc^N%P(GU@-?J%^X{ijrXsfDq7|k!Nzj{99P5VlU8fPNzuKc^^xt*lpg7Dcb6<; z>H~op`}7@iA<_$gX@nY2&iv!dOJV-r-b2x0Y0t?3>WJttDQZ{WnA$^%ewOrpde?K? zPWuBlDjBo(Z|;Q|LzPo`0E7TQ(!7jYNYQ-|>}G=_0YqM^sSsnhjHk;U@cd`$(wteO z=-JTj;U4#*tXIU+7nDnl0#UmQ2P(&iUr!^a@|fY~UsDRRU(mJofuxW@sbtVJ2u@qo z{puGsSDjl)+>Y^;Ok4vsOyonNS6V-f_fi3CI|(J0Mbv&p?M zb0nK;#ig*So?>FFx}rLJ^F!-TTuN#PsSC5;C8qHSv(tX{&4%Ms+YIR7Bt=gsr&WNR!`SBCB7(JR*8%`0W=#0B z$MW9UmiAQ|wkm)n?{9u+{fR(5zdiNFi%|4&P)Lyfm0>gOF&QJDik%lf({!0>hB6#A zwY8t`9`pIGLeFBrGZlY$=MROa)2dbmwGGY+HA7mT_f{P|weH0F_a1!5s`X4kUS1x& zHcf}7eCJ`M1NN$Y>DcspA|oSW0YF)55Bq3XK!m+evE@wpxOOAGzxF(XX(#hev5)RS n9o98_@A5%~Lo%7D0s{O4_yF;NymWiy00000NkvXXu0mjfw|`wS diff --git a/doc/templates/index.html b/doc/templates/index.html index e17111fb48eef..8d3bdfaec2b28 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -252,7 +252,6 @@

Who uses scikit-learn?

-
From b9403f62ac65e7e6575168ef74b43fb012010599 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 29 Apr 2020 17:42:25 +0200 Subject: [PATCH 095/125] DOC Better headers in Poisson regression example (#17080) --- .../plot_poisson_regression_non_normal_loss.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3a24b55848013..59c07580b81ba 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -184,7 +184,7 @@ def score_estimator(estimator, df_test): score_estimator(dummy, df_test) ############################################################################## -# (Generalized) Linear models +# (Generalized) linear models # --------------------------- # # We start by modeling the target variable with the (l2 penalized) least @@ -217,6 +217,12 @@ def score_estimator(estimator, df_test): # regularization strength ``alpha`` to approximately 1e-6 over number of # samples (i.e. `1e-12`) in order to mimic the Ridge regressor whose L2 penalty # term scales differently with the number of samples. +# +# Since the Poisson regressor internally models the log of the expected target +# value instead of the expected value directly (log vs identity link function), +# the relationship between X and y is not exactly linear anymore. Therefore the +# Poisson regressor is called a Generalized Linear Model (GLM) rather than a +# vanilla linear model as is the case for Ridge regression. from sklearn.linear_model import PoissonRegressor @@ -233,6 +239,9 @@ def score_estimator(estimator, df_test): score_estimator(poisson_glm, df_test) ############################################################################## +# Gradient Boosting Regression Trees for Poisson regression +# --------------------------------------------------------- +# # Finally, we will consider a non-linear model, namely Gradient Boosting # Regression Trees. 
Tree-based models do not require the categorical data to be # one-hot encoded: instead, we can encode each category label with an arbitrary From ee2508ce45fd7d491b25e354509cda26c14b16ec Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Wed, 29 Apr 2020 18:31:58 -0400 Subject: [PATCH 096/125] ENH Adds HTML visualizations for estimators (#14180) --- doc/modules/classes.rst | 1 + doc/modules/compose.rst | 25 ++ doc/whats_new/v0.23.rst | 8 + .../plot_column_transformer_mixed_types.py | 9 + sklearn/_config.py | 19 +- sklearn/base.py | 13 + sklearn/compose/_column_transformer.py | 6 + sklearn/ensemble/_stacking.py | 27 ++ sklearn/ensemble/_voting.py | 5 + sklearn/pipeline.py | 20 ++ sklearn/tests/test_base.py | 14 + sklearn/tests/test_config.py | 9 +- sklearn/utils/__init__.py | 3 +- sklearn/utils/_estimator_html_repr.py | 311 ++++++++++++++++++ .../utils/tests/test_estimator_html_repr.py | 267 +++++++++++++++ 15 files changed, 732 insertions(+), 5 deletions(-) create mode 100644 sklearn/utils/_estimator_html_repr.py create mode 100644 sklearn/utils/tests/test_estimator_html_repr.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3d9924638b69b..2489eaf55bac7 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1569,6 +1569,7 @@ Plotting utils.deprecated utils.estimator_checks.check_estimator utils.estimator_checks.parametrize_with_checks + utils.estimator_html_repr utils.extmath.safe_sparse_dot utils.extmath.randomized_range_finder utils.extmath.randomized_svd diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index cd29b14b1f081..e7dac0dadc630 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -528,6 +528,31 @@ above example would be:: ('countvectorizer', CountVectorizer(), 'title')]) +.. _visualizing_composite_estimators: + +Visualizing Composite Estimators +================================ + +Estimators can be displayed with a HTML representation when shown in a +jupyter notebook. This can be useful to diagnose or visualize a Pipeline with +many estimators. This visualization is activated by setting the +`display` option in :func:`sklearn.set_config`:: + + >>> from sklearn import set_config + >>> set_config(display='diagram') # doctest: +SKIP + >>> # diplays HTML representation in a jupyter context + >>> column_trans # doctest: +SKIP + +An example of the HTML output can be seen in the +**HTML representation of Pipeline** section of +:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`. +As an alternative, the HTML can be written to a file using +:func:`~sklearn.utils.estimator_html_repr`:: + + >>> from sklearn.utils import estimator_html_repr + >>> with open('my_estimator.html', 'w') as f: # doctest: +SKIP + ... f.write(estimator_html_repr(clf)) + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_compose_plot_column_transformer.py` diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 0e149ed03a9fa..1ac63ca473faf 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -567,6 +567,9 @@ Changelog :mod:`sklearn.utils` .................... +- |Feature| Adds :func:`utils.estimator_html_repr` for returning a + HTML representation of an estimator. :pr:`14180` by `Thomas Fan`_. + - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`. :pr:`15926` by :user:`Loïc Estève `. @@ -605,6 +608,11 @@ Changelog Miscellaneous ............. +- |MajorFeature| Adds a HTML representation of estimators to be shown in + a jupyter notebook or lab. 
This visualization is acitivated by setting the + `display` option in :func:`sklearn.set_config`. :pr:`14180` by + `Thomas Fan`_. + - |Enhancement| ``scikit-learn`` now works with ``mypy`` without errors. :pr:`16726` by `Roman Yurchak`_. diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1c79c4bb1d607..24fc4d69e35d0 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -87,6 +87,15 @@ clf.fit(X_train, y_train) print("model score: %.3f" % clf.score(X_test, y_test)) +############################################################################## +# HTML representation of ``Pipeline`` +############################################################################### +# When the ``Pipeline`` is printed out in a jupyter notebook an HTML +# representation of the estimator is displayed as follows: +from sklearn import set_config +set_config(display='diagram') +clf + ############################################################################### # Use ``ColumnTransformer`` by selecting column by data types ############################################################################### diff --git a/sklearn/_config.py b/sklearn/_config.py index 44eaae1d59012..f183203e13228 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -7,6 +7,7 @@ 'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)), 'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)), 'print_changed_only': True, + 'display': 'text', } @@ -27,7 +28,7 @@ def get_config(): def set_config(assume_finite=None, working_memory=None, - print_changed_only=None): + print_changed_only=None, display=None): """Set global scikit-learn configuration .. versionadded:: 0.19 @@ -59,6 +60,13 @@ def set_config(assume_finite=None, working_memory=None, .. versionadded:: 0.21 + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as text in a jupyter lab + of notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 + See Also -------- config_context: Context manager for global scikit-learn configuration @@ -70,6 +78,8 @@ def set_config(assume_finite=None, working_memory=None, _global_config['working_memory'] = working_memory if print_changed_only is not None: _global_config['print_changed_only'] = print_changed_only + if display is not None: + _global_config['display'] = display @contextmanager @@ -100,6 +110,13 @@ def config_context(**new_config): .. versionchanged:: 0.23 Default changed from False to True. + display : {'text', 'diagram'}, optional + If 'diagram', estimators will be displayed as text in a jupyter lab + of notebook context. If 'text', estimators will be displayed as + text. Default is 'text'. + + .. versionadded:: 0.23 + Notes ----- All settings, not just those presently modified, will be returned to diff --git a/sklearn/base.py b/sklearn/base.py index bf5ee370aa8f1..666574b491594 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -17,9 +17,11 @@ import numpy as np from . 
import __version__ +from ._config import get_config from .utils import _IS_32BIT from .utils.validation import check_X_y from .utils.validation import check_array +from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args _DEFAULT_TAGS = { @@ -435,6 +437,17 @@ def _validate_data(self, X, y=None, reset=True, return out + def _repr_html_(self): + """HTML representation of estimator""" + return estimator_html_repr(self) + + def _repr_mimebundle_(self, **kwargs): + """Mime bundle used by jupyter kernels to display estimator""" + output = {"text/plain": repr(self)} + if get_config()["display"] == 'diagram': + output["text/html"] = estimator_html_repr(self) + return output + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 2ef8876b0c4e7..f148633021a97 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -15,6 +15,7 @@ from joblib import Parallel, delayed from ..base import clone, TransformerMixin +from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer from ..utils import Bunch @@ -637,6 +638,11 @@ def _hstack(self, Xs): Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] return np.hstack(Xs) + def _sk_visual_block_(self): + names, transformers, name_details = zip(*self.transformers) + return _VisualBlock('parallel', transformers, + names=names, name_details=name_details) + def _check_X(X): """Use check_array only on lists and other non-array-likes / sparse""" diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index a75e9236f1612..73aa55c0575a7 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..utils._estimator_html_repr import _VisualBlock from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble @@ -233,6 +234,14 @@ def predict(self, X, **predict_params): self.transform(X), **predict_params ) + def _sk_visual_block_(self, final_estimator): + names, estimators = zip(*self.estimators) + parallel = _VisualBlock('parallel', estimators, names=names, + dash_wrapped=False) + serial = _VisualBlock('serial', (parallel, final_estimator), + dash_wrapped=False) + return _VisualBlock('serial', [serial]) + class StackingClassifier(ClassifierMixin, _BaseStacking): """Stack of estimators with a final classifier. @@ -496,6 +505,15 @@ def transform(self, X): """ return self._transform(X) + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. + if self.final_estimator is None: + final_estimator = LogisticRegression() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) + class StackingRegressor(RegressorMixin, _BaseStacking): """Stack of estimators with a final regressor. @@ -665,3 +683,12 @@ def transform(self, X): Prediction outputs for each estimator. """ return self._transform(X) + + def _sk_visual_block_(self): + # If final_estimator's default changes then this should be + # updated. 
+ if self.final_estimator is None: + final_estimator = RidgeCV() + else: + final_estimator = self.final_estimator + return super()._sk_visual_block_(final_estimator) diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 0ac42407f5998..6a2b5736d8b4e 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -32,6 +32,7 @@ from ..utils.validation import column_or_1d from ..utils.validation import _deprecate_positional_args from ..exceptions import NotFittedError +from ..utils._estimator_html_repr import _VisualBlock class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @@ -104,6 +105,10 @@ def n_features_in_(self): return self.estimators_[0].n_features_in_ + def _sk_visual_block_(self): + names, estimators = zip(*self.estimators) + return _VisualBlock('parallel', estimators, names=names) + class VotingClassifier(ClassifierMixin, _BaseVoting): """Soft Voting/Majority Rule classifier for unfitted estimators. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8e2a539786557..6f02cb565e15c 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -18,6 +18,7 @@ from joblib import Parallel, delayed from .base import clone, TransformerMixin +from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import if_delegate_has_method from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory @@ -623,6 +624,21 @@ def n_features_in_(self): # delegate to first step (which will call _check_is_fitted) return self.steps[0][1].n_features_in_ + def _sk_visual_block_(self): + _, estimators = zip(*self.steps) + + def _get_name(name, est): + if est is None or est == 'passthrough': + return f'{name}: passthrough' + # Is an estimator + return f'{name}: {est.__class__.__name__}' + names = [_get_name(name, est) for name, est in self.steps] + name_details = [str(est) for est in estimators] + return _VisualBlock('serial', estimators, + names=names, + name_details=name_details, + dash_wrapped=False) + def _name_estimators(estimators): """Generate names for estimators.""" @@ -1004,6 +1020,10 @@ def n_features_in_(self): # X is passed to all transformers so we just delegate to the first one return self.transformer_list[0][1].n_features_in_ + def _sk_visual_block_(self): + names, transformers = zip(*self.transformer_list) + return _VisualBlock('parallel', transformers, names=names) + def make_union(*transformers, **kwargs): """ diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 52f2e60b4af70..e20fa440d1933 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -23,6 +23,7 @@ from sklearn.base import TransformerMixin from sklearn.utils._mocking import MockDataFrame +from sklearn import config_context import pickle @@ -511,3 +512,16 @@ def fit(self, X, y=None): params = est.get_params() assert params['param'] is None + + +def test_repr_mimebundle_(): + # Checks the display configuration flag controls the json output + tree = DecisionTreeClassifier() + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" not in output + + with config_context(display='diagram'): + output = tree._repr_mimebundle_() + assert "text/plain" in output + assert "text/html" in output diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index ae13c61838694..eec349861258c 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -4,7 +4,8 @@ def test_config_context(): assert get_config() == {'assume_finite': False, 
'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} # Not using as a context manager affects nothing config_context(assume_finite=True) @@ -12,7 +13,8 @@ def test_config_context(): with config_context(assume_finite=True): assert get_config() == {'assume_finite': True, 'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} assert get_config()['assume_finite'] is False with config_context(assume_finite=True): @@ -37,7 +39,8 @@ def test_config_context(): assert get_config()['assume_finite'] is True assert get_config() == {'assume_finite': False, 'working_memory': 1024, - 'print_changed_only': True} + 'print_changed_only': True, + 'display': 'text'} # No positional arguments assert_raises(TypeError, config_context, True) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index afde7614070fd..f814ea11c12c1 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -25,6 +25,7 @@ from ..exceptions import DataConversionWarning from .deprecation import deprecated from .fixes import np_version +from ._estimator_html_repr import estimator_html_repr from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, @@ -52,7 +53,7 @@ "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", - "DataConversionWarning" + "DataConversionWarning", "estimator_html_repr" ] IS_PYPY = platform.python_implementation() == 'PyPy' diff --git a/sklearn/utils/_estimator_html_repr.py b/sklearn/utils/_estimator_html_repr.py new file mode 100644 index 0000000000000..9b2e45790fd2b --- /dev/null +++ b/sklearn/utils/_estimator_html_repr.py @@ -0,0 +1,311 @@ +from contextlib import closing +from contextlib import suppress +from io import StringIO +import uuid +import html + +from sklearn import config_context + + +class _VisualBlock: + """HTML Representation of Estimator + + Parameters + ---------- + kind : {'serial', 'parallel', 'single'} + kind of HTML block + + estimators : list of estimators or `_VisualBlock`s or a single estimator + If kind != 'single', then `estimators` is a list of + estimators. + If kind == 'single', then `estimators` is a single estimator. + + names : list of str + If kind != 'single', then `names` corresponds to estimators. + If kind == 'single', then `names` is a single string corresponding to + the single estimator. + + name_details : list of str, str, or None, default=None + If kind != 'single', then `name_details` corresponds to `names`. + If kind == 'single', then `name_details` is a single string + corresponding to the single estimator. + + dash_wrapped : bool, default=True + If true, wrapped HTML element will be wrapped with a dashed border. + Only active when kind != 'single'. 
+ """ + def __init__(self, kind, estimators, *, names=None, name_details=None, + dash_wrapped=True): + self.kind = kind + self.estimators = estimators + self.dash_wrapped = dash_wrapped + + if self.kind in ('parallel', 'serial'): + if names is None: + names = (None, ) * len(estimators) + if name_details is None: + name_details = (None, ) * len(estimators) + + self.names = names + self.name_details = name_details + + def _sk_visual_block_(self): + return self + + +def _write_label_html(out, name, name_details, + outer_class="sk-label-container", + inner_class="sk-label", + checked=False): + """Write labeled html with or without a dropdown with named details""" + out.write(f'
' + f'
') + name = html.escape(name) + + if name_details is not None: + checked_str = 'checked' if checked else '' + est_id = uuid.uuid4() + out.write(f'' + f'' + f'
{name_details}'
+                  f'
') + else: + out.write(f'') + out.write('
') # outer_class inner_class + + +def _get_visual_block(estimator): + """Generate information about how to display an estimator. + """ + with suppress(AttributeError): + return estimator._sk_visual_block_() + + if isinstance(estimator, str): + return _VisualBlock('single', estimator, + names=estimator, name_details=estimator) + elif estimator is None: + return _VisualBlock('single', estimator, + names='None', name_details='None') + + # check if estimator looks like a meta estimator wraps estimators + if hasattr(estimator, 'get_params'): + estimators = [] + for key, value in estimator.get_params().items(): + # Only look at the estimators in the first layer + if '__' not in key and hasattr(value, 'get_params'): + estimators.append(value) + if len(estimators): + return _VisualBlock('parallel', estimators, names=None) + + return _VisualBlock('single', estimator, + names=estimator.__class__.__name__, + name_details=str(estimator)) + + +def _write_estimator_html(out, estimator, estimator_label, + estimator_label_details, first_call=False): + """Write estimator to html in serial, parallel, or by itself (single). + """ + if first_call: + est_block = _get_visual_block(estimator) + else: + with config_context(print_changed_only=True): + est_block = _get_visual_block(estimator) + + if est_block.kind in ('serial', 'parallel'): + dashed_wrapped = first_call or est_block.dash_wrapped + dash_cls = " sk-dashed-wrapped" if dashed_wrapped else "" + out.write(f'
') + + if estimator_label: + _write_label_html(out, estimator_label, estimator_label_details) + + kind = est_block.kind + out.write(f'
') + est_infos = zip(est_block.estimators, est_block.names, + est_block.name_details) + + for est, name, name_details in est_infos: + if kind == 'serial': + _write_estimator_html(out, est, name, name_details) + else: # parallel + out.write('
') + # wrap element in a serial visualblock + serial_block = _VisualBlock('serial', [est], + dash_wrapped=False) + _write_estimator_html(out, serial_block, name, name_details) + out.write('
') # sk-parallel-item + + out.write('
') + elif est_block.kind == 'single': + _write_label_html(out, est_block.names, est_block.name_details, + outer_class="sk-item", inner_class="sk-estimator", + checked=first_call) + + +_STYLE = """ +div.sk-top-container { + color: black; + background-color: white; +} +div.sk-toggleable { + background-color: white; +} +label.sk-toggleable__label { + cursor: pointer; + display: block; + width: 100%; + margin-bottom: 0; + padding: 0.2em 0.3em; + box-sizing: border-box; + text-align: center; +} +div.sk-toggleable__content { + max-height: 0; + max-width: 0; + overflow: hidden; + text-align: left; + background-color: #f0f8ff; +} +div.sk-toggleable__content pre { + margin: 0.2em; + color: black; + border-radius: 0.25em; + background-color: #f0f8ff; +} +input.sk-toggleable__control:checked~div.sk-toggleable__content { + max-height: 200px; + max-width: 100%; + overflow: auto; +} +div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label { + background-color: #d4ebff; +} +div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label { + background-color: #d4ebff; +} +input.sk-hidden--visually { + border: 0; + clip: rect(1px 1px 1px 1px); + clip: rect(1px, 1px, 1px, 1px); + height: 1px; + margin: -1px; + overflow: hidden; + padding: 0; + position: absolute; + width: 1px; +} +div.sk-estimator { + font-family: monospace; + background-color: #f0f8ff; + margin: 0.25em 0.25em; + border: 1px dotted black; + border-radius: 0.25em; + box-sizing: border-box; +} +div.sk-estimator:hover { + background-color: #d4ebff; +} +div.sk-parallel-item::after { + content: ""; + width: 100%; + border-bottom: 1px solid gray; + flex-grow: 1; +} +div.sk-label:hover label.sk-toggleable__label { + background-color: #d4ebff; +} +div.sk-serial::before { + content: ""; + position: absolute; + border-left: 1px solid gray; + box-sizing: border-box; + top: 2em; + bottom: 0; + left: 50%; +} +div.sk-serial { + display: flex; + flex-direction: column; + align-items: center; + background-color: white; +} +div.sk-item { + z-index: 1; +} +div.sk-parallel { + display: flex; + align-items: stretch; + justify-content: center; + background-color: white; +} +div.sk-parallel-item { + display: flex; + flex-direction: column; + position: relative; + background-color: white; +} +div.sk-parallel-item:first-child::after { + align-self: flex-end; + width: 50%; +} +div.sk-parallel-item:last-child::after { + align-self: flex-start; + width: 50%; +} +div.sk-parallel-item:only-child::after { + width: 0; +} +div.sk-dashed-wrapped { + border: 1px dashed gray; + margin: 0.2em; + box-sizing: border-box; + padding-bottom: 0.1em; + background-color: white; + position: relative; +} +div.sk-label label { + font-family: monospace; + font-weight: bold; + background-color: white; + display: inline-block; + line-height: 1.2em; +} +div.sk-label-container { + position: relative; + z-index: 2; + text-align: center; +} +div.sk-container { + display: inline-block; + position: relative; +} +""".replace(' ', '').replace('\n', '') # noqa + + +def estimator_html_repr(estimator): + """Build a HTML representation of an estimator. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + estimator : estimator object + The estimator to visualize. + + Returns + ------- + html: str + HTML representation of estimator. + """ + with closing(StringIO()) as out: + out.write(f'' + f'
') + _write_estimator_html(out, estimator, estimator.__class__.__name__, + str(estimator), first_call=True) + out.write('
') + + html_output = out.getvalue() + return html_output diff --git a/sklearn/utils/tests/test_estimator_html_repr.py b/sklearn/utils/tests/test_estimator_html_repr.py new file mode 100644 index 0000000000000..47d33051bd9a7 --- /dev/null +++ b/sklearn/utils/tests/test_estimator_html_repr.py @@ -0,0 +1,267 @@ +from contextlib import closing +from io import StringIO + +import pytest + +from sklearn import config_context +from sklearn.linear_model import LogisticRegression +from sklearn.neural_network import MLPClassifier +from sklearn.impute import SimpleImputer +from sklearn.decomposition import PCA +from sklearn.decomposition import TruncatedSVD +from sklearn.pipeline import Pipeline +from sklearn.pipeline import FeatureUnion +from sklearn.compose import ColumnTransformer +from sklearn.ensemble import VotingClassifier +from sklearn.feature_selection import SelectPercentile +from sklearn.cluster import Birch +from sklearn.cluster import AgglomerativeClustering +from sklearn.preprocessing import OneHotEncoder +from sklearn.svm import LinearSVC +from sklearn.svm import LinearSVR +from sklearn.tree import DecisionTreeClassifier +from sklearn.multiclass import OneVsOneClassifier +from sklearn.ensemble import StackingClassifier +from sklearn.ensemble import StackingRegressor +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RationalQuadratic +from sklearn.utils._estimator_html_repr import _write_label_html +from sklearn.utils._estimator_html_repr import _get_visual_block +from sklearn.utils._estimator_html_repr import estimator_html_repr + + +@pytest.mark.parametrize("checked", [True, False]) +def test_write_label_html(checked): + # Test checking logic and labeling + name = "LogisticRegression" + tool_tip = "hello-world" + + with closing(StringIO()) as out: + _write_label_html(out, name, tool_tip, checked=checked) + html_label = out.getvalue() + assert 'LogisticRegression' in html_label + assert html_label.startswith('
') + assert '
hello-world
' in html_label + if checked: + assert 'checked>' in html_label + + +@pytest.mark.parametrize('est', ['passthrough', 'drop', None]) +def test_get_visual_block_single_str_none(est): + # Test estimators that are represnted by strings + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == str(est) + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_single_estimator(): + est = LogisticRegression(C=10.0) + est_html_info = _get_visual_block(est) + assert est_html_info.kind == 'single' + assert est_html_info.estimators == est + assert est_html_info.names == est.__class__.__name__ + assert est_html_info.name_details == str(est) + + +def test_get_visual_block_pipeline(): + pipe = Pipeline([ + ('imputer', SimpleImputer()), + ('do_nothing', 'passthrough'), + ('do_nothing_more', None), + ('classifier', LogisticRegression()) + ]) + est_html_info = _get_visual_block(pipe) + assert est_html_info.kind == 'serial' + assert est_html_info.estimators == tuple(step[1] for step in pipe.steps) + assert est_html_info.names == ['imputer: SimpleImputer', + 'do_nothing: passthrough', + 'do_nothing_more: passthrough', + 'classifier: LogisticRegression'] + assert est_html_info.name_details == [str(est) for _, est in pipe.steps] + + +def test_get_visual_block_feature_union(): + f_union = FeatureUnion([ + ('pca', PCA()), ('svd', TruncatedSVD()) + ]) + est_html_info = _get_visual_block(f_union) + assert est_html_info.kind == 'parallel' + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.estimators == tuple( + trans[1] for trans in f_union.transformer_list) + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_voting(): + clf = VotingClassifier([ + ('log_reg', LogisticRegression()), + ('mlp', MLPClassifier()) + ]) + est_html_info = _get_visual_block(clf) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple(trans[1] + for trans in clf.estimators) + assert est_html_info.names == ('log_reg', 'mlp') + assert est_html_info.name_details == (None, None) + + +def test_get_visual_block_column_transformer(): + ct = ColumnTransformer([ + ('pca', PCA(), ['num1', 'num2']), + ('svd', TruncatedSVD, [0, 3]) + ]) + est_html_info = _get_visual_block(ct) + assert est_html_info.kind == 'parallel' + assert est_html_info.estimators == tuple( + trans[1] for trans in ct.transformers) + assert est_html_info.names == ('pca', 'svd') + assert est_html_info.name_details == (['num1', 'num2'], [0, 3]) + + +def test_estimator_html_repr_pipeline(): + num_trans = Pipeline(steps=[ + ('pass', 'passthrough'), + ('imputer', SimpleImputer(strategy='median')) + ]) + + cat_trans = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', + missing_values='empty')), + ('one-hot', OneHotEncoder(drop='first')) + ]) + + preprocess = ColumnTransformer([ + ('num', num_trans, ['a', 'b', 'c', 'd', 'e']), + ('cat', cat_trans, [0, 1, 2, 3]) + ]) + + feat_u = FeatureUnion([ + ('pca', PCA(n_components=1)), + ('tsvd', Pipeline([('first', TruncatedSVD(n_components=3)), + ('select', SelectPercentile())])) + ]) + + clf = VotingClassifier([ + ('lr', LogisticRegression(solver='lbfgs', random_state=1)), + ('mlp', MLPClassifier(alpha=0.001)) + ]) + + pipe = Pipeline([ + ('preprocessor', preprocess), ('feat_u', feat_u), ('classifier', clf) + ]) + html_output = estimator_html_repr(pipe) + + # top level estimators show estimator with changes + assert str(pipe) in html_output 
+ for _, est in pipe.steps: + assert (f"
" + f"
{str(est)}") in html_output
+
+    # low level estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert str(num_trans['pass']) in html_output
+        assert 'passthrough' in html_output
+        assert str(num_trans['imputer']) in html_output
+
+        for _, _, cols in preprocess.transformers:
+            assert f"
{cols}
" in html_output + + # feature union + for name, _ in feat_u.transformer_list: + assert f"" in html_output + + pca = feat_u.transformer_list[0][1] + assert f"
{str(pca)}
" in html_output + + tsvd = feat_u.transformer_list[1][1] + first = tsvd['first'] + select = tsvd['select'] + assert f"
{str(first)}
" in html_output + assert f"
{str(select)}
" in html_output + + # voting classifer + for name, est in clf.estimators: + assert f"" in html_output + assert f"
{str(est)}
" in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVC()]) +def test_stacking_classsifer(final_estimator): + estimators = [('mlp', MLPClassifier(alpha=0.001)), + ('tree', DecisionTreeClassifier())] + clf = StackingClassifier( + estimators=estimators, final_estimator=final_estimator) + + html_output = estimator_html_repr(clf) + + assert str(clf) in html_output + # If final_estimator's default changes from LogisticRegression + # this should be updated + if final_estimator is None: + assert "LogisticRegression(" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +@pytest.mark.parametrize("final_estimator", [None, LinearSVR()]) +def test_stacking_regressor(final_estimator): + reg = StackingRegressor( + estimators=[('svr', LinearSVR())], final_estimator=final_estimator) + html_output = estimator_html_repr(reg) + + assert str(reg.estimators[0][0]) in html_output + assert "LinearSVR" in html_output + if final_estimator is None: + assert "RidgeCV" in html_output + else: + assert final_estimator.__class__.__name__ in html_output + + +def test_birch_duck_typing_meta(): + # Test duck typing meta estimators with Birch + birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3)) + html_output = estimator_html_repr(birch) + + # inner estimators do not show changes + with config_context(print_changed_only=True): + assert f"
{str(birch.n_clusters)}" in html_output
+        assert "AgglomerativeClustering" in html_output
+
+    # outer estimator contains all changes
+    assert f"<pre>{str(birch)}" in html_output
+
+
+def test_ovo_classifier_duck_typing_meta():
+    # Test duck typing metaestimators with OVO
+    ovo = OneVsOneClassifier(LinearSVC(penalty='l1'))
+    html_output = estimator_html_repr(ovo)
+
+    # inner estimators do not show changes
+    with config_context(print_changed_only=True):
+        assert f"<pre>{str(ovo.estimator)}" in html_output
+        assert "LinearSVC" in html_output
+
+    # outer estimator
+    assert f"<pre>{str(ovo)}" in html_output
+
+
+def test_duck_typing_nested_estimator():
+    # Test duck typing metaestimators with GP
+    kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
+    gp = GaussianProcessRegressor(kernel=kernel)
+    html_output = estimator_html_repr(gp)
+
+    assert f"<pre>{str(kernel)}" in html_output
+    assert f"<pre>{str(gp)}" in html_output
+
+
+@pytest.mark.parametrize('print_changed_only', [True, False])
+def test_one_estimator_print_change_only(print_changed_only):
+    pca = PCA(n_components=10)
+
+    with config_context(print_changed_only=print_changed_only):
+        pca_repr = str(pca)
+        html_output = estimator_html_repr(pca)
+        assert pca_repr in html_output
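
To make the duck-typing hook exercised above concrete, here is a small
illustrative sketch (not part of the patch) of a third-party meta-estimator
opting into the new HTML display by defining ``_sk_visual_block_``. The
``TwoStep`` wrapper and its parameter names are invented for this example, and
``_VisualBlock`` is private API that may change::

    from sklearn.base import BaseEstimator
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.utils import estimator_html_repr
    from sklearn.utils._estimator_html_repr import _VisualBlock


    class TwoStep(BaseEstimator):
        """Toy wrapper around a scaler followed by a classifier."""

        def __init__(self, scaler=None, clf=None):
            self.scaler = scaler
            self.clf = clf

        def _sk_visual_block_(self):
            # Render the two wrapped estimators one after the other in the
            # diagram, in the same way Pipeline does above.
            return _VisualBlock('serial', [self.scaler, self.clf],
                                names=['scaler', 'clf'])


    est = TwoStep(scaler=StandardScaler(), clf=LogisticRegression())
    html = estimator_html_repr(est)  # raw HTML string, e.g. to embed in a report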

From 3a6c8c4b6ce17b84b01d47b66ead1797c04931bc Mon Sep 17 00:00:00 2001
From: Lucy Liu 
Date: Thu, 30 Apr 2020 16:59:44 +0200
Subject: [PATCH 097/125] DOC Remove unnecessary comment (#17091)

---
 .../miscellaneous/plot_partial_dependence_visualization_api.py   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/miscellaneous/plot_partial_dependence_visualization_api.py b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
index 761dad8b1e1fa..cbfa2c5e8ab64 100644
--- a/examples/miscellaneous/plot_partial_dependence_visualization_api.py
+++ b/examples/miscellaneous/plot_partial_dependence_visualization_api.py
@@ -98,7 +98,6 @@
 # which will plot the partial dependence curves of each model on the same axes.
 # The length of the axes list must be equal to the number of plots drawn.
 
-# Sets this image as the thumbnail for sphinx gallery
 # sphinx_gallery_thumbnail_number = 4
 fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))
 tree_disp.plot(ax=[ax1, ax2], line_kw={"label": "Decision Tree"})

From 0c0b834d1146257556ce934b38df1b7fe2ce9ef6 Mon Sep 17 00:00:00 2001
From: Bharat Raghunathan 
Date: Fri, 1 May 2020 18:02:01 +0530
Subject: [PATCH 098/125] DOC detail fit_intercept docstring (#17096)

---
 sklearn/linear_model/_ridge.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 309137bed2b5d..ca3fba196d6d3 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -628,9 +628,9 @@ class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge):
         number.
 
     fit_intercept : bool, default=True
-        Whether to calculate the intercept for this model. If set
+        Whether to fit the intercept for this model. If set
         to false, no intercept will be used in calculations
-        (i.e. data is expected to be centered).
+        (i.e. ``X`` and ``y`` are expected to be centered).
 
     normalize : bool, default=False
         This parameter is ignored when ``fit_intercept`` is set to False.
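
As a brief illustration (not from the patch) of what "expected to be centered"
means in practice: with pre-centered ``X`` and ``y``, ``fit_intercept=False``
should recover the same coefficients as the default, up to numerical
round-off::

    import numpy as np
    from sklearn.linear_model import Ridge

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = X @ np.array([1., -2., .5]) + 3. + .01 * rng.randn(50)

    ridge = Ridge(alpha=1.).fit(X, y)
    X_c, y_c = X - X.mean(axis=0), y - y.mean()
    ridge_centered = Ridge(alpha=1., fit_intercept=False).fit(X_c, y_c)

    print(np.allclose(ridge.coef_, ridge_centered.coef_))  # True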

From 863c1d683f888931f96d968faaf5b60c9c821723 Mon Sep 17 00:00:00 2001
From: Nicolas Hug 
Date: Fri, 1 May 2020 11:06:00 -0400
Subject: [PATCH 099/125] DOC Feature highlights for 0.23 (#17062)

---
 doc/modules/ensemble.rst                      |   2 +
 doc/whats_new/v0.23.rst                       |  37 ++--
 .../plot_release_highlights_0_23_0.py         | 165 ++++++++++++++++++
 3 files changed, 188 insertions(+), 16 deletions(-)
 create mode 100644 examples/release_highlights/plot_release_highlights_0_23_0.py

diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst
index 3cf8987fcfd5a..434cf146c2d4e 100644
--- a/doc/modules/ensemble.rst
+++ b/doc/modules/ensemble.rst
@@ -1018,6 +1018,8 @@ If no missing values were encountered for a given feature during training,
 then samples with missing values are mapped to whichever child has the most
 samples.
 
+.. _sw_hgbdt:
+
 Sample weight support
 ---------------------
 
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 1ac63ca473faf..fba75d62cc380 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -9,6 +9,10 @@ Version 0.23.0
 
 **In Development**
 
+For a short description of the main highlights of the release, please
+refer to
+:ref:`sphx_glr_auto_examples_release_highlights_plot_release_highlights_0_23_0.py`.
+
 
 .. include:: changelog_legend.inc
 
@@ -103,9 +107,9 @@ Changelog
   :pr:`16149` by :user:`Jeremie du Boisberranger ` and
   :user:`Alex Shacked `.
 
-- |Efficiency| The critical parts of :class:`cluster.KMeans` have a more
-  optimized implementation. Parallelism is now over the data instead of over
-  initializations allowing better scalability. :pr:`11950` by
+- |Efficiency| |MajorFeature| The critical parts of :class:`cluster.KMeans`
+  have a more optimized implementation. Parallelism is now over the data
+  instead of over initializations allowing better scalability. :pr:`11950` by
   :user:`Jeremie du Boisberranger `.
 
 - |Enhancement| :class:`cluster.KMeans` now supports sparse data when
@@ -124,6 +128,10 @@ Changelog
   could not have a `np.int64` type. :pr:`16484`
   by :user:`Jeremie du Boisberranger `.
 
+- |Fix| :class:`cluster.AgglomerativeClustering` now raises a specific error
+  when the distance matrix is not square and `affinity=precomputed`.
+  :pr:`16257` by :user:`Simona Maggio `.
+
 - |API| The ``n_jobs`` parameter of :class:`cluster.KMeans`,
   :class:`cluster.SpectralCoclustering` and
   :class:`cluster.SpectralBiclustering` is deprecated. They now use OpenMP
@@ -234,7 +242,7 @@ Changelog
   samples in the training set. :pr:`14516` by :user:`Johann Faouzi
   `.
 
-- |Feature| :class:`ensemble.HistGradientBoostingClassifier` and
+- |MajorFeature| :class:`ensemble.HistGradientBoostingClassifier` and
   :class:`ensemble.HistGradientBoostingRegressor` now support monotonic
   constraints, useful when features are supposed to have a positive/negative
   effect on the target. :pr:`15582` by `Nicolas Hug`_.
@@ -340,9 +348,10 @@ Changelog
   :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_,
   and `Olivier Grisel`_.
 
-- |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and
-  :class:`linear_model.Lasso` for dense feature matrix `X`.
-  :pr:`15436` by :user:`Christian Lorentzen `.
+- |MajorFeature| Support of `sample_weight` in
+  :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso` for dense
+  feature matrix `X`. :pr:`15436` by :user:`Christian Lorentzen
+  `.
 
 - |Efficiency| :class:`linear_model.RidgeCV` and
   :class:`linear_model.RidgeClassifierCV` now does not allocate a
@@ -567,8 +576,11 @@ Changelog
 :mod:`sklearn.utils`
 ....................
 
-- |Feature| Adds :func:`utils.estimator_html_repr` for returning a
-  HTML representation of an estimator. :pr:`14180` by `Thomas Fan`_.
+- |MajorFeature| Estimators can now be displayed with a rich html
+  representation. This can be enabled in Jupyter notebooks by setting
+  `display='diagram'` in :func:`~sklearn.set_config`. The raw html can be
+  returned by using :func:`utils.estimator_html_repr`.
+  :pr:`14180` by `Thomas Fan`_.
 
 - |Enhancement| improve error message in :func:`utils.validation.column_or_1d`.
   :pr:`15926` by :user:`Loïc Estève `.
@@ -598,13 +610,6 @@ Changelog
 - |FIX| :func:`utils.all_estimators` now only returns public estimators.
   :pr:`15380` by `Thomas Fan`_.
 
-:mod:`sklearn.cluster`
-......................
-
-- |Fix| :class:`cluster.AgglomerativeClustering` add specific error when
-  distance matrix is not square and `affinity=precomputed`.
-  :pr:`16257` by :user:`Simona Maggio `.
-
 Miscellaneous
 .............
 
diff --git a/examples/release_highlights/plot_release_highlights_0_23_0.py b/examples/release_highlights/plot_release_highlights_0_23_0.py
new file mode 100644
index 0000000000000..644e0f7747d39
--- /dev/null
+++ b/examples/release_highlights/plot_release_highlights_0_23_0.py
@@ -0,0 +1,165 @@
+# flake8: noqa
+"""
+========================================
+Release Highlights for scikit-learn 0.23
+========================================
+
+.. currentmodule:: sklearn
+
+We are pleased to announce the release of scikit-learn 0.23! Many bug fixes
+and improvements were added, as well as some new key features. We detail
+below a few of the major features of this release. **For an exhaustive list of
+all the changes**, please refer to the :ref:`release notes `.
+
+To install the latest version (with pip)::
+
+    pip install --upgrade scikit-learn
+
+or with conda::
+
+    conda install scikit-learn
+"""
+
+##############################################################################
+# Generalized Linear Models, and Poisson loss for gradient boosting
+# -----------------------------------------------------------------
+# Long-awaited Generalized Linear Models with non-normal loss functions are now
+# available. In particular, three new regressors were implemented:
+# :class:`~sklearn.linear_model.PoissonRegressor`,
+# :class:`~sklearn.linear_model.GammaRegressor`, and
+# :class:`~sklearn.linear_model.TweedieRegressor`. The Poisson regressor can be
+# used to model positive integer counts, or relative frequencies. Read more in
+# the :ref:`User Guide `. Additionally,
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` supports a new
+# 'poisson' loss as well.
+
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import PoissonRegressor
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+n_samples, n_features = 1000, 20
+rng = np.random.RandomState(0)
+X = rng.randn(n_samples, n_features)
+# positive integer target correlated with X[:, 5] with many zeros:
+y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
+glm = PoissonRegressor()
+gbdt = HistGradientBoostingRegressor(loss='poisson', learning_rate=.01)
+glm.fit(X_train, y_train)
+gbdt.fit(X_train, y_train)
+print(glm.score(X_test, y_test))
+print(gbdt.score(X_test, y_test))
+
+##############################################################################
+# Rich HTML representation for estimators
+# ---------------------------------------
+# Estimators can now be rendered in html in notebooks by enabling the
+# `display='diagram'` option. This is particularly useful to visualize
+# pipelines and composite estimators. Click on the entries to expand and see
+# details.
+from sklearn import set_config
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.compose import make_column_transformer
+from sklearn.linear_model import LogisticRegression
+set_config(display='diagram')
+
+num_proc = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
+
+cat_proc = make_pipeline(
+    SimpleImputer(strategy='constant', fill_value='missing'),
+    OneHotEncoder(handle_unknown='ignore'))
+
+preprocessor = make_column_transformer((num_proc, ('feat1', 'feat3')),
+                                       (cat_proc, ('feat0', 'feat2')))
+
+clf = make_pipeline(preprocessor, LogisticRegression())
+clf
+
+##############################################################################
+# Scalability and stability improvements to KMeans
+# ------------------------------------------------
+# The :class:`~sklearn.cluster.KMeans` estimator was entirely re-worked, and it
+# is now significantly faster and more stable. In addition, the Elkan algorithm
+# is now compatible with sparse matrices. The estimator uses OpenMP based
+# parallelism instead of relying on joblib, so the `n_jobs` parameter has no
+# effect anymore. For more details on how to control the number of threads,
+# please refer to our :ref:`parallelism` notes.
+import scipy
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+from sklearn.metrics import completeness_score
+
+rng = np.random.RandomState(0)
+X, y = make_blobs(random_state=rng)
+X = scipy.sparse.csr_matrix(X)
+X_train, X_test, _, y_test = train_test_split(X, y, random_state=rng)
+kmeans = KMeans(algorithm='elkan').fit(X_train)
+print(completeness_score(kmeans.predict(X_test), y_test))
+
+##############################################################################
+# Improvements to the histogram-based Gradient Boosting estimators
+# ----------------------------------------------------------------
+# Various improvements were made to
+# :class:`~sklearn.ensemble.HistGradientBoostingClassifier` and
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor`. On top of the
+# Poisson loss mentioned above, these estimators now support :ref:`sample
+# weights `. Also, an automatic early-stopping criterion was added:
+# early-stopping is enabled by default when the number of samples exceeds 10k.
+# Finally, users can now define :ref:`monotonic constraints
+# ` to constrain the predictions based on the variations of
+# specific features. In the following example, we construct a target that is
+# generally positively correlated with the first feature, with some noise.
+# Applying monotonic constraints allows the prediction to capture the global
+# effect of the first feature, instead of fitting the noise.
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.inspection import plot_partial_dependence
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
+
+n_samples = 500
+rng = np.random.RandomState(0)
+X = rng.randn(n_samples, 2)
+noise = rng.normal(loc=0.0, scale=0.01, size=n_samples)
+y = (5 * X[:, 0] + np.sin(10 * np.pi * X[:, 0]) - noise)
+
+gbdt_no_cst = HistGradientBoostingRegressor().fit(X, y)
+gbdt_cst = HistGradientBoostingRegressor(monotonic_cst=[1, 0]).fit(X, y)
+
+disp = plot_partial_dependence(
+    gbdt_no_cst, X, features=[0], feature_names=['feature 0'],
+    line_kw={'linewidth': 4, 'label': 'unconstrained'})
+plot_partial_dependence(gbdt_cst, X, features=[0],
+    line_kw={'linewidth': 4, 'label': 'constrained'}, ax=disp.axes_)
+disp.axes_[0, 0].plot(X[:, 0], y, 'o', alpha=.5, zorder=-1, label='samples')
+disp.axes_[0, 0].set_ylim(-3, 3); disp.axes_[0, 0].set_xlim(-1, 1)
+plt.legend()
+plt.show()
+
+##############################################################################
+# Sample-weight support for Lasso and ElasticNet
+# ----------------------------------------------
+# The two linear regressors :class:`~sklearn.linear_model.Lasso` and
+# :class:`~sklearn.linear_model.ElasticNet` now support sample weights.
+
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import make_regression
+from sklearn.linear_model import Lasso
+import numpy as np
+
+n_samples, n_features = 1000, 20
+rng = np.random.RandomState(0)
+X, y = make_regression(n_samples, n_features, random_state=rng)
+sample_weight = rng.rand(n_samples)
+X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
+    X, y, sample_weight, random_state=rng)
+reg = Lasso()
+reg.fit(X_train, y_train, sample_weight=sw_train)
+print(reg.score(X_test, y_test, sw_test))
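
A side note on the Poisson example earlier in this file: the two ``score``
values it prints are not directly comparable, since ``PoissonRegressor.score``
reports D^2 (fraction of deviance explained) while
``HistGradientBoostingRegressor.score`` reports R^2. A self-contained sketch
(not part of the patch) comparing both models with a single metric, the mean
Poisson deviance::

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import PoissonRegressor
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.metrics import mean_poisson_deviance

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 20)
    y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    glm = PoissonRegressor().fit(X_train, y_train)
    gbdt = HistGradientBoostingRegressor(loss='poisson',
                                         learning_rate=.01).fit(X_train, y_train)
    print(mean_poisson_deviance(y_test, glm.predict(X_test)))
    print(mean_poisson_deviance(y_test, gbdt.predict(X_test)))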

From c71a1c21d14fc7a98493acb1f3d315db720ca4ac Mon Sep 17 00:00:00 2001
From: Gael Varoquaux 
Date: Fri, 1 May 2020 18:27:05 +0200
Subject: [PATCH 100/125] MISC fix rst syntax (#17098)

* MISC: fix rst syntax

* DOC Adds full link

Co-authored-by: Thomas J Fan 
---
 doc/whats_new/v0.23.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index fba75d62cc380..e8a1566349f06 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -445,9 +445,9 @@ Changelog
   type and details.
   :pr:`15622` by :user:`Gregory Morse `.
 
-- |Fix| :func: `cross_val_predict` supports `method="predict_proba"`
-  when `y=None`.
-  :pr:`15918` by :user:`Luca Kubin `.
+- |Fix| :func:`model_selection.cross_val_predict` supports
+  `method="predict_proba"` when `y=None`. :pr:`15918` by
+  :user:`Luca Kubin `.
 
 - |Fix| :func:`model_selection.fit_grid_point` is deprecated in 0.23 and will
   be removed in 0.25. :pr:`16401` by
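
For reference, a minimal sketch (not from the patch) of the
``method="predict_proba"`` usage that the entry above refers to; the fix
itself concerns the ``y=None`` code path::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    X, y = load_iris(return_X_y=True)
    proba = cross_val_predict(LogisticRegression(max_iter=1000), X, y,
                              method="predict_proba")
    print(proba.shape)  # (150, 3): one row of class probabilities per sample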

From 04d2e3290f47ee7ee86767ae534629cd4112ffbf Mon Sep 17 00:00:00 2001
From: Alexandre Gramfort 
Date: Sat, 2 May 2020 10:11:07 +0200
Subject: [PATCH 101/125] Speedup MultiTaskLasso (#17021)

---
 doc/whats_new/v0.23.rst                       |   7 ++
 sklearn/linear_model/_cd_fast.pyx             | 116 ++++++++++--------
 sklearn/linear_model/_coordinate_descent.py   |  31 +++--
 .../tests/test_coordinate_descent.py          |   4 +-
 4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index e8a1566349f06..bd31752973f6a 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -404,6 +404,13 @@ Changelog
   using joblib loky backend. :pr:`14264` by
   :user:`Jérémie du Boisberranger `.
 
+- |Efficiency| Speed up :class:`linear_model.MultiTaskLasso`,
+  :class:`linear_model.MultiTaskLassoCV`, :class:`linear_model.MultiTaskElasticNet`,
+  :class:`linear_model.MultiTaskElasticNetCV` by avoiding slower
+  BLAS Level 2 calls on small arrays.
+  :pr:`17021` by :user:`Alex Gramfort ` and
+  :user:`Mathurin Massias `.
+
 :mod:`sklearn.metrics`
 ......................
 
diff --git a/sklearn/linear_model/_cd_fast.pyx b/sklearn/linear_model/_cd_fast.pyx
index fcbe46ce77711..5b47f45c2e248 100644
--- a/sklearn/linear_model/_cd_fast.pyx
+++ b/sklearn/linear_model/_cd_fast.pyx
@@ -19,7 +19,7 @@ from cython cimport floating
 import warnings
 from ..exceptions import ConvergenceWarning
 
-from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2, 
+from ..utils._cython_blas cimport (_axpy, _dot, _asum, _ger, _gemv, _nrm2,
                                    _copy, _scal)
 from ..utils._cython_blas cimport RowMajor, ColMajor, Trans, NoTrans
 
@@ -154,7 +154,7 @@ def enet_coordinate_descent(floating[::1] w,
     with nogil:
         # R = y - np.dot(X, w)
         _copy(n_samples, &y[0], 1, &R[0], 1)
-        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0], 
+        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0],
               n_samples, &w[0], 1, 1.0, &R[0], 1)
 
         # tol *= np.dot(y, y)
@@ -620,18 +620,17 @@ def enet_coordinate_descent_gram(floating[::1] w,
     return np.asarray(w), gap, tol, n_iter + 1
 
 
-def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
-                                       floating l2_reg,
-                                       np.ndarray[floating, ndim=2, mode='fortran'] X,
-                                       np.ndarray[floating, ndim=2] Y,
-                                       int max_iter, floating tol, object rng,
-                                       bint random=0):
+def enet_coordinate_descent_multi_task(
+        floating[::1, :] W, floating l1_reg, floating l2_reg,
+        np.ndarray[floating, ndim=2, mode='fortran'] X,  # TODO: use views in 0.24
+        np.ndarray[floating, ndim=2, mode='fortran'] Y,
+        int max_iter, floating tol, object rng, bint random=0):
     """Cython version of the coordinate descent algorithm
         for Elastic-Net mult-task regression
 
         We minimize
 
-        (1/2) * norm(y - X w, 2)^2 + l1_reg ||w||_21 + (1/2) * l2_reg norm(w, 2)^2
+        0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2
 
     """
 
@@ -651,11 +650,11 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     cdef floating dual_norm_XtA
 
     # initial value of the residuals
-    cdef floating[:, ::1] R = np.zeros((n_samples, n_tasks), dtype=dtype)
+    cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F')
 
-    cdef floating[:] norm_cols_X = np.zeros(n_features, dtype=dtype)
+    cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype)
     cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype)
-    cdef floating[:] w_ii = np.zeros(n_tasks, dtype=dtype)
+    cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype)
     cdef floating d_w_max
     cdef floating w_max
     cdef floating d_w_ii
@@ -675,9 +674,7 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     cdef UINT32_t* rand_r_state = &rand_r_state_seed
 
     cdef floating* X_ptr = &X[0, 0]
-    cdef floating* W_ptr = &W[0, 0]
     cdef floating* Y_ptr = &Y[0, 0]
-    cdef floating* wii_ptr = &w_ii[0]
 
     if l1_reg == 0:
         warnings.warn("Coordinate descent with l1_reg=0 may lead to unexpected"
@@ -686,15 +683,15 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
     with nogil:
         # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0)
         for ii in range(n_features):
-            for jj in range(n_samples):
-                norm_cols_X[ii] += X[jj, ii] ** 2
+            norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2
 
         # R = Y - np.dot(X, W.T)
-        for ii in range(n_samples):
+        _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1)
+        for ii in range(n_features):
             for jj in range(n_tasks):
-                R[ii, jj] = Y[ii, jj] - (
-                    _dot(n_features, X_ptr + ii, n_samples, W_ptr + jj, n_tasks)
-                    )
+                if W[jj, ii] != 0:
+                    _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                          &R[0, jj], 1)
 
         # tol = tol * linalg.norm(Y, ord='fro') ** 2
         tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2
@@ -712,42 +709,59 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                     continue
 
                 # w_ii = W[:, ii] # Store previous value
-                _copy(n_tasks, W_ptr + ii * n_tasks, 1, wii_ptr, 1)
-
-                # if np.sum(w_ii ** 2) != 0.0:  # can do better
-                if _nrm2(n_tasks, wii_ptr, 1) != 0.0:
-                    # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
-                    _ger(RowMajor, n_samples, n_tasks, 1.0,
-                         X_ptr + ii * n_samples, 1,
-                         wii_ptr, 1, &R[0, 0], n_tasks)
-
+                _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1)
+
+                # Using Numpy:
+                # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
+                # Using Blas Level2:
+                # _ger(RowMajor, n_samples, n_tasks, 1.0,
+                #      &X[0, ii], 1,
+                #      &w_ii[0], 1, &R[0, 0], n_tasks)
+                # Using Blas Level1 and for loop to avoid slower threads
+                # for such small vectors
+                for jj in range(n_tasks):
+                    if w_ii[jj] != 0:
+                        _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
+
+                # Using numpy:
                 # tmp = np.dot(X[:, ii][None, :], R).ravel()
-                _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
-                      n_tasks, X_ptr + ii * n_samples, 1, 0.0, &tmp[0], 1)
+                # Using BLAS Level 2:
+                # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
+                #       n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1,
+                                   &R[0, jj], 1)
 
                 # nn = sqrt(np.sum(tmp ** 2))
                 nn = _nrm2(n_tasks, &tmp[0], 1)
 
                 # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg)
-                _copy(n_tasks, &tmp[0], 1, W_ptr + ii * n_tasks, 1)
+                _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1)
                 _scal(n_tasks, fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg),
-                      W_ptr + ii * n_tasks, 1)
-
-                # if np.sum(W[:, ii] ** 2) != 0.0:  # can do better
-                if _nrm2(n_tasks, W_ptr + ii * n_tasks, 1) != 0.0:
-                    # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
-                    # Update residual : rank 1 update
-                    _ger(RowMajor, n_samples, n_tasks, -1.0,
-                         X_ptr + ii * n_samples, 1, W_ptr + ii * n_tasks, 1,
-                         &R[0, 0], n_tasks)
+                      &W[0, ii], 1)
+
+                # Using numpy:
+                # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
+                # Using BLAS Level 2:
+                # Update residual : rank 1 update
+                # _ger(RowMajor, n_samples, n_tasks, -1.0,
+                #      &X[0, ii], 1, &W[0, ii], 1,
+                #      &R[0, 0], n_tasks)
+                # Using BLAS Level 1 (faster for small vectors like here):
+                for jj in range(n_tasks):
+                    if W[jj, ii] != 0:
+                        _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
+                              &R[0, jj], 1)
 
                 # update the maximum absolute coefficient update
-                d_w_ii = diff_abs_max(n_tasks, W_ptr + ii * n_tasks, wii_ptr)
+                d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0])
 
                 if d_w_ii > d_w_max:
                     d_w_max = d_w_ii
 
-                W_ii_abs_max = abs_max(n_tasks, W_ptr + ii * n_tasks)
+                W_ii_abs_max = abs_max(n_tasks, &W[0, ii])
                 if W_ii_abs_max > w_max:
                     w_max = W_ii_abs_max
 
@@ -760,16 +774,14 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                 for ii in range(n_features):
                     for jj in range(n_tasks):
                         XtA[ii, jj] = _dot(
-                            n_samples, X_ptr + ii * n_samples, 1,
-                            &R[0, 0] + jj, n_tasks
+                            n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
                             ) - l2_reg * W[jj, ii]
 
                 # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
                 dual_norm_XtA = 0.0
                 for ii in range(n_features):
                     # np.sqrt(np.sum(XtA ** 2, axis=1))
-                    XtA_axis1norm = _nrm2(n_tasks,
-                                          &XtA[0, 0] + ii * n_tasks, 1)
+                    XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
                     if XtA_axis1norm > dual_norm_XtA:
                         dual_norm_XtA = XtA_axis1norm
 
@@ -777,7 +789,7 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                 # R_norm = linalg.norm(R, ord='fro')
                 # w_norm = linalg.norm(W, ord='fro')
                 R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1)
-                w_norm = _nrm2(n_features * n_tasks, W_ptr, 1)
+                w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1)
                 if (dual_norm_XtA > l1_reg):
                     const =  l1_reg / dual_norm_XtA
                     A_norm = R_norm * const
@@ -787,16 +799,12 @@ def enet_coordinate_descent_multi_task(floating[::1, :] W, floating l1_reg,
                     gap = R_norm ** 2
 
                 # ry_sum = np.sum(R * y)
-                ry_sum = 0.0
-                for ii in range(n_samples):
-                    for jj in range(n_tasks):
-                        ry_sum += R[ii, jj] * Y[ii, jj]
+                ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1)
 
                 # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum()
                 l21_norm = 0.0
                 for ii in range(n_features):
-                    # np.sqrt(np.sum(W ** 2, axis=0))
-                    l21_norm += _nrm2(n_tasks, W_ptr + n_tasks * ii, 1)
+                    l21_norm += _nrm2(n_tasks, &W[0, ii], 1)
 
                 gap += l1_reg * l21_norm - const * ry_sum + \
                      0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2)
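As a side note on this change (not part of the patch): the per-task BLAS Level 1 loops compute exactly the same quantities as the Level 2 ``_ger``/``_gemv`` calls they replace. A minimal NumPy sketch, with arbitrary sizes and data, illustrates the equivalence::

    import numpy as np

    rng = np.random.RandomState(0)
    n_samples, n_tasks = 6, 3
    R = rng.randn(n_samples, n_tasks)    # residual matrix
    x_ii = rng.randn(n_samples)          # column ii of X
    w_ii = rng.randn(n_tasks)            # coefficients of feature ii, one per task

    # rank 1 update R += outer(x_ii, w_ii): _ger vs. one _axpy per task
    R_ger = R + np.outer(x_ii, w_ii)
    R_axpy = R.copy()
    for jj in range(n_tasks):
        if w_ii[jj] != 0:
            R_axpy[:, jj] += w_ii[jj] * x_ii
    assert np.allclose(R_ger, R_axpy)

    # tmp = X[:, ii].T @ R: _gemv vs. one _dot per task
    tmp_gemv = x_ii @ R_ger
    tmp_dot = np.array([x_ii @ R_ger[:, jj] for jj in range(n_tasks)])
    assert np.allclose(tmp_gemv, tmp_dot)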
diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py
index 2d8567b04db56..3ac0d155169af 100644
--- a/sklearn/linear_model/_coordinate_descent.py
+++ b/sklearn/linear_model/_coordinate_descent.py
@@ -1733,9 +1733,9 @@ class MultiTaskElasticNet(Lasso):
 
     Where::
 
-        ||W||_21 = sum_i sqrt(sum_j w_ij ^ 2)
+        ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2)
 
-    i.e. the sum of norm of each row.
+    i.e. the sum of norms of each row.
 
     Read more in the :ref:`User Guide `.
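As a concrete illustration of the mixed norm in this docstring (a standalone sketch, not part of the patch), for a coefficient matrix ``W`` with one row per feature::

    import numpy as np

    # ||W||_21 = sum_i sqrt(sum_j W_ij ** 2), the sum of the l2 norms of the rows
    W = np.array([[3.0, 4.0],
                  [0.0, 0.0],
                  [1.0, 0.0]])
    print(np.sqrt((W ** 2).sum(axis=1)).sum())  # 5.0 + 0.0 + 1.0 = 6.0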
 
@@ -1829,8 +1829,8 @@ class MultiTaskElasticNet(Lasso):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     @_deprecate_positional_args
     def __init__(self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True,
@@ -1867,12 +1867,11 @@ def fit(self, X, y):
         To avoid memory re-allocation it is advised to allocate the
         initial data in memory directly using that format.
         """
-
         # Need to validate separately here.
         # We can't pass multi_output=True because that would allow y to be csr.
         check_X_params = dict(dtype=[np.float64, np.float32], order='F',
                               copy=self.copy_X and self.fit_intercept)
-        check_y_params = dict(ensure_2d=False)
+        check_y_params = dict(ensure_2d=False, order='F')
         X, y = self._validate_data(X, y, validate_separately=(check_X_params,
                                                               check_y_params))
         y = y.astype(X.dtype)
@@ -2000,13 +1999,13 @@ class MultiTaskLasso(MultiTaskElasticNet):
     --------
     >>> from sklearn import linear_model
     >>> clf = linear_model.MultiTaskLasso(alpha=0.1)
-    >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]])
+    >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]])
     MultiTaskLasso(alpha=0.1)
     >>> print(clf.coef_)
-    [[0.89393398 0.        ]
-     [0.89393398 0.        ]]
+    [[0.         0.60809415]
+    [0.         0.94592424]]
     >>> print(clf.intercept_)
-    [0.10606602 0.10606602]
+    [-0.41888636 -0.87382323]
 
     See also
     --------
@@ -2018,8 +2017,8 @@ class MultiTaskLasso(MultiTaskElasticNet):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     @_deprecate_positional_args
     def __init__(self, alpha=1.0, *, fit_intercept=True, normalize=False,
@@ -2196,8 +2195,8 @@ class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     path = staticmethod(enet_path)
 
@@ -2368,8 +2367,8 @@ class MultiTaskLassoCV(RegressorMixin, LinearModelCV):
     -----
     The algorithm used to fit the model is coordinate descent.
 
-    To avoid unnecessary memory duplication the X argument of the fit method
-    should be directly passed as a Fortran-contiguous numpy array.
+    To avoid unnecessary memory duplication the X and y arguments of the fit
+    method should be directly passed as Fortran-contiguous numpy arrays.
     """
     path = staticmethod(lasso_path)
 
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 142c1e9ac2a47..1b2f7c656f015 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -882,9 +882,9 @@ def test_convergence_warnings():
     X = random_state.standard_normal((1000, 500))
     y = random_state.standard_normal((1000, 3))
 
-    # check that the model fails to converge
+    # check that the model fails to converge (a negative dual gap cannot occur)
     with pytest.warns(ConvergenceWarning):
-        MultiTaskElasticNet(max_iter=1, tol=0).fit(X, y)
+        MultiTaskElasticNet(max_iter=1, tol=-1).fit(X, y)
 
     # check that the model converges w/o warnings
     with pytest.warns(None) as record:

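The updated test relies on the fact that the duality gap is never negative, so with ``tol=-1`` the stopping criterion can never be met and the solver always stops at ``max_iter`` with a ``ConvergenceWarning``. A standalone sketch of the same behaviour (illustrative only, array sizes are arbitrary)::

    import numpy as np
    import pytest
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.linear_model import MultiTaskElasticNet

    X = np.random.RandomState(0).standard_normal((100, 20))
    y = np.random.RandomState(1).standard_normal((100, 3))
    with pytest.warns(ConvergenceWarning):
        MultiTaskElasticNet(max_iter=1, tol=-1).fit(X, y)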
From 5756205920ae072f1f3f3868e4132a85a374a38e Mon Sep 17 00:00:00 2001
From: Thomas J Fan 
Date: Sun, 3 May 2020 11:24:25 -0400
Subject: [PATCH 102/125] DOC Adds release highlights to front page (#17071)

---
 doc/conf.py                             | 18 ++++++++++++++++++
 doc/templates/index.html                |  4 ++--
 doc/themes/scikit-learn-modern/nav.html |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/doc/conf.py b/doc/conf.py
index d459cdfd3f1af..74e37d01307be 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -17,6 +17,7 @@
 import warnings
 import re
 from packaging.version import parse
+from pathlib import Path
 
 # If extensions (or modules to document with autodoc) are in another
 # directory, add these directories to sys.path here. If the directory
@@ -208,6 +209,23 @@
 # If true, the reST sources are included in the HTML build as _sources/name.
 html_copy_source = True
 
+# Adds variables into templates
+html_context = {}
+# find the latest release highlights example and place it into the HTML
+# context for index.html
+release_highlights_dir = Path("..") / "examples" / "release_highlights"
+# Finds the highlight with the latest version number
+latest_highlights = sorted(release_highlights_dir.glob(
+                           "plot_release_highlights_*.py"))[-1]
+latest_highlights = latest_highlights.with_suffix('').name
+html_context["release_highlights"] = \
+    f"auto_examples/release_highlights/{latest_highlights}"
+
+# get version from highlight name assuming highlights have the form
+# plot_release_highlights_0_22_0
+highlight_version = ".".join(latest_highlights.split("_")[-3:-1])
+html_context["release_highlights_version"] = highlight_version
+
 # -- Options for LaTeX output ------------------------------------------------
 latex_elements = {
     # The paper size ('letterpaper' or 'a4paper').
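A quick sanity check of the version parsing added above, assuming highlight files follow the ``plot_release_highlights_<major>_<minor>_<micro>.py`` naming mentioned in the comment::

    name = "plot_release_highlights_0_23_0"
    print(".".join(name.split("_")[-3:-1]))  # prints "0.23"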
diff --git a/doc/templates/index.html b/doc/templates/index.html
index 8d3bdfaec2b28..367e6a3c01902 100644
--- a/doc/templates/index.html
+++ b/doc/templates/index.html
@@ -8,7 +8,7 @@
         scikit-learn
         Machine Learning in Python
         Getting Started
-        What's New in {{ release }}
+        Release Highlights for {{ release_highlights_version }}
         GitHub
@@ -160,7 +160,7 @@
       News
         March 2020. scikit-learn 0.22.2 is available for download (Changelog).
         January 2020. scikit-learn 0.22.1 is available for download (Changelog).
-        December 2019. scikit-learn 0.22 is available for download (Changelog).
+        December 2019. scikit-learn 0.22 is available for download (Changelog and Release Highlights).
         Scikit-learn from 0.21 requires Python 3.5 or greater.
diff --git a/doc/themes/scikit-learn-modern/nav.html b/doc/themes/scikit-learn-modern/nav.html
index 57c631f6cbee7..4fbd22f48a4dd 100644
--- a/doc/themes/scikit-learn-modern/nav.html
+++ b/doc/themes/scikit-learn-modern/nav.html
@@ -9,6 +9,7 @@
 {%- set drop_down_navigation = [
   ('Getting Started', pathto('getting_started')),
   ('Tutorial', pathto('tutorial/index')),
+  ("What's new", 'whats_new/v' + version + '.html'),
   ('Glossary', pathto('glossary')),
   ('Development', pathto('developers/index')),
   ('FAQ', pathto('faq')),

From 8b1b281de13138aac954c56f5be95cd7ec9ca44b Mon Sep 17 00:00:00 2001
From: Christian Kastner
Date: Sun, 3 May 2020 23:30:02 +0200
Subject: [PATCH 103/125] EXA Remove stray executable flag from example (#17116)

---
 examples/linear_model/plot_bayesian_ridge_curvefit.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 examples/linear_model/plot_bayesian_ridge_curvefit.py

diff --git a/examples/linear_model/plot_bayesian_ridge_curvefit.py b/examples/linear_model/plot_bayesian_ridge_curvefit.py
old mode 100755
new mode 100644

From 4fae53964b00ece9b32c85298289514d6d646b93 Mon Sep 17 00:00:00 2001
From: Pankaj Jindal <36332727+jindalpankaj@users.noreply.github.com>
Date: Mon, 4 May 2020 08:09:48 -0700
Subject: [PATCH 104/125] DOC Correcting an attribute's name (#17110)

---
 doc/modules/svm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index 23dc7fbf67b65..8acebc79e412e 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -90,7 +90,7 @@ After being fitted, the model can then be used to predict new values::
 SVMs decision function (detailed in the :ref:`svm_mathematical_formulation`)
 depends on some subset of the training data, called the support vectors. Some
 properties of these support vectors can be found in attributes
-``support_vectors_``, ``support_`` and ``n_support``::
+``support_vectors_``, ``support_`` and ``n_support_``::

     >>> # get support vectors
     >>> clf.support_vectors_

From a0c76ce3cdbcb87e6d61348c46cbc12486677354 Mon Sep 17 00:00:00 2001
From: Christoph Deil
Date: Mon, 4 May 2020 21:00:53 +0200
Subject: [PATCH 105/125] MNT Remove sklearn logger default StreamHandler (#16451)

* Remove sklearn logger default StreamHandler

To avoid duplicate log messages

* DOC Adds whats_new

* CLN Address comments

* MNT Remove setLevel

Co-authored-by: Thomas J Fan
---
 doc/whats_new/v0.23.rst | 6 ++++++
 sklearn/__init__.py     | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index bd31752973f6a..d60b16edce903 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -195,6 +195,12 @@ Changelog
   `ValueError` for arguments `n_classes < 1` OR `length < 1`.
   :pr:`16006` by :user:`Rushabh Vasani `.

+- |API| The `StreamHandler` was removed from `sklearn.logger` to avoid
+  double logging of messages in common cases where a handler is attached
+  to the root logger, and to follow the Python logging documentation
+  recommendation for libraries to leave the log message handling to
+  users and application code. :pr:`16451` by :user:`Christoph Deil `.
+
 :mod:`sklearn.decomposition`
 ............................
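With the default handler gone, an application that wants to see scikit-learn's log messages now configures logging itself; a minimal sketch using only the standard library (format and level chosen arbitrarily)::

    import logging

    # attach a handler at the application level; the library no longer does this
    logging.basicConfig(format="%(name)s - %(levelname)s - %(message)s",
                        level=logging.INFO)
    logging.getLogger("sklearn").setLevel(logging.INFO)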
diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 7f203a079f22b..4d942319c5eb7 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -19,8 +19,6 @@
 from ._config import get_config, set_config, config_context

 logger = logging.getLogger(__name__)
-logger.addHandler(logging.StreamHandler())
-logger.setLevel(logging.INFO)


 # PEP0440 compatible formatted version, see:

From a670bb9202396065bd093684ebb79b6753901d26 Mon Sep 17 00:00:00 2001
From: Vikas Pandey
Date: Tue, 5 May 2020 01:38:59 +0530
Subject: [PATCH 106/125] add dtreevis to related packages #17105 (#17113)

---
 doc/related_projects.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 89079971ca29a..15c35a51f9b0c 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -73,6 +73,9 @@ enhance the functionality of scikit-learn's estimators.

 **Model inspection and visualisation**

+- `dtreeviz `_ A python library for
+  decision tree visualization and model interpretation.
+
 - `eli5 `_ A library for
   debugging/inspecting machine learning models and explaining their
   predictions.

From 962849aac4b7df13cd7043646e653ccec5021f3b Mon Sep 17 00:00:00 2001
From: Abo7atm <33042538+Abo7atm@users.noreply.github.com>
Date: Tue, 5 May 2020 01:15:13 +0300
Subject: [PATCH 107/125] DOC Add tslearn to related projects (#17109)

---
 doc/related_projects.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 15c35a51f9b0c..17e33ed691eb5 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -121,6 +121,9 @@ and tasks.

 **Structured learning**

+- `tslearn `_ A machine learning library for time series
+  that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression.
+
 - `sktime `_ A scikit-learn compatible toolbox for machine learning
   with time series including time series classification/regression and (supervised/panel) forecasting.

 - `Seqlearn `_ Sequence classification
   using HMMs or structured perceptron.

From 2cad437fc940bfe6fbb7a30b1c4e82d4f670fa25 Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Mon, 4 May 2020 18:19:35 -0400
Subject: [PATCH 108/125] STY Adjust line height of code blocks (#17094)

---
 doc/themes/scikit-learn-modern/static/css/theme.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/themes/scikit-learn-modern/static/css/theme.css b/doc/themes/scikit-learn-modern/static/css/theme.css
index 2b80d6fe2b762..ceda27c6de093 100644
--- a/doc/themes/scikit-learn-modern/static/css/theme.css
+++ b/doc/themes/scikit-learn-modern/static/css/theme.css
@@ -90,7 +90,7 @@ div.highlight {

 div.highlight pre {
   margin-bottom: 0;
-  line-height: 1rem;
+  line-height: 1.2rem;
 }

 div.highlight a {

From bd3fb2af30564bf918e506aed5a28c730e9da0e8 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Tue, 5 May 2020 09:01:09 +0200
Subject: [PATCH 109/125] DOC Remove unmaintained related projects (#17112)

---
 doc/related_projects.rst | 75 ++++------------------------------------
 1 file changed, 7 insertions(+), 68 deletions(-)

diff --git a/doc/related_projects.rst b/doc/related_projects.rst
index 17e33ed691eb5..825498d95ce92 100644
--- a/doc/related_projects.rst
+++ b/doc/related_projects.rst
@@ -23,17 +23,12 @@ enhance the functionality of scikit-learn's estimators.

 - `sklearn_pandas `_ bridge for scikit-learn
   pipelines and pandas data frame with dedicated transformers.
- 
+
 - `sklearn_xarray `_ provides compatibility of scikit-learn
   estimators with xarray data structures.

 **Auto-ML**

-- `auto_ml `_
-  Automated machine learning for production and analytics, built on scikit-learn
-  and related projects. Trains a pipeline wth all the standard machine learning
-  steps. Tuned for prediction speed and ease of transfer to production environments.
-
 - `auto-sklearn `_
   An automated machine learning toolkit and a drop-in replacement for a
   scikit-learn estimator
@@ -55,22 +50,11 @@ enhance the functionality of scikit-learn's estimators.
 - `REP `_ Environment for conducting data-driven
   research in a consistent and reproducible way

-- `ML Frontend `_ provides
-  dataset management and SVM fitting/prediction through
-  `web-based `_
-  and `programmatic `_
-  interfaces.
-
 - `Scikit-Learn Laboratory `_ A command-line
   wrapper around scikit-learn that makes it easy to run machine learning
   experiments with multiple learners and large feature sets.

-- `Xcessiv `_ is a notebook-like
-  application for quick, scalable, and automated hyperparameter tuning
-  and stacked ensembling. Provides a framework for keeping track of
-  model-hyperparameter combinations.
-
 **Model inspection and visualisation**

 - `dtreeviz `_ A python library for
@@ -83,9 +67,6 @@ enhance the functionality of scikit-learn's estimators.

 - `mlxtend `_ Includes model visualization
   utilities.

-- `scikit-plot `_ A visualization library
-  for quick and easy generation of common plots in data analysis and machine learning.
-
 - `yellowbrick `_ A suite of
   custom matplotlib visualizers for scikit-learn estimators to support visual feature
   analysis, model selection, evaluation, and diagnostics.
@@ -105,11 +86,6 @@ enhance the functionality of scikit-learn's estimators.
 - `sklearn-porter `_
   Transpile trained scikit-learn models to C, Java, Javascript and others.

-- `sklearn-compiledtrees `_
-  Generate a C++ implementation of the predict function for decision trees (and
-  ensembles) trained by sklearn. Useful for latency-sensitive production
-  environments.
-
 Other estimators and tasks
 --------------------------
@@ -126,9 +102,6 @@ and tasks.
 - `sktime `_ A scikit-learn compatible toolbox for machine learning
   with time series including time series classification/regression and (supervised/panel) forecasting.

-- `Seqlearn `_ Sequence classification
-  using HMMs or structured perceptron.
-
 - `HMMLearn `_ Implementation of hidden
   markov models that was previously part of scikit-learn.
@@ -145,12 +118,6 @@ and tasks.

 **Deep neural networks etc.**

-- `pylearn2 `_ A deep learning and
-  neural network library build on theano with scikit-learn like interface.
-
-- `sklearn_theano `_ scikit-learn compatible
-  estimators, transformers, and datasets which use Theano internally
-
 - `nolearn `_ A number of wrappers and
   abstractions around existing neural network libraries
@@ -159,8 +126,8 @@ and tasks.
 - `lasagne `_ A lightweight library to
   build and train neural networks in Theano.
- 
-- `skorch `_ A scikit-learn compatible
+
+- `skorch `_ A scikit-learn compatible
   neural network library that wraps PyTorch.

 **Broad scope**
@@ -168,9 +135,6 @@ and tasks.
 - `mlxtend `_ Includes a number of additional
   estimators as well as model visualization utilities.

-- `sparkit-learn `_ Scikit-learn
-  API and functionality for PySpark's distributed modelling.
-
 **Other regression and classification**

 - `xgboost `_ Optimised gradient boosted decision
@@ -193,18 +157,15 @@ and tasks.
 - `gplearn `_ Genetic Programming
   for symbolic regression tasks.

-- `multiisotonic `_ Isotonic
-  regression on multidimensional features.
-
-- `scikit-multilearn `_ Multi-label classification with
-  focus on label space manipulation.
+- `scikit-multilearn `_
+  Multi-label classification with focus on label space manipulation.

-- `seglearn `_ Time series and sequence
+- `seglearn `_ Time series and sequence
   learning using sliding window segmentation.

 **Decomposition and clustering**

-- `lda `_: Fast implementation of latent
+- `lda `_: Fast implementation of latent
   Dirichlet allocation in Cython which uses `Gibbs sampling
   `_ to sample from the true posterior distribution.
   (scikit-learn's `_ to sample from a tractable approximation of a topic
   model's posterior distribution.)

-- `Sparse Filtering `_
-  Unsupervised feature learning based on sparse-filtering
-
 - `kmodes `_ k-modes clustering algorithm for
   categorical data, and several of its variations.
@@ -243,9 +201,6 @@ Other packages useful for data analysis and machine learning.
 - `Pandas `_ Tools for working with heterogeneous and
   columnar data, relational queries, time series and basic statistics.

-- `theano `_ A CPU/GPU array
-  processing framework geared towards deep learning research.
-
 - `statsmodels `_ Estimating and analysing
   statistical models. More focused on statistical tests and less on prediction
   than scikit-learn.
@@ -259,17 +214,9 @@ Other packages useful for data analysis and machine learning.
 - `Seaborn `_ Visualization library based on matplotlib.
   It provides a high-level interface for drawing attractive statistical graphics.

-- `Deep Learning `_ A curated list of deep learning
-  software libraries.
-
 Recommendation Engine packages
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- `GraphLab `_
-  Implementation of classical recommendation techniques (in C++, with
-  Python bindings).
-
 - `implicit `_, Library for implicit feedback datasets.
@@ -303,11 +250,3 @@ Domain specific packages

 - `MSMBuilder `_ Machine learning for protein
   conformational dynamics time series.
-
-- `scikit-surprise `_ A scikit for building and
-  evaluating recommender systems.
-
-Snippets and tidbits
---------------------
-
-The `wiki `_ has more!

From b2b88d3ee47cfb2f4b947751ec141d6107717243 Mon Sep 17 00:00:00 2001
From: Adrin Jalali
Date: Tue, 5 May 2020 09:07:25 +0200
Subject: [PATCH 110/125] MNT bump master version to 0.24.dev0 (#17121)

---
 sklearn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/__init__.py b/sklearn/__init__.py
index 4d942319c5eb7..870d0d9a93f0d 100644
--- a/sklearn/__init__.py
+++ b/sklearn/__init__.py
@@ -37,7 +37,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = '0.23.dev0'
+__version__ = '0.24.dev0'


 # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded

From f23b940dcabdc86b8b71dc8a9a90ef91505407cc Mon Sep 17 00:00:00 2001
From: Thomas J Fan
Date: Tue, 5 May 2020 03:58:10 -0400
Subject: [PATCH 111/125] FIX Adjusts html_repr based on configuration (#17093)

* ENH Adjusts html_repr based on configuration

* CLN Returns None instead

* CLN Uses property hack

* CLN Address comments
---
 sklearn/base.py            | 19 ++++++++++++++++++-
 sklearn/tests/test_base.py | 12 ++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/sklearn/base.py b/sklearn/base.py
index 666574b491594..77c3223ed75e1 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -437,8 +437,25 @@ def _validate_data(self, X, y=None, reset=True,

         return out

+    @property
     def _repr_html_(self):
-        """HTML representation of estimator"""
+        """HTML representation of estimator.
+
+        This is redundant with the logic of `_repr_mimebundle_`. The latter
+        should be favored in the long term; `_repr_html_` is only
+        implemented for consumers who do not interpret `_repr_mimebundle_`.
+        """
+        if get_config()["display"] != 'diagram':
+            raise AttributeError("_repr_html_ is only defined when the "
+                                 "'display' configuration option is set to "
+                                 "'diagram'")
+        return self._repr_html_inner
+
+    def _repr_html_inner(self):
+        """This function is returned by the @property `_repr_html_` to make
+        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
+        on `get_config()["display"]`.
+        """
         return estimator_html_repr(self)

     def _repr_mimebundle_(self, **kwargs):

diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index e20fa440d1933..db5c88051346a 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -525,3 +525,15 @@ def test_repr_mimebundle_():
     output = tree._repr_mimebundle_()
     assert "text/plain" in output
     assert "text/html" in output
+
+
+def test_repr_html_wraps():
+    # Checks the display configuration flag controls the html output
+    tree = DecisionTreeClassifier()
+    msg = "_repr_html_ is only defined when"
+    with pytest.raises(AttributeError, match=msg):
+        output = tree._repr_html_()
+
+    with config_context(display='diagram'):
+        output = tree._repr_html_()
+        assert "