From 5a23a850fd061df0c51b8b7917f8589133917ffe Mon Sep 17 00:00:00 2001
From: Jordan Silke <51223540+jsilke@users.noreply.github.com>
Date: Fri, 8 Apr 2022 12:27:07 -0400
Subject: [PATCH] DOC modify plot_scalable_poly_kernels.py format (#23009)

Co-authored-by: Guillaume Lemaitre
---
 .../plot_scalable_poly_kernels.py             | 55 +++++++++++++------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/examples/kernel_approximation/plot_scalable_poly_kernels.py b/examples/kernel_approximation/plot_scalable_poly_kernels.py
index e1ad883dd6517..ade27e16e349a 100644
--- a/examples/kernel_approximation/plot_scalable_poly_kernels.py
+++ b/examples/kernel_approximation/plot_scalable_poly_kernels.py
@@ -24,16 +24,10 @@
 # Author: Daniel Lopez-Sanchez
 # License: BSD 3 clause
 
-import matplotlib.pyplot as plt
-from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import MinMaxScaler, Normalizer
-from sklearn.svm import LinearSVC
-from sklearn.kernel_approximation import PolynomialCountSketch
-from sklearn.pipeline import Pipeline, make_pipeline
-import time
-
 # %%
+# Preparing the data
+# ------------------
+#
 # Load the Covtype dataset, which contains 581,012 samples
 # with 54 features each, distributed among 6 classes. The goal of this dataset
 # is to predict forest cover type from cartographic variables only
@@ -41,34 +35,53 @@
 # classification problem to match the version of the dataset in the
 # LIBSVM webpage [2], which was the one used in [1].
 
+from sklearn.datasets import fetch_covtype
+
 X, y = fetch_covtype(return_X_y=True)
 
 y[y != 2] = 0
 y[y == 2] = 1  # We will try to separate class 2 from the other 6 classes.
 
 # %%
+# Partitioning the data
+# ---------------------
+#
 # Here we select 5,000 samples for training and 10,000 for testing.
 # To actually reproduce the results in the original Tensor Sketch paper,
 # select 100,000 for training.
 
+from sklearn.model_selection import train_test_split
+
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, train_size=5_000, test_size=10_000, random_state=42
 )
 
 # %%
+# Feature normalization
+# ---------------------
+#
 # Now scale features to the range [0, 1] to match the format of the dataset in
 # the LIBSVM webpage, and then normalize to unit length as done in the
 # original Tensor Sketch paper [1].
 
+from sklearn.preprocessing import MinMaxScaler, Normalizer
+from sklearn.pipeline import make_pipeline
+
 mm = make_pipeline(MinMaxScaler(), Normalizer())
 X_train = mm.fit_transform(X_train)
 X_test = mm.transform(X_test)
 
 # %%
+# Establishing a baseline model
+# -----------------------------
+#
 # As a baseline, train a linear SVM on the original features and print the
 # accuracy. We also measure and store accuracies and training times to
 # plot them later.
 
+import time
+from sklearn.svm import LinearSVC
+
 results = {}
 
 lsvm = LinearSVC()
@@ -81,6 +94,9 @@
 print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")
 
 # %%
+# Establishing the kernel approximation model
+# -------------------------------------------
+#
 # Then we train linear SVMs on the features generated by
 # :class:`PolynomialCountSketch` with different values for `n_components`,
 # showing that these kernel feature approximations improve the accuracy
@@ -98,6 +114,8 @@
 # (`n_runs` = 1) in this example, in practice one should repeat the experiment several
 # times to compensate for the stochastic nature of :class:`PolynomialCountSketch`.
 
+from sklearn.kernel_approximation import PolynomialCountSketch
+
 n_runs = 1
 N_COMPONENTS = [250, 500, 1000, 2000]
 
@@ -107,14 +125,9 @@
     ps_lsvm_score = 0
     for _ in range(n_runs):
 
-        pipeline = Pipeline(
-            steps=[
-                (
-                    "kernel_approximator",
-                    PolynomialCountSketch(n_components=n_components, degree=4),
-                ),
-                ("linear_classifier", LinearSVC()),
-            ]
+        pipeline = make_pipeline(
+            PolynomialCountSketch(n_components=n_components, degree=4),
+            LinearSVC(),
         )
 
         start = time.time()
@@ -135,6 +148,9 @@
 )
 
 # %%
+# Establishing the kernelized SVM model
+# -------------------------------------
+#
 # Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
 # is approximating the performance of the kernel. This, of course, may take
 # some time, as the SVC class has a relatively poor scalability. This is the
@@ -153,11 +169,16 @@
 print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")
 
 # %%
+# Comparing the results
+# ---------------------
+#
 # Finally, plot the results of the different methods against their training
 # times. As we can see, the kernelized SVM achieves a higher accuracy,
 # but its training time is much larger and, most importantly, will grow
 # much faster if the number of training samples increases.
 
+import matplotlib.pyplot as plt
+
 fig, ax = plt.subplots(figsize=(7, 7))
 ax.scatter(
     [
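
A minimal sketch (not part of the patch) of the Pipeline/make_pipeline
equivalence the refactored hunk relies on, assuming scikit-learn is installed;
the estimator settings mirror the example but n_components=250 here is just one
of the values the script sweeps over:

    # make_pipeline() builds the same two-step estimator as Pipeline(steps=[...]),
    # but derives the step names from the lowercased class names, so the manual
    # labels "kernel_approximator" and "linear_classifier" can be dropped.
    from sklearn.kernel_approximation import PolynomialCountSketch
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.svm import LinearSVC

    explicit = Pipeline(
        steps=[
            ("kernel_approximator", PolynomialCountSketch(n_components=250, degree=4)),
            ("linear_classifier", LinearSVC()),
        ]
    )
    concise = make_pipeline(
        PolynomialCountSketch(n_components=250, degree=4),
        LinearSVC(),
    )

    # The step names are auto-generated from the class names:
    print(list(concise.named_steps))  # ['polynomialcountsketch', 'linearsvc']

Both objects fit and predict identically; only the step names differ, which is
why the hunk can drop the explicit labels without changing the example's output.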