
DOC modify plot_scalable_poly_kernels.py format (#23009)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
jsilke and glemaitre committed Apr 8, 2022
1 parent a4bdf3c commit 5a23a85
Showing 1 changed file with 38 additions and 17 deletions.
55 changes: 38 additions & 17 deletions examples/kernel_approximation/plot_scalable_poly_kernels.py
@@ -24,51 +24,64 @@
# Author: Daniel Lopez-Sanchez <lope@usal.es>
# License: BSD 3 clause

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.pipeline import Pipeline, make_pipeline
import time

# %%
# Preparing the data
# ------------------
#
# Load the Covtype dataset, which contains 581,012 samples
# with 54 features each, distributed among 7 classes. The goal of this dataset
# is to predict forest cover type from cartographic variables only
# (no remotely sensed data). After loading, we transform it into a binary
# classification problem to match the version of the dataset in the
# LIBSVM webpage [2], which was the one used in [1].

from sklearn.datasets import fetch_covtype

X, y = fetch_covtype(return_X_y=True)

y[y != 2] = 0
y[y == 2] = 1 # We will try to separate class 2 from the other 6 classes.
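
# %%
# As a quick sanity check (our addition, not part of the original example), we
# can confirm that the relabeled target is binary and roughly balanced: class 2
# ("Lodgepole Pine") accounts for close to half of the samples.

import numpy as np

classes, counts = np.unique(y, return_counts=True)
print(classes, counts / counts.sum())  # expected: [0 1] with shares near 0.5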

# %%
# Partitioning the data
# ---------------------
#
# Here we select 5,000 samples for training and 10,000 for testing.
# To reproduce the results in the original Tensor Sketch paper [1],
# select 100,000 samples for training.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=5_000, test_size=10_000, random_state=42
)
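
# %%
# To run the paper-scale variant mentioned above, only the split needs to
# change (a sketch of that configuration, left commented out here because of
# its runtime):
#
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, train_size=100_000, test_size=10_000, random_state=42
#     )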

# %%
# Feature normalization
# ---------------------
#
# Now scale features to the range [0, 1] to match the format of the dataset in
# the LIBSVM webpage, and then normalize to unit length as done in the
# original Tensor Sketch paper [1].

from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.pipeline import make_pipeline

mm = make_pipeline(MinMaxScaler(), Normalizer())
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)
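
# %%
# A brief check of our own: after ``Normalizer``, every sample should have
# unit Euclidean norm, which keeps the polynomial kernel values bounded.

import numpy as np

print(np.allclose(np.linalg.norm(X_train, axis=1), 1.0))  # expected: True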

# %%
# Establishing a baseline model
# -----------------------------
#
# As a baseline, train a linear SVM on the original features and print the
# accuracy. We also measure and store accuracies and training times to
# plot them later.

import time
from sklearn.svm import LinearSVC

results = {}

lsvm = LinearSVC()
@@ -81,6 +94,9 @@
print(f"Linear SVM score on raw features: {lsvm_score:.2f}%")

# %%
# Establishing the kernel approximation model
# -------------------------------------------
#
# Then we train linear SVMs on the features generated by
# :class:`PolynomialCountSketch` with different values for `n_components`,
# showing that these kernel feature approximations improve the accuracy
@@ -98,6 +114,8 @@
# (`n_runs` = 1) in this example; in practice, one should repeat the experiment several
# times to compensate for the stochastic nature of :class:`PolynomialCountSketch`.
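
# %%
# Before running the experiment, it may help to see how closely the sketch
# approximates the exact polynomial kernel on a small subsample. This check is
# our addition; the subsample size and ``n_components`` below are arbitrary
# illustrative choices, not values from the example.

import numpy as np
from sklearn.kernel_approximation import PolynomialCountSketch
from sklearn.metrics.pairwise import polynomial_kernel

X_sub = X_train[:200]

# Exact degree-4 polynomial kernel, matching PolynomialCountSketch's
# defaults of gamma=1.0 and coef0=0.
K_exact = polynomial_kernel(X_sub, degree=4, gamma=1.0, coef0=0)

ps = PolynomialCountSketch(degree=4, n_components=1000, random_state=42)
Z = ps.fit_transform(X_sub)
K_approx = Z @ Z.T  # inner products in sketch space approximate the kernel

rel_err = np.linalg.norm(K_approx - K_exact) / np.linalg.norm(K_exact)
print(f"Relative Frobenius error of the sketch: {rel_err:.3f}")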

from sklearn.kernel_approximation import PolynomialCountSketch

n_runs = 1
N_COMPONENTS = [250, 500, 1000, 2000]

@@ -107,14 +125,9 @@
ps_lsvm_score = 0
for _ in range(n_runs):

pipeline = Pipeline(
steps=[
(
"kernel_approximator",
PolynomialCountSketch(n_components=n_components, degree=4),
),
("linear_classifier", LinearSVC()),
]
pipeline = make_pipeline(
PolynomialCountSketch(n_components=n_components, degree=4),
LinearSVC(),
)

start = time.time()
@@ -135,6 +148,9 @@
)

# %%
# Establishing the kernelized SVM model
# -------------------------------------
#
# Train a kernelized SVM to see how well :class:`PolynomialCountSketch`
# is approximating the performance of the kernel. This, of course, may take
# some time, as the SVC class has relatively poor scalability. This is the
@@ -153,11 +169,16 @@
print(f"Kernel-SVM score on raw features: {ksvm_score:.2f}%")

# %%
# Comparing the results
# ---------------------
#
# Finally, plot the results of the different methods against their training
# times. As we can see, the kernelized SVM achieves a higher accuracy,
# but its training time is much larger and, most importantly, will grow
# much faster as the number of training samples increases.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(
[
