From 718825d02783730f7bee4a7f3633e7815794df61 Mon Sep 17 00:00:00 2001
From: trevorstephens
Date: Sun, 24 May 2015 11:29:39 -0700
Subject: [PATCH] add class_weight to PA cls, remove from PA reg

rebase on top of #4347

improve error message
---
 doc/whats_new.rst                          |  6 +-
 sklearn/linear_model/passive_aggressive.py | 30 ++++++--
 .../tests/test_passive_aggressive.py       | 72 +++++++++++++++++++
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 3ab7946e4f05a..bd776daf999e3 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -61,6 +61,10 @@ Enhancements
      option, which has a simpler formulation and interpretation. By Hanna Wallach
      and `Andreas Müller`_.
 
+   - Add ``class_weight`` parameter to automatically weight samples by class
+     frequency for :class:`linear_model.PassiveAggressiveClassifier`. By
+     `Trevor Stephens`_.
+
    - Added backlinks from the API reference pages to the user guide. By
      `Andreas Müller`_.
 
@@ -572,7 +576,7 @@ API changes summary
     - The ``shuffle`` option of :class:`.linear_model.SGDClassifier`,
       :class:`linear_model.SGDRegressor`, :class:`linear_model.Perceptron`,
-      :class:`linear_model.PassiveAgressiveClassivier` and
+      :class:`linear_model.PassiveAgressiveClassifier` and
       :class:`linear_model.PassiveAgressiveRegressor` now defaults to ``True``.
 
     - :class:`cluster.DBSCAN` now uses a deterministic initialization. The
diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py
index fb96a1823b9d1..19d50feca3bc7 100644
--- a/sklearn/linear_model/passive_aggressive.py
+++ b/sklearn/linear_model/passive_aggressive.py
@@ -49,6 +49,16 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
         When set to True, reuse the solution of the previous call to fit as
         initialization, otherwise, just erase the previous solution.
 
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
+        Preset for the class_weight fit parameter.
+
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``.
+
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
             n_features]
@@ -71,9 +81,9 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
     K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)
 
     """
-    def __init__(self, C=1.0, fit_intercept=True,
-                 n_iter=5, shuffle=True, verbose=0, loss="hinge",
-                 n_jobs=1, random_state=None, warm_start=False):
+    def __init__(self, C=1.0, fit_intercept=True, n_iter=5, shuffle=True,
+                 verbose=0, loss="hinge", n_jobs=1, random_state=None,
+                 warm_start=False, class_weight=None):
         BaseSGDClassifier.__init__(self,
                                    penalty=None,
                                    fit_intercept=fit_intercept,
@@ -83,6 +93,7 @@ def __init__(self, C=1.0, fit_intercept=True,
                                    random_state=random_state,
                                    eta0=1.0,
                                    warm_start=warm_start,
+                                   class_weight=class_weight,
                                    n_jobs=n_jobs)
         self.C = C
         self.loss = loss
@@ -110,6 +121,16 @@ def partial_fit(self, X, y, classes=None):
         -------
         self : returns an instance of self.
         """
+        if self.class_weight == 'balanced':
+            raise ValueError("class_weight 'balanced' is not supported for "
+                             "partial_fit. In order to use 'balanced' "
+                             "weights, from the sklearn.utils module use "
+                             "compute_class_weight('balanced', classes, y). "
" + "In place of y you can us a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.") lr = "pa1" if self.loss == "hinge" else "pa2" return self._partial_fit(X, y, alpha=1.0, C=self.C, loss="hinge", learning_rate=lr, n_iter=1, @@ -209,8 +230,7 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): """ def __init__(self, C=1.0, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, loss="epsilon_insensitive", - epsilon=DEFAULT_EPSILON, random_state=None, class_weight=None, - warm_start=False): + epsilon=DEFAULT_EPSILON, random_state=None, warm_start=False): BaseSGDRegressor.__init__(self, penalty=None, l1_ratio=0, diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 55d4fb23cfadb..ec8d4a8f7c1dd 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -4,6 +4,7 @@ from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises from sklearn.base import ClassifierMixin @@ -125,6 +126,77 @@ def test_classifier_undefined_methods(): assert_raises(AttributeError, lambda x: getattr(clf, x), meth) +def test_class_weights(): + # Test class weights. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(C=0.1, n_iter=100, class_weight=None, + random_state=100) + clf.fit(X2, y2) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = PassiveAggressiveClassifier(C=0.1, n_iter=100, + class_weight={1: 0.001}, + random_state=100) + clf.fit(X2, y2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +def test_partial_fit_weight_class_balanced(): + # partial_fit with class_weight='balanced' not supported + clf = PassiveAggressiveClassifier(class_weight="balanced") + assert_raises(ValueError, clf.partial_fit, X, y, classes=np.unique(y)) + + +def test_equal_class_weight(): + X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] + y2 = [0, 0, 1, 1] + clf = PassiveAggressiveClassifier(C=0.1, n_iter=1000, class_weight=None) + clf.fit(X2, y2) + + # Already balanced, so "balanced" weights should have no effect + clf_balanced = PassiveAggressiveClassifier(C=0.1, n_iter=1000, + class_weight="balanced") + clf_balanced.fit(X2, y2) + + clf_weighted = PassiveAggressiveClassifier(C=0.1, n_iter=1000, + class_weight={0: 0.5, 1: 0.5}) + clf_weighted.fit(X2, y2) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2) + + +def test_wrong_class_weight_label(): + # ValueError due to wrong class_weight label. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight={0: 0.5}) + assert_raises(ValueError, clf.fit, X2, y2) + + +def test_wrong_class_weight_format(): + # ValueError due to wrong class_weight argument type. 
+ X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight=[0.5]) + assert_raises(ValueError, clf.fit, X2, y2) + + clf = PassiveAggressiveClassifier(class_weight="the larch") + assert_raises(ValueError, clf.fit, X2, y2) + + def test_regressor_mse(): y_bin = y.copy() y_bin[y != 1] = -1
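
For context, the workaround that the new partial_fit error message points users to can be sketched as follows. This is a minimal illustration only, not part of the patch: the toy data, the batch splitting, and all variable names are assumed for the example; only compute_class_weight (as suggested by the error message) and PassiveAggressiveClassifier come from scikit-learn.

    # Sketch (assumed example): precompute "balanced" class weights once with
    # compute_class_weight, then pass the resulting dict as class_weight and
    # stream the data through partial_fit.
    import numpy as np

    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils import compute_class_weight

    rng = np.random.RandomState(0)

    # Toy imbalanced data (illustrative only): 90 samples of class 0, 10 of class 1.
    X = rng.randn(100, 5)
    y = np.array([0] * 90 + [1] * 10)
    classes = np.unique(y)

    # Equivalent of class_weight="balanced":
    # n_samples / (n_classes * np.bincount(y)), i.e. roughly {0: 0.56, 1: 5.0} here.
    weights = compute_class_weight("balanced", classes=classes, y=y)
    class_weight = dict(zip(classes, weights))

    clf = PassiveAggressiveClassifier(class_weight=class_weight)

    # Stream the data in mini-batches; 'classes' must be supplied on the first call.
    for batch in np.array_split(rng.permutation(len(y)), 5):
        clf.partial_fit(X[batch], y[batch], classes=classes)

In practice the weights would be computed from a sample of the full training target that is large enough to estimate the class frequencies, exactly as the error message describes.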