From 718825d02783730f7bee4a7f3633e7815794df61 Mon Sep 17 00:00:00 2001
From: trevorstephens
Date: Sun, 24 May 2015 11:29:39 -0700
Subject: [PATCH] add class_weight to PA cls, remove from PA reg

rebase on top of #4347

improve error message
---
 doc/whats_new.rst                          |  6 +-
 sklearn/linear_model/passive_aggressive.py | 30 ++++++--
 .../tests/test_passive_aggressive.py       | 72 +++++++++++++++++++
 3 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 3ab7946e4f05a..bd776daf999e3 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -61,6 +61,10 @@ Enhancements
      option, which has a simpler formulation and interpretation. By Hanna Wallach
      and `Andreas Müller`_.
 
+   - Add ``class_weight`` parameter to automatically weight samples by class
+     frequency for :class:`linear_model.PassiveAggressiveClassifier`. By
+     `Trevor Stephens`_.
+
    - Added backlinks from the API reference pages to the user guide. By
      `Andreas Müller`_.
 
@@ -572,7 +576,7 @@ API changes summary
     - The ``shuffle`` option of :class:`.linear_model.SGDClassifier`,
       :class:`linear_model.SGDRegressor`, :class:`linear_model.Perceptron`,
-      :class:`linear_model.PassiveAgressiveClassivier` and
+      :class:`linear_model.PassiveAgressiveClassifier` and
       :class:`linear_model.PassiveAgressiveRegressor` now defaults to ``True``.
 
     - :class:`cluster.DBSCAN` now uses a deterministic initialization. The
diff --git a/sklearn/linear_model/passive_aggressive.py b/sklearn/linear_model/passive_aggressive.py
index fb96a1823b9d1..19d50feca3bc7 100644
--- a/sklearn/linear_model/passive_aggressive.py
+++ b/sklearn/linear_model/passive_aggressive.py
@@ -49,6 +49,16 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
         When set to True, reuse the solution of the previous call to fit as
         initialization, otherwise, just erase the previous solution.
 
+    class_weight : dict, {class_label: weight} or "balanced" or None, optional
+        Preset for the class_weight fit parameter.
+
+        Weights associated with classes. If not given, all classes
+        are supposed to have weight one.
+
+        The "balanced" mode uses the values of y to automatically adjust
+        weights inversely proportional to class frequencies in the input data
+        as ``n_samples / (n_classes * np.bincount(y))``.
+
     Attributes
     ----------
     coef_ : array, shape = [1, n_features] if n_classes == 2 else [n_classes,\
             n_features]
@@ -71,9 +81,9 @@ class PassiveAggressiveClassifier(BaseSGDClassifier):
     K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)
 
     """
-    def __init__(self, C=1.0, fit_intercept=True,
-                 n_iter=5, shuffle=True, verbose=0, loss="hinge",
-                 n_jobs=1, random_state=None, warm_start=False):
+    def __init__(self, C=1.0, fit_intercept=True, n_iter=5, shuffle=True,
+                 verbose=0, loss="hinge", n_jobs=1, random_state=None,
+                 warm_start=False, class_weight=None):
         BaseSGDClassifier.__init__(self,
                                    penalty=None,
                                    fit_intercept=fit_intercept,
@@ -83,6 +93,7 @@ def __init__(self, C=1.0, fit_intercept=True,
                                    random_state=random_state,
                                    eta0=1.0,
                                    warm_start=warm_start,
+                                   class_weight=class_weight,
                                    n_jobs=n_jobs)
         self.C = C
         self.loss = loss
@@ -110,6 +121,16 @@ def partial_fit(self, X, y, classes=None):
         -------
         self : returns an instance of self.
         """
+        if self.class_weight == 'balanced':
+            raise ValueError("class_weight 'balanced' is not supported for "
+                             "partial_fit. In order to use 'balanced' "
+                             "weights, from the sklearn.utils module use "
+                             "compute_class_weight('balanced', classes, y). "
" + "In place of y you can us a large enough sample " + "of the full training set target to properly " + "estimate the class frequency distributions. " + "Pass the resulting weights as the class_weight " + "parameter.") lr = "pa1" if self.loss == "hinge" else "pa2" return self._partial_fit(X, y, alpha=1.0, C=self.C, loss="hinge", learning_rate=lr, n_iter=1, @@ -209,8 +230,7 @@ class PassiveAggressiveRegressor(BaseSGDRegressor): """ def __init__(self, C=1.0, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, loss="epsilon_insensitive", - epsilon=DEFAULT_EPSILON, random_state=None, class_weight=None, - warm_start=False): + epsilon=DEFAULT_EPSILON, random_state=None, warm_start=False): BaseSGDRegressor.__init__(self, penalty=None, l1_ratio=0, diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 55d4fb23cfadb..ec8d4a8f7c1dd 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -4,6 +4,7 @@ from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal, assert_array_equal +from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises from sklearn.base import ClassifierMixin @@ -125,6 +126,77 @@ def test_classifier_undefined_methods(): assert_raises(AttributeError, lambda x: getattr(clf, x), meth) +def test_class_weights(): + # Test class weights. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(C=0.1, n_iter=100, class_weight=None, + random_state=100) + clf.fit(X2, y2) + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) + + # we give a small weights to class 1 + clf = PassiveAggressiveClassifier(C=0.1, n_iter=100, + class_weight={1: 0.001}, + random_state=100) + clf.fit(X2, y2) + + # now the hyperplane should rotate clock-wise and + # the prediction on this point should shift + assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) + + +def test_partial_fit_weight_class_balanced(): + # partial_fit with class_weight='balanced' not supported + clf = PassiveAggressiveClassifier(class_weight="balanced") + assert_raises(ValueError, clf.partial_fit, X, y, classes=np.unique(y)) + + +def test_equal_class_weight(): + X2 = [[1, 0], [1, 0], [0, 1], [0, 1]] + y2 = [0, 0, 1, 1] + clf = PassiveAggressiveClassifier(C=0.1, n_iter=1000, class_weight=None) + clf.fit(X2, y2) + + # Already balanced, so "balanced" weights should have no effect + clf_balanced = PassiveAggressiveClassifier(C=0.1, n_iter=1000, + class_weight="balanced") + clf_balanced.fit(X2, y2) + + clf_weighted = PassiveAggressiveClassifier(C=0.1, n_iter=1000, + class_weight={0: 0.5, 1: 0.5}) + clf_weighted.fit(X2, y2) + + # should be similar up to some epsilon due to learning rate schedule + assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) + assert_almost_equal(clf.coef_, clf_balanced.coef_, decimal=2) + + +def test_wrong_class_weight_label(): + # ValueError due to wrong class_weight label. + X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight={0: 0.5}) + assert_raises(ValueError, clf.fit, X2, y2) + + +def test_wrong_class_weight_format(): + # ValueError due to wrong class_weight argument type. 
+ X2 = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], + [1.0, 1.0], [1.0, 0.0]]) + y2 = [1, 1, 1, -1, -1] + + clf = PassiveAggressiveClassifier(class_weight=[0.5]) + assert_raises(ValueError, clf.fit, X2, y2) + + clf = PassiveAggressiveClassifier(class_weight="the larch") + assert_raises(ValueError, clf.fit, X2, y2) + + def test_regressor_mse(): y_bin = y.copy() y_bin[y != 1] = -1
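
For context, the workaround that the new partial_fit error message points users to can be sketched as follows. This is a minimal illustration only, not part of the patch: the toy data, the batch splitting, and all variable names are assumed for the example; only compute_class_weight (as suggested by the error message) and PassiveAggressiveClassifier come from scikit-learn.

    # Sketch (assumed example): precompute "balanced" class weights once with
    # compute_class_weight, then pass the resulting dict as class_weight and
    # stream the data through partial_fit.
    import numpy as np

    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils import compute_class_weight

    rng = np.random.RandomState(0)

    # Toy imbalanced data (illustrative only): 90 samples of class 0, 10 of class 1.
    X = rng.randn(100, 5)
    y = np.array([0] * 90 + [1] * 10)
    classes = np.unique(y)

    # Equivalent of class_weight="balanced":
    # n_samples / (n_classes * np.bincount(y)), i.e. roughly {0: 0.56, 1: 5.0} here.
    weights = compute_class_weight("balanced", classes=classes, y=y)
    class_weight = dict(zip(classes, weights))

    clf = PassiveAggressiveClassifier(class_weight=class_weight)

    # Stream the data in mini-batches; 'classes' must be supplied on the first call.
    for batch in np.array_split(rng.permutation(len(y)), 5):
        clf.partial_fit(X[batch], y[batch], classes=classes)

In practice the weights would be computed from a sample of the full training target that is large enough to estimate the class frequencies, exactly as the error message describes.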