Merge pull request #2 from mblondel/multilabel
Multilabel
larsmans committed Dec 20, 2011
2 parents 498f293 + 117be7e commit 28f947d
Showing 314 changed files with 26,862 additions and 11,964 deletions.
10 changes: 6 additions & 4 deletions AUTHORS.rst
@@ -11,14 +11,16 @@ This project was started in 2007 as a Google Summer of Code project by
David Cournapeau. Later that year, Matthieu Brucher started work on
this project as part of his thesis.

- In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and
- Vincent Michel took leadership of the project and made the first
- public release, February the 1st 2010. Since then, several releases
- have appeard following a ~3 month cycle.
+ In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent
+ Michel of INRIA took leadership of the project and made the first public
+ release, February the 1st 2010. Since then, several releases have appeared
+ following a ~3 month cycle, and a thriving international community has
+ been leading the development.

People
------

.. hlist::

* David Cournapeau

5 changes: 3 additions & 2 deletions README.rst
@@ -29,8 +29,9 @@ Important links
Dependencies
============

- The required dependencies to build the software are Python >= 2.5,
- setuptools, Numpy >= 1.2, SciPy >= 0.7 and a working C++ compiler.
+ The required dependencies to build the software are Python >= 2.6,
+ setuptools, Numpy >= 1.3, SciPy >= 0.7 and a working C/C++ compiler.
+ This configuration matches the Ubuntu 10.04 LTS release from April 2010.

To run the tests you will also need nose >= 0.10.

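A quick way to verify that an existing environment meets these minima (a
minimal sketch; it assumes only the stock ``sys``, ``numpy`` and ``scipy``
modules)::

    import sys
    import numpy
    import scipy

    print('Python %d.%d' % sys.version_info[:2])  # want >= 2.6
    print('NumPy %s' % numpy.__version__)         # want >= 1.3
    print('SciPy %s' % scipy.__version__)         # want >= 0.7
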
29 changes: 0 additions & 29 deletions benchmarks/bench_bayes.py

This file was deleted.

5 changes: 2 additions & 3 deletions benchmarks/bench_glm.py
@@ -20,13 +20,13 @@
time_ols = np.empty(n_iter)
time_lasso = np.empty(n_iter)

- dimensions = 500 * np.arange(1, n_iter+1)
+ dimensions = 500 * np.arange(1, n_iter + 1)

for i in range(n_iter):

print 'Iteration %s of %s' % (i, n_iter)

- n_samples, n_features = 10*i + 3, 10*i + 3
+ n_samples, n_features = 10 * i + 3, 10 * i + 3

X = np.random.randn(n_samples, n_features)
Y = np.random.randn(n_samples)
@@ -46,7 +46,6 @@
lasso.fit(X, Y)
time_lasso[i] = total_seconds(datetime.now() - start)

-
pl.xlabel('Dimensions')
pl.ylabel('Time (in seconds)')
pl.plot(dimensions, time_ridge, color='r')
7 changes: 3 additions & 4 deletions benchmarks/bench_glmnet.py
@@ -77,10 +77,10 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_))

pl.clf()
- xx = range(0, n*step, step)
+ xx = range(0, n * step, step)
pl.title('Lasso regression on sample dataset (%d features)' % n_features)
pl.plot(xx, scikit_results, 'b-', label='scikit-learn')
- pl.plot(xx, glmnet_results,'r-', label='glmnet')
+ pl.plot(xx, glmnet_results, 'r-', label='glmnet')
pl.legend()
pl.xlabel('number of samples to classify')
pl.ylabel('time (in seconds)')
@@ -120,10 +120,9 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef):
pl.figure()
pl.title('Regression in high dimensional spaces (%d samples)' % n_samples)
pl.plot(xx, scikit_results, 'b-', label='scikit-learn')
- pl.plot(xx, glmnet_results,'r-', label='glmnet')
+ pl.plot(xx, glmnet_results, 'r-', label='glmnet')
pl.legend()
pl.xlabel('number of features')
pl.ylabel('time (in seconds)')
pl.axis('tight')
pl.show()

13 changes: 7 additions & 6 deletions benchmarks/bench_lasso.py
@@ -38,20 +38,21 @@ def compute_bench(alpha, n_samples, n_features, precompute):
n_informative=n_informative,
noise=0.1, coef=True)

- X /= np.sqrt(np.sum(X**2, axis=0))  # Normalize data
+ X /= np.sqrt(np.sum(X ** 2, axis=0))  # Normalize data

gc.collect()
print "- benching Lasso"
- clf = Lasso(alpha=alpha, fit_intercept=False)
+ clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute)
tstart = time()
- clf.fit(X, Y, precompute=precompute)
+ clf.fit(X, Y)
lasso_results.append(time() - tstart)

gc.collect()
print "- benching LassoLars"
- clf = LassoLars(alpha=alpha, fit_intercept=False)
+ clf = LassoLars(alpha=alpha, fit_intercept=False,
+                 normalize=False, precompute=precompute)
tstart = time()
- clf.fit(X, Y, normalize=False, precompute=precompute)
+ clf.fit(X, Y)
lars_lasso_results.append(time() - tstart)

return lasso_results, lars_lasso_results
@@ -61,7 +62,7 @@ def compute_bench(alpha, n_samples, n_features, precompute):
from sklearn.linear_model import Lasso, LassoLars
import pylab as pl

- alpha = 0.01 # regularization parameter
+ alpha = 0.01  # regularization parameter

n_features = 10
list_n_samples = np.linspace(100, 1000000, 5).astype(np.int)
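The bench_lasso.py change above moves ``precompute`` (and, for LassoLars,
``normalize``) out of ``fit()`` and into the estimator constructors. A
minimal sketch of the resulting call pattern, on toy data and assuming only
the keyword arguments visible in this hunk::

    import numpy as np
    from sklearn.linear_model import Lasso, LassoLars

    X = np.random.randn(50, 10)
    Y = np.random.randn(50)

    # Options are now fixed at construction time...
    clf = Lasso(alpha=0.01, fit_intercept=False, precompute=True)
    clf.fit(X, Y)  # ...so fit() receives only the data

    clf = LassoLars(alpha=0.01, fit_intercept=False,
                    normalize=False, precompute=True)
    clf.fit(X, Y)
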
8 changes: 3 additions & 5 deletions benchmarks/bench_plot_fastkmeans.py
@@ -20,7 +20,7 @@ def compute_bench(samples_range, features_range):
for n_features in features_range:
it += 1
print '=============================='
- print 'Iteration %03d of %03d' %(it, max_it)
+ print 'Iteration %03d of %03d' % (it, max_it)
print '=============================='
print ''
data = nr.random_integers(-50, 50, (n_samples, n_features))
@@ -70,7 +70,7 @@ def compute_bench_2(chunks):
for chunk in chunks:
it += 1
print '=============================='
- print 'Iteration %03d of %03d' %(it, max_it)
+ print 'Iteration %03d of %03d' % (it, max_it)
print '=============================='
print ''

@@ -93,7 +93,7 @@ def compute_bench_2(chunks):


if __name__ == '__main__':
- from mpl_toolkits.mplot3d import axes3d # register the 3d projection
+ from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
import matplotlib.pyplot as plt

samples_range = np.linspace(50, 150, 5).astype(np.int)
@@ -126,7 +126,6 @@ def compute_bench_2(chunks):
ax.set_xlabel('n_samples')
ax.set_ylabel('n_features')

-
i = 0
for c, (label, timings) in zip('br',
sorted(results_2.iteritems())):
@@ -137,5 +136,4 @@ def compute_bench_2(chunks):
ax.set_xlabel('chunks')
ax.set_ylabel(label)

-
plt.show()
4 changes: 2 additions & 2 deletions benchmarks/bench_plot_lasso_path.py
@@ -43,7 +43,7 @@ def compute_bench(samples_range, features_range):
print "benching lars_path (with Gram):",
sys.stdout.flush()
tstart = time()
- G = np.dot(X.T, X) # precomputed Gram matrix
+ G = np.dot(X.T, X)  # precomputed Gram matrix
Xy = np.dot(X.T, y)
lars_path(X, y, Xy=Xy, Gram=G, method='lasso')
delta = time() - tstart
@@ -81,7 +81,7 @@ def compute_bench(samples_range, features_range):


if __name__ == '__main__':
- from mpl_toolkits.mplot3d import axes3d # register the 3d projection
+ from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
import matplotlib.pyplot as plt

samples_range = np.linspace(10, 2000, 5).astype(np.int)
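For reference, the Gram-matrix variant benched above can be reproduced in
isolation; a short runnable sketch on toy data, assuming ``lars_path`` from
``sklearn.linear_model`` as imported by the script::

    import numpy as np
    from sklearn.linear_model import lars_path

    X = np.random.randn(100, 20)
    y = np.random.randn(100)

    G = np.dot(X.T, X)   # precomputed Gram matrix, reusable across calls
    Xy = np.dot(X.T, y)  # precomputed X^T y
    alphas, active, coefs = lars_path(X, y, Xy=Xy, Gram=G, method='lasso')
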
2 changes: 0 additions & 2 deletions benchmarks/bench_plot_neighbors.py
@@ -59,7 +59,6 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
N_results_build[algorithm][i] = (t1 - t0)
N_results_query[algorithm][i] = (t2 - t1)

-
#------------------------------------------------------------
# varying D
D_results_build = dict([(alg, np.zeros(len(Drange)))
@@ -83,7 +82,6 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
D_results_build[algorithm][i] = (t1 - t0)
D_results_query[algorithm][i] = (t2 - t1)

-
#------------------------------------------------------------
# varying k
k_results_build = dict([(alg, np.zeros(len(krange)))
2 changes: 1 addition & 1 deletion benchmarks/bench_plot_omp_lars.py
@@ -108,7 +108,7 @@ def compute_bench(samples_range, features_range):
for i, (label, timings) in enumerate(sorted(results.iteritems())):
ax = fig.add_subplot(1, 2, i)
vmax = max(1 - timings.min(), -1 + timings.max())
- pl.matshow(timings, fignum=False, vmin=1-vmax, vmax=1+vmax)
+ pl.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax)
ax.set_xticklabels([''] + map(str, samples_range))
ax.set_yticklabels([''] + map(str, features_range))
pl.xlabel('n_samples')
5 changes: 3 additions & 2 deletions benchmarks/bench_plot_svd.py
@@ -25,7 +25,8 @@ def compute_bench(samples_range, features_range, q=3, rank=50):
print '===================='
print 'Iteration %03d of %03d' % (it, max_it)
print '===================='
- X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank,
+ X = make_low_rank_matrix(n_samples, n_features,
+                          effective_rank=rank,
tail_strength=0.2)

gc.collect()
@@ -50,7 +51,7 @@ def compute_bench(samples_range, features_range, q=3, rank=50):


if __name__ == '__main__':
- from mpl_toolkits.mplot3d import axes3d # register the 3d projection
+ from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
import matplotlib.pyplot as plt

samples_range = np.linspace(2, 1000, 4).astype(np.int)
2 changes: 1 addition & 1 deletion benchmarks/bench_plot_ward.py
@@ -29,7 +29,7 @@
hierarchy.ward(X)
scipy_time[j, i] = time.time() - t0

- ratio = scikits_time/scipy_time
+ ratio = scikits_time / scipy_time

pl.clf()
pl.imshow(np.log(ratio), aspect='auto', origin="lower")
45 changes: 34 additions & 11 deletions benchmarks/bench_sgd_covertype.py
@@ -3,23 +3,26 @@
Covertype dataset with dense SGD
================================
- Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes on
- the forest covertype dataset of Blackard, Jock, and Dean [1]. The dataset
- comprises 581,012 samples. It is low-dimensional with 54 features and a
- sparsity of approx. 23%. Here, we consider the task of predicting class 1
- (spruce/fir). The classification performance of SGD is competitive with
- Liblinear while being two orders of magnitude faster to train::
+ Benchmark stochastic gradient descent (SGD), Liblinear, Naive Bayes, CART
+ (decision tree), RandomForest and Extra-Trees on the forest covertype dataset
+ of Blackard, Jock, and Dean [1]. The dataset comprises 581,012 samples. It is
+ low-dimensional with 54 features and a sparsity of approx. 23%. Here, we
+ consider the task of predicting class 1 (spruce/fir). The classification
+ performance of SGD is competitive with Liblinear while being two orders of
+ magnitude faster to train::
[..]
Classification performance:
===========================
Classifier      train-time  test-time  error-rate
--------------------------------------------------
- Liblinear        10.0171s    0.0213s     0.2305
- GaussianNB        3.1570s    0.1907s     0.3633
- SGD               0.2317s    0.0050s     0.2300
- CART             62.7706s    1.7280s     0.0425
+ Liblinear        11.8977s    0.0285s     0.2305
+ GaussianNB        3.5931s    0.6645s     0.3633
+ SGD               0.2924s    0.0114s     0.2300
+ CART             39.9829s    0.0345s     0.0476
+ RandomForest    794.6232s    1.0526s     0.0249
+ Extra-Trees    1401.7051s    1.1181s     0.0230
The same task has been used in a number of papers including:
@@ -59,6 +62,7 @@
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
+ from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

######################################################################
@@ -171,8 +175,25 @@ def benchmark(clf):
sgd_err, sgd_train_time, sgd_test_time = benchmark(SGDClassifier(
**sgd_parameters))

+ ######################################################################
+ ## Train CART model
cart_err, cart_train_time, cart_test_time = benchmark(
-     DecisionTreeClassifier(min_split=5, max_depth=100))
+     DecisionTreeClassifier(min_split=5,
+                            max_depth=None))

+ ######################################################################
+ ## Train RandomForest model
+ rf_err, rf_train_time, rf_test_time = benchmark(
+     RandomForestClassifier(n_estimators=20,
+                            min_split=5,
+                            max_depth=None))

+ ######################################################################
+ ## Train Extra-Trees model
+ et_err, et_train_time, et_test_time = benchmark(
+     ExtraTreesClassifier(n_estimators=20,
+                          min_split=5,
+                          max_depth=None))

######################################################################
## Print classification performance
@@ -196,5 +217,7 @@ def print_row(clf_type, train_time, test_time, err):
print_row("GaussianNB", gnb_train_time, gnb_test_time, gnb_err)
print_row("SGD", sgd_train_time, sgd_test_time, sgd_err)
print_row("CART", cart_train_time, cart_test_time, cart_err)
+ print_row("RandomForest", rf_train_time, rf_test_time, rf_err)
+ print_row("Extra-Trees", et_train_time, et_test_time, et_err)
print("")
print("")
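The ``benchmark(clf)`` helper is defined outside the visible hunks; the
following is only a plausible minimal sketch, consistent with how it is
called above (error rate, train time, test time on a fixed split) and using
toy stand-ins for the covertype data::

    from time import time

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X_train, y_train = rng.randn(1000, 54), rng.randint(2, size=1000)
    X_test, y_test = rng.randn(200, 54), rng.randint(2, size=200)

    def benchmark(clf):
        t0 = time()
        clf.fit(X_train, y_train)      # wall-clock training time
        train_time = time() - t0
        t0 = time()
        pred = clf.predict(X_test)     # wall-clock prediction time
        test_time = time() - t0
        err = np.mean(pred != y_test)  # error rate: fraction misclassified
        return err, train_time, test_time

    cart_err, cart_train_time, cart_test_time = benchmark(
        DecisionTreeClassifier())
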
30 changes: 27 additions & 3 deletions doc/about.rst
@@ -5,6 +5,29 @@ About us

.. include:: ../AUTHORS.rst

+ Citing scikit-learn
+ -------------------
+
+ If you use scikit-learn in a scientific publication, we would appreciate
+ citations to the following paper:
+
+ `Scikit-learn: Machine Learning in Python
+ <http://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html>`_, Pedregosa
+ *et al.*, JMLR 12, pp. 2825-2830, 2011.
+
+ Bibtex entry::
+
+   @article{scikit-learn,
+     title={{Scikit-learn: Machine Learning in Python}},
+     author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+             and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+             and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+     journal={Journal of Machine Learning Research},
+     volume={12},
+     pages={2825--2830},
+     year={2011}
+   }

Funding
-------
@@ -17,13 +40,14 @@ events.
.. image:: images/inria-logo.jpg

`Google <http://code.google.com/opensource/>`_ sponsored David
- Cournapeau with a Summer of Code Scolarship in the summer of 2007 and
+ Cournapeau with a Summer of Code Scholarship in the summer of 2007 and
`Vlad Niculae <http://venefrombucharest.wordpress.com/>`_ in 2011. If
you would like to participate in the next Google Summer of Code
program, please see `this page
<http://github.com/scikit-learn/scikit-learn/wiki/SummerOfCode>`_.

The `NeuroDebian <http://neuro.debian.net>`_ project providing `Debian
- <http://www.debian.org>`_ packaging and contributions is supported by Dr. James
- V. Haxby (`Dartmouth College <http://www.dartmouth.edu/~psych/>`_).
+ <http://www.debian.org>`_ packaging and contributions is supported by
+ `Dr. James V. Haxby <http://haxbylab.dartmouth.edu/>`_ (`Dartmouth
+ College <http://www.dartmouth.edu/~psych/>`_).
