diff --git a/AUTHORS.rst b/AUTHORS.rst index da5ac288d8905..664e8c101f557 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -11,14 +11,16 @@ This project was started in 2007 as a Google Summer of Code project by David Cournapeau. Later that year, Matthieu Brucher started work on this project as part of his thesis. -In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and -Vincent Michel took leadership of the project and made the first -public release, February the 1st 2010. Since then, several releases -have appeard following a ~3 month cycle. +In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent +Michel of INRIA took leadership of the project and made the first public +release, February the 1st 2010. Since then, several releases have appeared +following a ~3 month cycle, and a thriving international community has +been leading the development. People ------ +.. hlist:: * David Cournapeau diff --git a/README.rst b/README.rst index 131b6dbfd2640..a2a9f09bc5eb6 100644 --- a/README.rst +++ b/README.rst @@ -29,8 +29,9 @@ Important links Dependencies ============ -The required dependencies to build the software are Python >= 2.5, -setuptools, Numpy >= 1.2, SciPy >= 0.7 and a working C++ compiler. +The required dependencies to build the software are Python >= 2.6, +setuptools, Numpy >= 1.3, SciPy >= 0.7 and a working C/C++ compiler. +This configuration matches the Ubuntu 10.04 LTS release from April 2010. To run the tests you will also need nose >= 0.10. diff --git a/benchmarks/bench_bayes.py b/benchmarks/bench_bayes.py deleted file mode 100644 index 538b6980a530f..0000000000000 --- a/benchmarks/bench_bayes.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -A comparison of different methods in linear_model methods. - -Data comes from a random square matrix. 
- -""" -from datetime import datetime -import numpy as np -from sklearn import linear_model - - -if __name__ == '__main__': - - n_iter = 20 - - time_ridge = np.empty(n_iter) - time_ols = np.empty(n_iter) - time_lasso = np.empty(n_iter) - - dimensions = 10 * np.arange(n_iter) - - n_samples, n_features = 100, 100 - - X = np.random.randn(n_samples, n_features) - y = np.random.randn(n_samples) - - start = datetime.now() - ridge = linear_model.BayesianRidge() - ridge.fit(X, y) diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py index 5bfc7f5629c18..fed5e5bc106f8 100644 --- a/benchmarks/bench_glm.py +++ b/benchmarks/bench_glm.py @@ -20,13 +20,13 @@ time_ols = np.empty(n_iter) time_lasso = np.empty(n_iter) - dimensions = 500 * np.arange(1, n_iter+1) + dimensions = 500 * np.arange(1, n_iter + 1) for i in range(n_iter): print 'Iteration %s of %s' % (i, n_iter) - n_samples, n_features = 10*i + 3, 10*i + 3 + n_samples, n_features = 10 * i + 3, 10 * i + 3 X = np.random.randn(n_samples, n_features) Y = np.random.randn(n_samples) @@ -46,7 +46,6 @@ lasso.fit(X, Y) time_lasso[i] = total_seconds(datetime.now() - start) - pl.xlabel('Dimesions') pl.ylabel('Time (in seconds)') pl.plot(dimensions, time_ridge, color='r') diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py index 2b866eb09b9cf..b9c77fbed0609 100644 --- a/benchmarks/bench_glmnet.py +++ b/benchmarks/bench_glmnet.py @@ -77,10 +77,10 @@ def bench(factory, X, Y, X_test, Y_test, ref_coef): glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) pl.clf() - xx = range(0, n*step, step) + xx = range(0, n * step, step) pl.title('Lasso regression on sample dataset (%d features)' % n_features) pl.plot(xx, scikit_results, 'b-', label='scikit-learn') - pl.plot(xx, glmnet_results,'r-', label='glmnet') + pl.plot(xx, glmnet_results, 'r-', label='glmnet') pl.legend() pl.xlabel('number of samples to classify') pl.ylabel('time (in seconds)') @@ -120,10 +120,9 @@ def bench(factory, X, Y, X_test, 
Y_test, ref_coef): pl.figure() pl.title('Regression in high dimensional spaces (%d samples)' % n_samples) pl.plot(xx, scikit_results, 'b-', label='scikit-learn') - pl.plot(xx, glmnet_results,'r-', label='glmnet') + pl.plot(xx, glmnet_results, 'r-', label='glmnet') pl.legend() pl.xlabel('number of features') pl.ylabel('time (in seconds)') pl.axis('tight') pl.show() - diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py index b54e6bdeb1e23..d01414a377ddb 100644 --- a/benchmarks/bench_lasso.py +++ b/benchmarks/bench_lasso.py @@ -38,20 +38,21 @@ def compute_bench(alpha, n_samples, n_features, precompute): n_informative=n_informative, noise=0.1, coef=True) - X /= np.sqrt(np.sum(X**2, axis=0)) # Normalize data + X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data gc.collect() print "- benching Lasso" - clf = Lasso(alpha=alpha, fit_intercept=False) + clf = Lasso(alpha=alpha, fit_intercept=False, precompute=precompute) tstart = time() - clf.fit(X, Y, precompute=precompute) + clf.fit(X, Y) lasso_results.append(time() - tstart) gc.collect() print "- benching LassoLars" - clf = LassoLars(alpha=alpha, fit_intercept=False) + clf = LassoLars(alpha=alpha, fit_intercept=False, + normalize=False, precompute=precompute) tstart = time() - clf.fit(X, Y, normalize=False, precompute=precompute) + clf.fit(X, Y) lars_lasso_results.append(time() - tstart) return lasso_results, lars_lasso_results @@ -61,7 +62,7 @@ def compute_bench(alpha, n_samples, n_features, precompute): from sklearn.linear_model import Lasso, LassoLars import pylab as pl - alpha = 0.01 # regularization parameter + alpha = 0.01 # regularization parameter n_features = 10 list_n_samples = np.linspace(100, 1000000, 5).astype(np.int) diff --git a/benchmarks/bench_plot_fastkmeans.py b/benchmarks/bench_plot_fastkmeans.py index 5eff4b7fb3bcb..fea541f04e683 100644 --- a/benchmarks/bench_plot_fastkmeans.py +++ b/benchmarks/bench_plot_fastkmeans.py @@ -20,7 +20,7 @@ def compute_bench(samples_range, features_range): 
for n_features in features_range: it += 1 print '==============================' - print 'Iteration %03d of %03d' %(it, max_it) + print 'Iteration %03d of %03d' % (it, max_it) print '==============================' print '' data = nr.random_integers(-50, 50, (n_samples, n_features)) @@ -70,7 +70,7 @@ def compute_bench_2(chunks): for chunk in chunks: it += 1 print '==============================' - print 'Iteration %03d of %03d' %(it, max_it) + print 'Iteration %03d of %03d' % (it, max_it) print '==============================' print '' @@ -93,7 +93,7 @@ def compute_bench_2(chunks): if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection + from mpl_toolkits.mplot3d import axes3d # register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(50, 150, 5).astype(np.int) @@ -126,7 +126,6 @@ def compute_bench_2(chunks): ax.set_xlabel('n_samples') ax.set_ylabel('n_features') - i = 0 for c, (label, timings) in zip('br', sorted(results_2.iteritems())): @@ -137,5 +136,4 @@ def compute_bench_2(chunks): ax.set_xlabel('chunks') ax.set_ylabel(label) - plt.show() diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py index 09dd7f084bc22..845368e9e8e24 100644 --- a/benchmarks/bench_plot_lasso_path.py +++ b/benchmarks/bench_plot_lasso_path.py @@ -43,7 +43,7 @@ def compute_bench(samples_range, features_range): print "benching lars_path (with Gram):", sys.stdout.flush() tstart = time() - G = np.dot(X.T, X) # precomputed Gram matrix + G = np.dot(X.T, X) # precomputed Gram matrix Xy = np.dot(X.T, y) lars_path(X, y, Xy=Xy, Gram=G, method='lasso') delta = time() - tstart @@ -81,7 +81,7 @@ def compute_bench(samples_range, features_range): if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection + from mpl_toolkits.mplot3d import axes3d # register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(10, 2000, 5).astype(np.int) 
diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py index 1fd551a586b3f..3cfc9883997ec 100644 --- a/benchmarks/bench_plot_neighbors.py +++ b/benchmarks/bench_plot_neighbors.py @@ -59,7 +59,6 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), N_results_build[algorithm][i] = (t1 - t0) N_results_query[algorithm][i] = (t2 - t1) - #------------------------------------------------------------ # varying D D_results_build = dict([(alg, np.zeros(len(Drange))) @@ -83,7 +82,6 @@ def barplot_neighbors(Nrange=2 ** np.arange(1, 11), D_results_build[algorithm][i] = (t1 - t0) D_results_query[algorithm][i] = (t2 - t1) - #------------------------------------------------------------ # varying k k_results_build = dict([(alg, np.zeros(len(krange))) diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py index 74479e426d405..7341c892911bc 100644 --- a/benchmarks/bench_plot_omp_lars.py +++ b/benchmarks/bench_plot_omp_lars.py @@ -108,7 +108,7 @@ def compute_bench(samples_range, features_range): for i, (label, timings) in enumerate(sorted(results.iteritems())): ax = fig.add_subplot(1, 2, i) vmax = max(1 - timings.min(), -1 + timings.max()) - pl.matshow(timings, fignum=False, vmin=1-vmax, vmax=1+vmax) + pl.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) ax.set_xticklabels([''] + map(str, samples_range)) ax.set_yticklabels([''] + map(str, features_range)) pl.xlabel('n_samples') diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py index 8a9ac7927c3fd..6b24964ed3936 100644 --- a/benchmarks/bench_plot_svd.py +++ b/benchmarks/bench_plot_svd.py @@ -25,7 +25,8 @@ def compute_bench(samples_range, features_range, q=3, rank=50): print '====================' print 'Iteration %03d of %03d' % (it, max_it) print '====================' - X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank, + X = make_low_rank_matrix(n_samples, n_features, + effective_rank=rank, tail_strength=0.2) gc.collect() 
@@ -50,7 +51,7 @@ def compute_bench(samples_range, features_range, q=3, rank=50): if __name__ == '__main__': - from mpl_toolkits.mplot3d import axes3d # register the 3d projection + from mpl_toolkits.mplot3d import axes3d # register the 3d projection import matplotlib.pyplot as plt samples_range = np.linspace(2, 1000, 4).astype(np.int) diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py index f6e32673d259d..6f8f4494d072f 100644 --- a/benchmarks/bench_plot_ward.py +++ b/benchmarks/bench_plot_ward.py @@ -29,7 +29,7 @@ hierarchy.ward(X) scipy_time[j, i] = time.time() - t0 -ratio = scikits_time/scipy_time +ratio = scikits_time / scipy_time pl.clf() pl.imshow(np.log(ratio), aspect='auto', origin="lower") diff --git a/benchmarks/bench_sgd_covertype.py b/benchmarks/bench_sgd_covertype.py index 199fa3dfd9473..f7a501ab5a026 100644 --- a/benchmarks/bench_sgd_covertype.py +++ b/benchmarks/bench_sgd_covertype.py @@ -3,12 +3,13 @@ Covertype dataset with dense SGD ================================ -Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes on -the forest covertype dataset of Blackard, Jock, and Dean [1]. The dataset -comprises 581,012 samples. It is low-dimensional with 54 features and a -sparsity of approx. 23%. Here, we consider the task of predicting class 1 -(spruce/fir). The classification performance of SGD is competitive with -Liblinear while being two orders of magnitude faster to train:: +Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes, CART +(decision tree), RandomForest and Extra-Trees on the forest covertype dataset +of Blackard, Jock, and Dean [1]. The dataset comprises 581,012 samples. It is +low dimensional with 54 features and a sparsity of approx. 23%. Here, we +consider the task of predicting class 1 (spruce/fir). The classification +performance of SGD is competitive with Liblinear while being two orders of +magnitude faster to train:: [..] 
Classification performance: @@ -16,10 +17,12 @@ Classifier train-time test-time error-rate -------------------------------------------- - Liblinear 10.0171s 0.0213s 0.2305 - GaussianNB 3.1570s 0.1907s 0.3633 - SGD 0.2317s 0.0050s 0.2300 - CART 62.7706s 1.7280s 0.0425 + Liblinear 11.8977s 0.0285s 0.2305 + GaussianNB 3.5931s 0.6645s 0.3633 + SGD 0.2924s 0.0114s 0.2300 + CART 39.9829s 0.0345s 0.0476 + RandomForest 794.6232s 1.0526s 0.0249 + Extra-Trees 1401.7051s 1.1181s 0.0230 The same task has been used in a number of papers including: @@ -59,6 +62,7 @@ from sklearn.linear_model import SGDClassifier from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier from sklearn import metrics ###################################################################### @@ -171,8 +175,25 @@ def benchmark(clf): sgd_err, sgd_train_time, sgd_test_time = benchmark(SGDClassifier( **sgd_parameters)) +###################################################################### +## Train CART model cart_err, cart_train_time, cart_test_time = benchmark( - DecisionTreeClassifier(min_split=5, max_depth=100)) + DecisionTreeClassifier(min_split=5, + max_depth=None)) + +###################################################################### +## Train RandomForest model +rf_err, rf_train_time, rf_test_time = benchmark( + RandomForestClassifier(n_estimators=20, + min_split=5, + max_depth=None)) + +###################################################################### +## Train Extra-Trees model +et_err, et_train_time, et_test_time = benchmark( + ExtraTreesClassifier(n_estimators=20, + min_split=5, + max_depth=None)) ###################################################################### ## Print classification performance @@ -196,5 +217,7 @@ def print_row(clf_type, train_time, test_time, err): print_row("GaussianNB", gnb_train_time, gnb_test_time, gnb_err) print_row("SGD", sgd_train_time, 
 sgd_test_time, sgd_err) print_row("CART", cart_train_time, cart_test_time, cart_err) +print_row("RandomForest", rf_train_time, rf_test_time, rf_err) +print_row("Extra-Trees", et_train_time, et_test_time, et_err) print("") print("") diff --git a/doc/about.rst b/doc/about.rst index e904c9222c471..b22659170827a 100644 --- a/doc/about.rst +++ b/doc/about.rst @@ -5,6 +5,29 @@ About us .. include:: ../AUTHORS.rst +Citing scikit-learn +------------------- + +If you use scikit-learn in a scientific publication, we would appreciate +citations to the following paper: + + `Scikit-learn: Machine Learning in Python + `_, Pedregosa + *et al.*, JMLR 12, pp. 2825-2830, 2011. + + Bibtex entry:: + + @article{, + title={{Scikit-learn: Machine Learning in Python }}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} + } Funding ------- @@ -17,13 +40,14 @@ events. .. image:: images/inria-logo.jpg `Google `_ sponsored David -Cournapeau with a Summer of Code Scolarship in the summer of 2007 and +Cournapeau with a Summer of Code Scholarship in the summer of 2007 and `Vlad Niculae `_ in 2011. If you would like to participate in the next Google Summer of code program, please see `this page `_ The `NeuroDebian `_ project providing `Debian -`_ packaging and contributions is supported by Dr. James -V. Haxby (`Dartmouth College `_). +`_ packaging and contributions is supported by +`Dr. James V. Haxby `_ (`Dartmouth +College `_). diff --git a/doc/conf.py b/doc/conf.py index ca0c3b3f3e3e4..ac78d66cc3f2b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -3,7 +3,8 @@ # scikit-learn documentation build configuration file, created by # sphinx-quickstart on Fri Jan 8 09:13:42 2010. 
# -# This file is execfile()d with the current directory set to its containing dir. +# This file is execfile()d with the current directory set to its containing +# dir. # # Note that not all possible configuration values are present in this # autogenerated file. @@ -11,14 +12,16 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys, os +import sys +import os -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory +# is relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. sys.path.insert(0, os.path.abspath('sphinxext')) -# -- General configuration ----------------------------------------------------- +# -- General configuration --------------------------------------------------- # Try to override the matplotlib configuration as early as possible try: @@ -26,8 +29,8 @@ except: pass -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = ['gen_rst', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.pngmath', @@ -36,7 +39,7 @@ import numpy_ext.numpydoc extensions.append('numpy_ext.numpydoc') # With older versions of sphinx, this causes a crash - autosummary_generate=True + autosummary_generate = True except: # Older version of sphinx extensions.append('numpy_ext_old.numpydoc') @@ -88,11 +91,12 @@ # List of documents that shouldn't be included in the build. 
#unused_docs = [] -# List of directories, relative to source directory, that shouldn't be searched -# for source files. +# List of directories, relative to source directory, that shouldn't be +# searched for source files. exclude_trees = ['_build', 'templates', 'includes'] -# The reST default role (used for this markup: `text`) to use for all documents. +# The reST default role (used for this markup: `text`) to use for all +# documents. #default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. @@ -113,7 +117,7 @@ #modindex_common_prefix = [] -# -- Options for HTML output --------------------------------------------------- +# -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. Major themes that come with # Sphinx are currently 'default' and 'sphinxdoc'. @@ -164,10 +168,10 @@ #html_additional_pages = {} # If false, no module index is generated. -#html_use_modindex = True +html_use_modindex = False # If false, no index is generated. -#html_use_index = True +html_use_index = False # If true, the index is split into individual pages for each letter. #html_split_index = False @@ -187,7 +191,7 @@ htmlhelp_basename = 'scikit-learndoc' -# -- Options for LaTeX output -------------------------------------------------- +# -- Options for LaTeX output ------------------------------------------------ # The paper size ('letter' or 'a4'). #latex_paper_size = 'letter' @@ -196,7 +200,8 @@ #latex_font_size = '10pt' # Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). +# (source start file, target name, title, author, documentclass +# [howto/manual]). latex_documents = [ ('index', 'user_guide.tex', u'scikit-learn user guide', u'scikit-learn developers', 'manual'), @@ -213,7 +218,7 @@ # Additional stuff for the LaTeX preamble. 
latex_preamble = """ -\usepackage{amsmath}\usepackage{amsfonts} +\usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\usepackage{morefloats} """ # Documents to append as an appendix to all manuals. diff --git a/doc/contents.rst b/doc/contents.rst deleted file mode 100644 index 5d4d8f6cd97a0..0000000000000 --- a/doc/contents.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. - This file should be empty of any layout or explicit linking. It is - just used to be a unique place in which the table of content of the - user manual is defined. - -.. toctree:: - :numbered: - - install - tutorial - supervised_learning.rst - unsupervised_learning.rst - model_selection.rst - Dataset loading utilities - Preprocessing data - modules/feature_extraction.rst - Class Reference diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst index 6dec74da2c03e..b8cf553edf998 100644 --- a/doc/datasets/index.rst +++ b/doc/datasets/index.rst @@ -1,11 +1,3 @@ -.. - For doctests: - - >>> import numpy as np - >>> import os - >>> from sklearn import datasets - >>> datasets.mldata.urllib2 = mock_urllib2 - .. _datasets: ========================= @@ -15,7 +7,7 @@ Dataset loading utilities .. currentmodule:: sklearn.datasets The ``sklearn.datasets`` package embeds some small toy datasets -as introduced in the "Getting Started" section. +as introduced in the :ref:`Getting Started ` section. To evaluate the impact of the scale of the dataset (``n_samples`` and ``n_features``) while controlling the statistical properties of the data @@ -26,6 +18,31 @@ This package also features helpers to fetch larger datasets commonly used by the machine learning community to benchmark algorithm on data that comes from the 'real world'. +General dataset API +=================== + +There are three distinct kinds of dataset interfaces for different types +of datasets. +The simplest one is the interface for sample images, which is described +below in the :ref:`sample_images` section. 
+ +The dataset generation functions and the svmlight loader share a simplistic +interface, returning a tuple ``(X, y)`` consisting of an n_samples x n_features +numpy array X and an array of length n_samples containing the targets y. + +The toy datasets as well as the 'real world' datasets and the datasets +fetched from mldata.org have a more sophisticated structure. +These functions return a ``bunch`` (which is a dictionary that is +accessible with the 'dict.key' syntax). +All datasets have at least two keys, ``data``, containing an array of shape +``n_samples x n_features`` (except for 20newsgroups) and ``target``, a numpy +array of length ``n_samples``, containing the targets. + +The datasets also contain a description in ``DESCR`` and some contain +``feature_names`` and ``target_names``. +See the dataset descriptions below for details. + + Toy datasets ============ @@ -34,7 +51,7 @@ require to download any file from some external website. .. autosummary:: - :toctree: generated/ + :toctree: ../modules/generated/ :template: function.rst load_boston @@ -47,6 +64,8 @@ These datasets are useful to quickly illustrate the behavior of the various algorithms implemented in the scikit. They are however often too small to be representative of real world machine learning tasks. +.. _sample_images: + Sample images ============= @@ -59,7 +78,13 @@ and pipeline on 2D data. load_sample_images load_sample_image -.. note:: +.. image:: ../auto_examples/cluster/images/plot_color_quantization_1.png + :target: ../auto_examples/cluster/plot_color_quantization.html + :scale: 30 + :align: right + + +.. warning:: The default coding of images is based on the ``uint8`` dtype to spare memory. Often machine learning algorithms work best if the @@ -80,9 +105,14 @@ Sample generators In addition, scikit-learn includes various random sample generators that can be used to build artifical datasets of controled size and complexity. +.. 
image:: ../auto_examples/images/plot_random_dataset_1.png + :target: ../auto_examples/plot_random_dataset.html + :scale: 50 + :align: center + .. autosummary:: - :toctree: generated/ + :toctree: ../modules/generated/ :template: function.rst make_classification @@ -97,30 +127,30 @@ can be used to build artifical datasets of controled size and complexity. make_spd_matrix make_swiss_roll make_s_curve - + make_sparse_spd_matrix .. _libsvm_loader: Datasets in svmlight / libsvm format ==================================== -scikit-learn includes a fast utility function, ``load_svmlight_format``, to load +scikit-learn includes utility functions for loading datasets in the svmlight / libsvm format. In this format, each line takes the form ``