Adjusted Mutual Information #402

Merged
38 commits merged on Nov 10, 2011
Commits
2eb9833
Trying to fix NaN errors, but its not working. Pushing to work on it …
robertlayton Oct 14, 2011
c3d3906
Mutual information now works (tested!)
robertlayton Oct 15, 2011
9f05c11
AMI now works, and has been tested against the matlab code (test base…
robertlayton Oct 19, 2011
e473f70
Remove phantom double v-measure !?
robertlayton Oct 19, 2011
2d8677f
Added tests. There are two errors, but I'm going to bed. I'll fix the…
robertlayton Oct 19, 2011
af91582
Merge branch 'master' into ami
robertlayton Oct 20, 2011
770020f
Merge branch 'ami' of github.com:robertlayton/scikit-learn into ami
robertlayton Oct 20, 2011
193ac02
- AMI in the cluster examples
robertlayton Oct 20, 2011
867ec2f
Higher level import for ami_score
robertlayton Oct 20, 2011
18b9a78
There is an overflow problem. It can be reproduced with the plot_adju…
robertlayton Oct 20, 2011
aaf5c23
Narrative doc, and I think I fixed the overflow issue (more tests to …
robertlayton Oct 21, 2011
fb0c3f8
Fixed logs to match the matlab code results.
robertlayton Oct 21, 2011
db845fa
Test now tests a much larger array
robertlayton Oct 21, 2011
77ec530
Test actually does what I meant it to do, and works sufficiently
robertlayton Oct 21, 2011
52fc4c5
Fixed this example. Tested the others (they worked!)
robertlayton Oct 21, 2011
c021b59
pep8 and pyflakes
robertlayton Oct 21, 2011
3052291
Merge remote-tracking branch 'robertlayton/ami' into robertlayton-ami
ogrisel Oct 22, 2011
b7d7642
measure runtimes for various clustering metrics in adjusted for chanc…
ogrisel Oct 23, 2011
3845fd8
FIX warnings by avoiding 0.0 values in the log + cosmit
ogrisel Oct 23, 2011
b89b6d0
Merge pull request #3 from ogrisel/robertlayton-ami
robertlayton Oct 23, 2011
77ff098
Optimising the expected mutual information code
robertlayton Oct 23, 2011
13a48df
Adding old version of EMI, as I'm about to change it
robertlayton Oct 29, 2011
ccc3cad
This version doesn't work either. I am uploading for historical sake.
robertlayton Nov 2, 2011
fb6fe36
Initial usage of gammaln. Not yet tested
robertlayton Nov 2, 2011
a786bdc
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
robertlayton Nov 2, 2011
75bf5b0
Still overflows, but the closest so far. Using gammaln
robertlayton Nov 5, 2011
01ae29e
It works! Still have some optimisation to do, but it works for larger…
robertlayton Nov 5, 2011
f407372
Moved start and finish outside of loop
robertlayton Nov 6, 2011
88adf3b
comments, pep8 and pyflakes
robertlayton Nov 7, 2011
9f6ca7f
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
robertlayton Nov 7, 2011
2fa1052
ami_score -> adjusted_mutual_info_score
robertlayton Nov 7, 2011
e51e4d6
ami_score -> adjusted_mutual_info_score
robertlayton Nov 7, 2011
66f7a0b
"What's new?" AMI!
robertlayton Nov 8, 2011
b9685ff
Merge branch 'ami' of https://github.com/robertlayton/scikit-learn in…
robertlayton Nov 10, 2011
f26ed76
mutual_information_score -> mutual_info_score
robertlayton Nov 10, 2011
17ee6c4
and in plot_adjusted example (mutual_info_score)
robertlayton Nov 10, 2011
ff0dbf9
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn…
robertlayton Nov 10, 2011
118e8bd
cosmit
robertlayton Nov 10, 2011
137 changes: 137 additions & 0 deletions doc/modules/clustering.rst
@@ -472,6 +472,143 @@ by defining the adjusted Rand index as follows:
<http://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index>`_


Adjusted Mutual Information
---------------------------

Presentation and usage
~~~~~~~~~~~~~~~~~~~~~~

Given the knowledge of the ground truth class assignments ``labels_true``
and our clustering algorithm assignments of the same samples
``labels_pred``, the **Adjusted Mutual Information** is a function that
measures the **agreement** of the two assignments, ignoring permutations
and **with chance normalization**::

>>> from sklearn import metrics
>>> labels_true = [0, 0, 0, 1, 1, 1]
>>> labels_pred = [0, 0, 1, 1, 2, 2]

>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +ELLIPSIS
0.24...

One can permute 0 and 1 in the predicted labels, rename ``2`` to ``3``, and get
the same score::

>>> labels_pred = [1, 1, 0, 0, 3, 3]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +ELLIPSIS
0.24...

Furthermore, :func:`adjusted_mutual_info_score` is **symmetric**: swapping the
arguments does not change the score. It can thus be used as a **consensus
measure**::

>>> metrics.adjusted_mutual_info_score(labels_pred, labels_true) # doctest: +ELLIPSIS
0.24...

Perfect labeling is scored 1.0::

>>> labels_pred = labels_true[:]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred)
1.0

Bad (e.g. independent) labelings have scores of zero::

>>> labels_true = [0, 1, 2, 0, 3, 4, 5, 1]
>>> labels_pred = [1, 1, 0, 0, 2, 2, 2, 2]
>>> metrics.adjusted_mutual_info_score(labels_true, labels_pred) # doctest: +ELLIPSIS
0.0...


Advantages
~~~~~~~~~~

- **Random (uniform) label assignments have an AMI score close to 0.0**
for any value of ``n_clusters`` and ``n_samples`` (which is not the
case for raw Mutual Information or the V-measure for instance).

- **Bounded range [0, 1]**: Values close to zero indicate two label
assignments that are largely independent, while values close to one
indicate significant agreement. Further, values of exactly 0 indicate
**purely** independent label assignments and an AMI of exactly 1 indicates
that the two label assignments are equal (with or without permutation).

- **No assumption is made on the cluster structure**: the score can be used
  to compare clustering algorithms such as k-means, which assumes isotropic
  blob shapes, with results of spectral clustering algorithms, which can
  find clusters with "folded" shapes.


Drawbacks
~~~~~~~~~

- Contrary to inertia, **AMI requires knowledge of the ground truth
  classes**, which is almost never available in practice or requires manual
  assignment by human annotators (as in the supervised learning setting).

  However AMI can also be useful in a purely unsupervised setting, as a
  building block for a Consensus Index that can be used for clustering
  model selection.


.. topic:: Examples:

* :ref:`example_cluster_plot_adjusted_for_chance_measures.py`: Analysis of
the impact of the dataset size on the value of clustering measures
for random assignments. This example also includes the Adjusted Rand
Index.


Mathematical formulation
~~~~~~~~~~~~~~~~~~~~~~~~
Assume two label assignments (of the same data), :math:`U` with :math:`R`
classes and :math:`V` with :math:`C` classes. The entropy of either is the
amount of uncertainty in a partition, and can be calculated as:

.. math:: H(U) = - \sum_{i=1}^{|R|} P(i) \log(P(i))

Where :math:`P(i)` is the probability that an instance picked at random
from :math:`U` falls into class :math:`R_i`.
Likewise, for :math:`V`:

.. math:: H(V) = - \sum_{j=1}^{|C|} P'(j) \log(P'(j))

Where :math:`P'(j)` is the probability that an instance picked at random
from :math:`V` falls into class :math:`C_j`.

The (non-adjusted) mutual information between :math:`U` and :math:`V` is
calculated by:

.. math:: MI(U, V) = \sum_{i=1}^{|R|} \sum_{j=1}^{|C|} P(i, j) \log\left(\frac{P(i, j)}{P(i) P'(j)}\right)

Where :math:`P(i, j)` is the probability that an instance picked at random
falls into both class :math:`R_i` and class :math:`C_j`.

This value of the mutual information is not adjusted for chance and will tend
to increase as the number of different labels (clusters) increases, regardless
of the actual amount of "mutual information" between the label assignments.
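The raw MI defined above can be computed straight from the contingency table
of the two assignments; this is a minimal sketch of that computation (the PR
exposes the same quantity as ``mutual_info_score``, with its own
implementation):

```python
import numpy as np

def mutual_info(labels_true, labels_pred):
    # MI(U, V) from the contingency table; P(i, j) is the fraction of
    # samples falling in class R_i of U and class C_j of V.
    _, ui = np.unique(labels_true, return_inverse=True)
    _, vi = np.unique(labels_pred, return_inverse=True)
    contingency = np.zeros((ui.max() + 1, vi.max() + 1))
    np.add.at(contingency, (ui, vi), 1)
    pij = contingency / contingency.sum()
    pi = pij.sum(axis=1, keepdims=True)   # marginals P(i)
    pj = pij.sum(axis=0, keepdims=True)   # marginals P'(j)
    nz = pij > 0  # zero cells contribute nothing; avoids log(0)
    return float(np.sum(pij[nz] * np.log(pij[nz] / (pi * pj)[nz])))

print(mutual_info([0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 2, 2]))  # ~0.4621
```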

The expected value for the mutual information can be calculated using the
following equation, from Vinh, Epps, and Bailey (2009). In this equation,
:math:`a_i` is the number of instances in class :math:`R_i` of :math:`U` and
:math:`b_j` is the number of instances in class :math:`C_j` of :math:`V`.


.. math:: E\{MI(U, V)\} = \sum_{i=1}^{|R|} \sum_{j=1}^{|C|}
   \sum_{n_{ij}=(a_i+b_j-N)^+}^{\min(a_i, b_j)}
   \frac{n_{ij}}{N} \log\left(\frac{N \cdot n_{ij}}{a_i b_j}\right)
   \frac{a_i! \, b_j! \, (N-a_i)! \, (N-b_j)!}
        {N! \, n_{ij}! \, (a_i-n_{ij})! \, (b_j-n_{ij})! \, (N-a_i-b_j+n_{ij})!}
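As the commit history above records, evaluating these factorials directly
overflows for realistic sample sizes; working with log-factorials via scipy's
``gammaln`` sidesteps that. The sketch below mirrors the approach the PR
settled on (it is not the PR's exact code), taking a contingency matrix as
input:

```python
import numpy as np
from scipy.special import gammaln

def expected_mutual_info(contingency):
    # E{MI(U, V)} under the hypergeometric null model (Vinh et al., 2009).
    # gammaln(n + 1) == log(n!), so the products of factorials become
    # sums of logs and never overflow.
    a = contingency.sum(axis=1)  # row marginals a_i
    b = contingency.sum(axis=0)  # column marginals b_j
    N = contingency.sum()
    emi = 0.0
    for ai in a:
        for bj in b:
            # n_ij ranges over the support of the hypergeometric law;
            # the n_ij = 0 term contributes nothing, so start at 1.
            for nij in range(int(max(1, ai + bj - N)), int(min(ai, bj)) + 1):
                term = nij / N * np.log(N * nij / (ai * bj))
                # log-probability of observing n_ij under the null model
                log_p = (gammaln(ai + 1) + gammaln(bj + 1)
                         + gammaln(N - ai + 1) + gammaln(N - bj + 1)
                         - gammaln(N + 1) - gammaln(nij + 1)
                         - gammaln(ai - nij + 1) - gammaln(bj - nij + 1)
                         - gammaln(N - ai - bj + nij + 1))
                emi += term * np.exp(log_p)
    return emi
```

The triple loop makes this O(R · C · min(a, b)) — the same cost profile the
later commits spend time optimising.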

Using the expected value, the adjusted mutual information can then be
calculated using a similar form to that of the adjusted Rand index:

.. math:: AMI = \frac{MI - E\{MI\}}{\max(H(U), H(V)) - E\{MI\}}
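The final combination is simple arithmetic on MI, E{MI}, and the two
entropies; the numeric values below are illustrative placeholders, not
computed from a real clustering:

```python
def ami(mi, emi, h_u, h_v):
    # AMI = (MI - E{MI}) / (max(H(U), H(V)) - E{MI})
    return (mi - emi) / (max(h_u, h_v) - emi)

# When the assignments are identical, MI == H(U) == H(V), so AMI == 1.
print(ami(0.6931, 0.2500, 0.6931, 0.6931))  # 1.0
```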

.. topic:: References

* Vinh, Epps, and Bailey (2009). "Information theoretic measures
  for clusterings comparison". Proceedings of the 26th Annual International
  Conference on Machine Learning - ICML '09.
  doi:10.1145/1553374.1553511. ISBN 9781605585161.

* `Wikipedia entry for the Adjusted Mutual Information
<http://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_

Homogeneity, completeness and V-measure
---------------------------------------

5 changes: 4 additions & 1 deletion doc/whats_new.rst
@@ -19,7 +19,7 @@ Changelog
- Faster tests by `Fabian Pedregosa`_.

- Silhouette Coefficient cluster analysis evaluation metric added as
``sklearn.metrics.silhouette_score`` by Robert Layton.
``sklearn.metrics.silhouette_score`` by `Robert Layton`_.

- Fixed a bug in `KMeans` in the handling of the `n_init` parameter:
the clustering algorithm used to be run `n_init` times but the last
@@ -28,6 +28,9 @@ Changelog
- Minor refactoring in :ref:`sgd` module; consolidated dense and sparse
predict methods.

- Adjusted Mutual Information metric added as
``sklearn.metrics.adjusted_mutual_info_score`` by `Robert Layton`_.


API changes summary
-------------------
13 changes: 11 additions & 2 deletions examples/cluster/plot_adjusted_for_chance_measures.py
@@ -27,11 +27,12 @@

import numpy as np
import pylab as pl
from time import time
from sklearn import metrics


def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
fixed_n_classes=None, n_runs=10, seed=42):
fixed_n_classes=None, n_runs=5, seed=42):
"""Compute score for 2 random uniform cluster labelings.

Both random labelings have the same number of clusters for each value
@@ -58,6 +59,8 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
score_funcs = [
metrics.adjusted_rand_score,
metrics.v_measure_score,
metrics.adjusted_mutual_info_score,
metrics.mutual_info_score,
]

# 2 independent random clusterings with equal cluster number
@@ -73,9 +76,12 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
print "Computing %s for %d values of n_clusters and n_samples=%d" % (
score_func.__name__, len(n_clusters_range), n_samples)

t0 = time()
scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range)
print "done in %0.3fs" % (time() - t0)
plots.append(pl.errorbar(
n_clusters_range, scores.mean(axis=1), scores.std(axis=1)))
# n_clusters_range, scores.mean(axis=1), scores.std(axis=1)))
n_clusters_range, np.median(scores, axis=1), scores.std(axis=1)))
names.append(score_func.__name__)

pl.title("Clustering measures for 2 random uniform labelings\n"
@@ -86,6 +92,7 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
pl.ylim(ymin=-0.05, ymax=1.05)
pl.show()


# Random labeling with varying n_clusters against ground class labels
# with fixed number of clusters

@@ -101,8 +108,10 @@ def uniform_labelings_scores(score_func, n_samples, n_clusters_range,
print "Computing %s for %d values of n_clusters and n_samples=%d" % (
score_func.__name__, len(n_clusters_range), n_samples)

t0 = time()
scores = uniform_labelings_scores(score_func, n_samples, n_clusters_range,
fixed_n_classes=n_classes)
print "done in %0.3fs" % (time() - t0)
plots.append(pl.errorbar(
n_clusters_range, scores.mean(axis=1), scores.std(axis=1)))
names.append(score_func.__name__)
2 changes: 2 additions & 0 deletions examples/cluster/plot_affinity_propagation.py
@@ -40,6 +40,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels_true, labels)
print "Adjusted Mutual Information: %0.3f" % \
metrics.adjusted_mutual_info_score(labels_true, labels)
D = (S / np.min(S))
print ("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(D, labels, metric='precomputed'))
4 changes: 2 additions & 2 deletions examples/cluster/plot_color_quantization.py
@@ -10,7 +10,7 @@

In this example, pixels are represented in a 3D-space and K-means is used to
find 64 color clusters. In the image processing literature, the codebook
obtained from K-means (the cluster centers) is called the color palette. Using a
obtained from K-means (the cluster centers) is called the color palette. Using
a single byte, up to 256 colors can be addressed, whereas an RGB encoding
requires 3 bytes per pixel. The GIF file format, for example, uses such a
palette.
@@ -61,7 +61,7 @@
print "done in %0.3fs." % (time() - t0)


codebook_random = shuffle(image_array, random_state=0)[:n_colors+1]
codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
print "Predicting color indices on the full image (random)"
t0 = time()
dist = euclidean_distances(codebook_random, image_array, squared=True)
2 changes: 2 additions & 0 deletions examples/cluster/plot_dbscan.py
@@ -41,6 +41,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels_true, labels)
print "Adjusted Mutual Information: %0.3f" % \
metrics.adjusted_mutual_info_score(labels_true, labels)
print ("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(D, labels, metric='precomputed'))

12 changes: 6 additions & 6 deletions examples/cluster/plot_kmeans_digits.py
@@ -48,8 +48,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
#print ("Silhouette Coefficient: %0.3f" %
# metrics.silhouette_score(D, km.labels_, metric='precomputed'))
print "Adjusted Mutual Information: %0.3f" % \
metrics.adjusted_mutual_info_score(labels, km.labels_)
print ("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(data, km.labels_,
metric='euclidean', sample_size=sample_size))
@@ -65,8 +65,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
#print ("Silhouette Coefficient: %0.3f" %
# metrics.silhouette_score(D, km.labels_, metric='precomputed'))
print "Adjusted Mutual Information: %0.3f" % \
metrics.adjusted_mutual_info_score(labels, km.labels_)
print ("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(data, km.labels_,
metric='euclidean', sample_size=sample_size))
@@ -85,8 +85,8 @@
print "V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)
print "Adjusted Rand Index: %0.3f" % \
metrics.adjusted_rand_score(labels, km.labels_)
#print ("Silhouette Coefficient: %0.3f" %
# metrics.silhouette_score(D, km.labels_, metric='precomputed'))
print "Adjusted Mutual Information: %0.3f" % \
metrics.adjusted_mutual_info_score(labels, km.labels_)
print ("Silhouette Coefficient: %0.3f" %
metrics.silhouette_score(data, km.labels_,
metric='euclidean', sample_size=sample_size))
2 changes: 2 additions & 0 deletions sklearn/metrics/__init__.py
@@ -16,4 +16,6 @@
from .cluster import completeness_score
from .cluster import v_measure_score
from .cluster import silhouette_score
from .cluster import mutual_info_score
from .cluster import adjusted_mutual_info_score
from .pairwise import euclidean_distances, pairwise_distances, pairwise_kernels
11 changes: 7 additions & 4 deletions sklearn/metrics/cluster/__init__.py
@@ -5,7 +5,10 @@
- supervised, which uses a ground truth class values for each sample.
- unsupervised, which does not and measures the 'quality' of the model itself.
"""
from .supervised import (homogeneity_completeness_v_measure,
homogeneity_score, completeness_score,
v_measure_score, adjusted_rand_score)
from .unsupervised import silhouette_score, silhouette_samples
from supervised import (homogeneity_completeness_v_measure,
homogeneity_score, completeness_score,
v_measure_score, adjusted_rand_score,
adjusted_mutual_info_score,
expected_mutual_information, mutual_info_score,
contingency_matrix)
from unsupervised import silhouette_score, silhouette_samples