sklearn/metrics/tests/test_common.py

from functools import partial
from inspect import signature
from itertools import chain, permutations, product

import numpy as np
import pytest

from sklearn._config import config_context
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    brier_score_loss,
    cohen_kappa_score,
    confusion_matrix,
    coverage_error,
    d2_absolute_error_score,
    d2_pinball_score,
    d2_tweedie_score,
    dcg_score,
    det_curve,
    explained_variance_score,
    f1_score,
    fbeta_score,
    hamming_loss,
    hinge_loss,
    jaccard_score,
    label_ranking_average_precision_score,
    label_ranking_loss,
    log_loss,
    matthews_corrcoef,
    max_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_gamma_deviance,
    mean_pinball_loss,
    mean_poisson_deviance,
    mean_squared_error,
    mean_tweedie_deviance,
    median_absolute_error,
    multilabel_confusion_matrix,
    ndcg_score,
    precision_recall_curve,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    top_k_accuracy_score,
    zero_one_loss,
)
from sklearn.metrics._base import _average_binary_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle
from sklearn.utils._array_api import (
    _atol_for_type,
    _convert_to_numpy,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
    _array_api_for_tests,
    assert_allclose,
    assert_almost_equal,
    assert_array_equal,
    assert_array_less,
    ignore_warnings,
)
from sklearn.utils.fixes import COO_CONTAINERS
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _num_samples, check_random_state

# Note toward developers about metric testing
# -------------------------------------------
# It is often possible to write one general test for several metrics:
#
#   - invariance properties, e.g. invariance to sample order
#   - common behavior for an argument, e.g. the "normalize" with value True
#     will return the mean of the metrics and with value False will return
#     the sum of the metrics.
#
# In order to improve the overall metric testing, it is a good idea to write
# first a specific test for the given metric and then add a general test for
# all metrics that have the same behavior.
#
# Two types of datastructures are used in order to implement this system:
# dictionaries of metrics and lists of metrics with common properties.
#
# Dictionaries of metrics
# ------------------------
# The goal of having those dictionaries is to have an easy way to call a
# particular metric and associate a name to each function:
#
#   - REGRESSION_METRICS: all regression metrics.
#   - CLASSIFICATION_METRICS: all classification metrics
#     which compare a ground truth and the estimated targets as returned by a
#     classifier.
#   - THRESHOLDED_METRICS: all classification metrics which
#     compare a ground truth and a score, e.g. estimated probabilities or
#     decision function (format might vary)
#
# Those dictionaries will be used to test systematically some invariance
# properties, e.g. invariance toward several input layout.
#

REGRESSION_METRICS = {
    "max_error": max_error,
    "mean_absolute_error": mean_absolute_error,
    "mean_squared_error": mean_squared_error,
    "mean_pinball_loss": mean_pinball_loss,
    "median_absolute_error": median_absolute_error,
    "mean_absolute_percentage_error": mean_absolute_percentage_error,
    "explained_variance_score": explained_variance_score,
    "r2_score": partial(r2_score, multioutput="variance_weighted"),
    "mean_normal_deviance": partial(mean_tweedie_deviance, power=0),
    "mean_poisson_deviance": mean_poisson_deviance,
    "mean_gamma_deviance": mean_gamma_deviance,
    "mean_compound_poisson_deviance": partial(mean_tweedie_deviance, power=1.4),
    "d2_tweedie_score": partial(d2_tweedie_score, power=1.4),
    "d2_pinball_score": d2_pinball_score,
    "d2_absolute_error_score": d2_absolute_error_score,
}

CLASSIFICATION_METRICS = {
    "accuracy_score": accuracy_score,
    "balanced_accuracy_score": balanced_accuracy_score,
    "adjusted_balanced_accuracy_score": partial(balanced_accuracy_score, adjusted=True),
    "unnormalized_accuracy_score": partial(accuracy_score, normalize=False),
    # `confusion_matrix` returns absolute values and hence behaves unnormalized
    # . Naming it with an unnormalized_ prefix is necessary for this module to
    # skip sample_weight scaling checks which will fail for unnormalized
    # metrics.
    "unnormalized_confusion_matrix": confusion_matrix,
    "normalized_confusion_matrix": lambda *args, **kwargs: (
        confusion_matrix(*args, **kwargs).astype("float")
        / confusion_matrix(*args, **kwargs).sum(axis=1)[:, np.newaxis]
    ),
    "unnormalized_multilabel_confusion_matrix": multilabel_confusion_matrix,
    "unnormalized_multilabel_confusion_matrix_sample": partial(
        multilabel_confusion_matrix, samplewise=True
    ),
    "hamming_loss": hamming_loss,
    "zero_one_loss": zero_one_loss,
    "unnormalized_zero_one_loss": partial(zero_one_loss, normalize=False),
    # These are needed to test averaging
    "jaccard_score": jaccard_score,
    "precision_score": precision_score,
    "recall_score": recall_score,
    "f1_score": f1_score,
    "f2_score": partial(fbeta_score, beta=2),
    "f0.5_score": partial(fbeta_score, beta=0.5),
    "matthews_corrcoef_score": matthews_corrcoef,
    "weighted_f0.5_score": partial(fbeta_score, average="weighted", beta=0.5),
    "weighted_f1_score": partial(f1_score, average="weighted"),
    "weighted_f2_score": partial(fbeta_score, average="weighted", beta=2),
    "weighted_precision_score": partial(precision_score, average="weighted"),
    "weighted_recall_score": partial(recall_score, average="weighted"),
    "weighted_jaccard_score": partial(jaccard_score, average="weighted"),
    "micro_f0.5_score": partial(fbeta_score, average="micro", beta=0.5),
    "micro_f1_score": partial(f1_score, average="micro"),
    "micro_f2_score": partial(fbeta_score, average="micro", beta=2),
    "micro_precision_score": partial(precision_score, average="micro"),
    "micro_recall_score": partial(recall_score, average="micro"),
    "micro_jaccard_score": partial(jaccard_score, average="micro"),
    "macro_f0.5_score": partial(fbeta_score, average="macro", beta=0.5),
    "macro_f1_score": partial(f1_score, average="macro"),
    "macro_f2_score": partial(fbeta_score, average="macro", beta=2),
    "macro_precision_score": partial(precision_score, average="macro"),
    "macro_recall_score": partial(recall_score, average="macro"),
    "macro_jaccard_score": partial(jaccard_score, average="macro"),
    "samples_f0.5_score": partial(fbeta_score, average="samples", beta=0.5),
    "samples_f1_score": partial(f1_score, average="samples"),
    "samples_f2_score": partial(fbeta_score, average="samples", beta=2),
    "samples_precision_score": partial(precision_score, average="samples"),
    "samples_recall_score": partial(recall_score, average="samples"),
    "samples_jaccard_score": partial(jaccard_score, average="samples"),
    "cohen_kappa_score": cohen_kappa_score,
}


def precision_recall_curve_padded_thresholds(*args, **kwargs):
    """
    The dimensions of precision-recall pairs and the threshold array as
    returned by the precision_recall_curve do not match. See
    func:`sklearn.metrics.precision_recall_curve`

    This prevents implicit conversion of return value triple to an higher
    dimensional np.array of dtype('float64') (it will be of dtype('object)
    instead). This again is needed for assert_array_equal to work correctly.

    As a workaround we pad the threshold array with NaN values to match
    the dimension of precision and recall arrays respectively.
    """
    precision, recall, thresholds = precision_recall_curve(*args, **kwargs)

    pad_threshholds = len(precision) - len(thresholds)

    return np.array(
        [
            precision,
            recall,
            np.pad(
                thresholds.astype(np.float64),
                pad_width=(0, pad_threshholds),
                mode="constant",
                constant_values=[np.nan],
            ),
        ]
    )


CURVE_METRICS = {
    "roc_curve": roc_curve,
    "precision_recall_curve": precision_recall_curve_padded_thresholds,
    "det_curve": det_curve,
}

THRESHOLDED_METRICS = {
    "coverage_error": coverage_error,
    "label_ranking_loss": label_ranking_loss,
    "log_loss": log_loss,
    "unnormalized_log_loss": partial(log_loss, normalize=False),
    "hinge_loss": hinge_loss,
    "brier_score_loss": brier_score_loss,
    "roc_auc_score": roc_auc_score,  # default: average="macro"
    "weighted_roc_auc": partial(roc_auc_score, average="weighted"),
    "samples_roc_auc": partial(roc_auc_score, average="samples"),
    "micro_roc_auc": partial(roc_auc_score, average="micro"),
    "ovr_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovr"),
    "weighted_ovr_roc_auc": partial(
        roc_auc_score, average="weighted", multi_class="ovr"
    ),
    "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class="ovo"),
    "weighted_ovo_roc_auc": partial(
        roc_auc_score, average="weighted", multi_class="ovo"
    ),
    "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5),
    "average_precision_score": average_precision_score,  # default: average="macro"
    "weighted_average_precision_score": partial(
        average_precision_score, average="weighted"
    ),
    "samples_average_precision_score": partial(
        average_precision_score, average="samples"
    ),
    "micro_average_precision_score": partial(average_precision_score, average="micro"),
    "label_ranking_average_precision_score": label_ranking_average_precision_score,
    "ndcg_score": ndcg_score,
    "dcg_score": dcg_score,
    "top_k_accuracy_score": top_k_accuracy_score,
}

ALL_METRICS = dict()
ALL_METRICS.update(THRESHOLDED_METRICS)
ALL_METRICS.update(CLASSIFICATION_METRICS)
ALL_METRICS.update(REGRESSION_METRICS)
ALL_METRICS.update(CURVE_METRICS)

# Lists of metrics with common properties
# ---------------------------------------
# Lists of metrics with common properties are used to test systematically some
# functionalities and invariance, e.g. SYMMETRIC_METRICS lists all metrics that
# are symmetric with respect to their input argument y_true and y_pred.
#
# When you add a new metric or functionality, check if a general test
# is already written.

# Those metrics don't support binary inputs
METRIC_UNDEFINED_BINARY = {
    "samples_f0.5_score",
    "samples_f1_score",
    "samples_f2_score",
    "samples_precision_score",
    "samples_recall_score",
    "samples_jaccard_score",
    "coverage_error",
    "unnormalized_multilabel_confusion_matrix_sample",
    "label_ranking_loss",
    "label_ranking_average_precision_score",
    "dcg_score",
    "ndcg_score",
}

# Those metrics don't support multiclass inputs
METRIC_UNDEFINED_MULTICLASS = {
    "brier_score_loss",
    "micro_roc_auc",
    "samples_roc_auc",
    "partial_roc_auc",
    "roc_auc_score",
    "weighted_roc_auc",
    "jaccard_score",
    # with default average='binary', multiclass is prohibited
    "precision_score",
    "recall_score",
    "f1_score",
    "f2_score",
    "f0.5_score",
    # curves
    "roc_curve",
    "precision_recall_curve",
    "det_curve",
}

# Metric undefined with "binary" or "multiclass" input
METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union(
    METRIC_UNDEFINED_MULTICLASS
)

# Metrics with an "average" argument
METRICS_WITH_AVERAGING = {
    "precision_score",
    "recall_score",
    "f1_score",
    "f2_score",
    "f0.5_score",
    "jaccard_score",
}

# Threshold-based metrics with an "average" argument
THRESHOLDED_METRICS_WITH_AVERAGING = {
    "roc_auc_score",
    "average_precision_score",
    "partial_roc_auc",
}

# Metrics with a "pos_label" argument
METRICS_WITH_POS_LABEL = {
    "roc_curve",
    "precision_recall_curve",
    "det_curve",
    "brier_score_loss",
    "precision_score",
    "recall_score",
    "f1_score",
    "f2_score",
    "f0.5_score",
    "jaccard_score",
    "average_precision_score",
    "weighted_average_precision_score",
    "micro_average_precision_score",
    "samples_average_precision_score",
}

# Metrics with a "labels" argument
# TODO: Handle multi_class metrics that has a labels argument as well as a
# decision function argument. e.g hinge_loss
METRICS_WITH_LABELS = {
    "unnormalized_confusion_matrix",
    "normalized_confusion_matrix",
    "roc_curve",
    "precision_recall_curve",
    "det_curve",
    "precision_score",
    "recall_score",
    "f1_score",
    "f2_score",
    "f0.5_score",
    "jaccard_score",
    "weighted_f0.5_score",
    "weighted_f1_score",
    "weighted_f2_score",
    "weighted_precision_score",
    "weighted_recall_score",
    "weighted_jaccard_score",
    "micro_f0.5_score",
    "micro_f1_score",
    "micro_f2_score",
    "micro_precision_score",
    "micro_recall_score",
    "micro_jaccard_score",
    "macro_f0.5_score",
    "macro_f1_score",
    "macro_f2_score",
    "macro_precision_score",
    "macro_recall_score",
    "macro_jaccard_score",
    "unnormalized_multilabel_confusion_matrix",
    "unnormalized_multilabel_confusion_matrix_sample",
    "cohen_kappa_score",
}

# Metrics with a "normalize" option
METRICS_WITH_NORMALIZE_OPTION = {
    "accuracy_score",
    "top_k_accuracy_score",
    "zero_one_loss",
}

# Threshold-based metrics with "multilabel-indicator" format support
THRESHOLDED_MULTILABEL_METRICS = {
    "log_loss",
    "unnormalized_log_loss",
    "roc_auc_score",
    "weighted_roc_auc",
    "samples_roc_auc",
    "micro_roc_auc",
    "partial_roc_auc",
    "average_precision_score",
    "weighted_average_precision_score",
    "samples_average_precision_score",
    "micro_average_precision_score",
    "coverage_error",
    "label_ranking_loss",
    "ndcg_score",
    "dcg_score",
    "label_ranking_average_precision_score",
}

# Classification metrics with  "multilabel-indicator" format
MULTILABELS_METRICS = {
    "accuracy_score",
    "unnormalized_accuracy_score",
    "hamming_loss",
    "zero_one_loss",
    "unnormalized_zero_one_loss",
    "weighted_f0.5_score",
    "weighted_f1_score",
    "weighted_f2_score",
    "weighted_precision_score",
    "weighted_recall_score",
    "weighted_jaccard_score",
    "macro_f0.5_score",
    "macro_f1_score",
    "macro_f2_score",
    "macro_precision_score",
    "macro_recall_score",
    "macro_jaccard_score",
    "micro_f0.5_score",
    "micro_f1_score",
    "micro_f2_score",
    "micro_precision_score",
    "micro_recall_score",
    "micro_jaccard_score",
    "unnormalized_multilabel_confusion_matrix",
    "samples_f0.5_score",
    "samples_f1_score",
    "samples_f2_score",
    "samples_precision_score",
    "samples_recall_score",
    "samples_jaccard_score",
}

# Regression metrics with "multioutput-continuous" format support
MULTIOUTPUT_METRICS = {
    "mean_absolute_error",
    "median_absolute_error",
    "mean_squared_error",
    "r2_score",
    "explained_variance_score",
    "mean_absolute_percentage_error",
    "mean_pinball_loss",
    "d2_pinball_score",
    "d2_absolute_error_score",
}

# Symmetric with respect to their input arguments y_true and y_pred
# metric(y_true, y_pred) == metric(y_pred, y_true).
SYMMETRIC_METRICS = {
    "accuracy_score",
    "unnormalized_accuracy_score",
    "hamming_loss",
    "zero_one_loss",
    "unnormalized_zero_one_loss",
    "micro_jaccard_score",
    "macro_jaccard_score",
    "jaccard_score",
    "samples_jaccard_score",
    "f1_score",
    "micro_f1_score",
    "macro_f1_score",
    "weighted_recall_score",
    # P = R = F = accuracy in multiclass case
    "micro_f0.5_score",
    "micro_f1_score",
    "micro_f2_score",
    "micro_precision_score",
    "micro_recall_score",
    "matthews_corrcoef_score",
    "mean_absolute_error",
    "mean_squared_error",
    "median_absolute_error",
    "max_error",
    # Pinball loss is only symmetric for alpha=0.5 which is the default.
    "mean_pinball_loss",
    "cohen_kappa_score",
    "mean_normal_deviance",
}

# Asymmetric with respect to their input arguments y_true and y_pred
# metric(y_true, y_pred) != metric(y_pred, y_true).
NOT_SYMMETRIC_METRICS = {
    "balanced_accuracy_score",
    "adjusted_balanced_accuracy_score",
    "explained_variance_score",
    "r2_score",
    "unnormalized_confusion_matrix",
    "normalized_confusion_matrix",
    "roc_curve",
    "precision_recall_curve",
    "det_curve",
    "precision_score",
    "recall_score",
    "f2_score",
    "f0.5_score",
    "weighted_f0.5_score",
    "weighted_f1_score",
    "weighted_f2_score",
    "weighted_precision_score",
    "weighted_jaccard_score",
    "unnormalized_multilabel_confusion_matrix",
    "macro_f0.5_score",
    "macro_f2_score",
    "macro_precision_score",
    "macro_recall_score",
    "hinge_loss",
    "mean_gamma_deviance",
    "mean_poisson_deviance",
    "mean_compound_poisson_deviance",
    "d2_tweedie_score",
    "d2_pinball_score",
    "d2_absolute_error_score",
    "mean_absolute_percentage_error",
}


# No Sample weight support
METRICS_WITHOUT_SAMPLE_WEIGHT = {
    "median_absolute_error",
    "max_error",
    "ovo_roc_auc",
    "weighted_ovo_roc_auc",
}

METRICS_REQUIRE_POSITIVE_Y = {
    "mean_poisson_deviance",
    "mean_gamma_deviance",
    "mean_compound_poisson_deviance",
    "d2_tweedie_score",
}


def _require_positive_targets(y1, y2):
    """Make targets strictly positive"""
    offset = abs(min(y1.min(), y2.min())) + 1
    y1 += offset
    y2 += offset
    return y1, y2


def test_symmetry_consistency():
    # We shouldn't forget any metrics
    assert (
        SYMMETRIC_METRICS
        | NOT_SYMMETRIC_METRICS
        | set(THRESHOLDED_METRICS)
        | METRIC_UNDEFINED_BINARY_MULTICLASS
    ) == set(ALL_METRICS)

    assert (SYMMETRIC_METRICS & NOT_SYMMETRIC_METRICS) == set()


@pytest.mark.parametrize("name", sorted(SYMMETRIC_METRICS))
def test_symmetric_metric(name):
    # Test the symmetry of score and loss functions
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20,))
    y_pred = random_state.randint(0, 2, size=(20,))

    if name in METRICS_REQUIRE_POSITIVE_Y:
        y_true, y_pred = _require_positive_targets(y_true, y_pred)

    y_true_bin = random_state.randint(0, 2, size=(20, 25))
    y_pred_bin = random_state.randint(0, 2, size=(20, 25))

    metric = ALL_METRICS[name]
    if name in METRIC_UNDEFINED_BINARY:
        if name in MULTILABELS_METRICS:
            assert_allclose(
                metric(y_true_bin, y_pred_bin),
                metric(y_pred_bin, y_true_bin),
                err_msg="%s is not symmetric" % name,
            )
        else:
            assert False, "This case is currently unhandled"
    else:
        assert_allclose(
            metric(y_true, y_pred),
            metric(y_pred, y_true),
            err_msg="%s is not symmetric" % name,
        )


@pytest.mark.parametrize("name", sorted(NOT_SYMMETRIC_METRICS))
def test_not_symmetric_metric(name):
    # Test the symmetry of score and loss functions
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20,))
    y_pred = random_state.randint(0, 2, size=(20,))

    if name in METRICS_REQUIRE_POSITIVE_Y:
        y_true, y_pred = _require_positive_targets(y_true, y_pred)

    metric = ALL_METRICS[name]

    # use context manager to supply custom error message
    with pytest.raises(AssertionError):
        assert_array_equal(metric(y_true, y_pred), metric(y_pred, y_true))
        raise ValueError("%s seems to be symmetric" % name)


@pytest.mark.parametrize(
    "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)
)
def test_sample_order_invariance(name):
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(20,))
    y_pred = random_state.randint(0, 2, size=(20,))

    if name in METRICS_REQUIRE_POSITIVE_Y:
        y_true, y_pred = _require_positive_targets(y_true, y_pred)

    y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0)

    with ignore_warnings():
        metric = ALL_METRICS[name]
        assert_allclose(
            metric(y_true, y_pred),
            metric(y_true_shuffle, y_pred_shuffle),
            err_msg="%s is not sample order invariant" % name,
        )


@ignore_warnings
def test_sample_order_invariance_multilabel_and_multioutput():
    random_state = check_random_state(0)

    # Generate some data
    y_true = random_state.randint(0, 2, size=(20, 25))
    y_pred = random_state.randint(0, 2, size=(20, 25))
    y_score = random_state.uniform(size=y_true.shape)

    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
    y_score /= y_score.sum(axis=1, keepdims=True)

    y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(
        y_true, y_pred, y_score, random_state=0
    )

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(
            metric(y_true, y_pred),
            metric(y_true_shuffle, y_pred_shuffle),
            err_msg="%s is not sample order invariant" % name,
        )

    for name in THRESHOLDED_MULTILABEL_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(
            metric(y_true, y_score),
            metric(y_true_shuffle, y_score_shuffle),
            err_msg="%s is not sample order invariant" % name,
        )

    for name in MULTIOUTPUT_METRICS:
        metric = ALL_METRICS[name]
        assert_allclose(
            metric(y_true, y_score),
            metric(y_true_shuffle, y_score_shuffle),
            err_msg="%s is not sample order invariant" % name,
        )
        assert_allclose(
            metric(y_true, y_pred),
            metric(y_true_shuffle, y_pred_shuffle),
            err_msg="%s is not sample order invariant" % name,
        )


@pytest.mark.parametrize(
    "name", sorted(set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)
)
def test_format_invariance_with_1d_vectors(name):
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20,))
    y2 = random_state.randint(0, 2, size=(20,))

    if name in METRICS_REQUIRE_POSITIVE_Y:
        y1, y2 = _require_positive_targets(y1, y2)

    y1_list = list(y1)
    y2_list = list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_array_equal(y1_1d.ndim, 1)
    assert_array_equal(y2_1d.ndim, 1)
    y1_column = np.reshape(y1_1d, (-1, 1))
    y2_column = np.reshape(y2_1d, (-1, 1))
    y1_row = np.reshape(y1_1d, (1, -1))
    y2_row = np.reshape(y2_1d, (1, -1))

    with ignore_warnings():
        metric = ALL_METRICS[name]

        measure = metric(y1, y2)

        assert_allclose(
            metric(y1_list, y2_list),
            measure,
            err_msg="%s is not representation invariant with list" % name,
        )

        assert_allclose(
            metric(y1_1d, y2_1d),
            measure,
            err_msg="%s is not representation invariant with np-array-1d" % name,
        )

        assert_allclose(
            metric(y1_column, y2_column),
            measure,
            err_msg="%s is not representation invariant with np-array-column" % name,
        )

        # Mix format support
        assert_allclose(
            metric(y1_1d, y2_list),
            measure,
            err_msg="%s is not representation invariant with mix np-array-1d and list"
            % name,
        )

        assert_allclose(
            metric(y1_list, y2_1d),
            measure,
            err_msg="%s is not representation invariant with mix np-array-1d and list"
            % name,
        )

        assert_allclose(
            metric(y1_1d, y2_column),
            measure,
            err_msg=(
                "%s is not representation invariant with mix "
                "np-array-1d and np-array-column"
            )
            % name,
        )

        assert_allclose(
            metric(y1_column, y2_1d),
            measure,
            err_msg=(
                "%s is not representation invariant with mix "
                "np-array-1d and np-array-column"
            )
            % name,
        )

        assert_allclose(
            metric(y1_list, y2_column),
            measure,
            err_msg=(
                "%s is not representation invariant with mix list and np-array-column"
            )
            % name,
        )

        assert_allclose(
            metric(y1_column, y2_list),
            measure,
            err_msg=(
                "%s is not representation invariant with mix list and np-array-column"
            )
            % name,
        )

        # These mix representations aren't allowed
        with pytest.raises(ValueError):
            metric(y1_1d, y2_row)
        with pytest.raises(ValueError):
            metric(y1_row, y2_1d)
        with pytest.raises(ValueError):
            metric(y1_list, y2_row)
        with pytest.raises(ValueError):
            metric(y1_row, y2_list)
        with pytest.raises(ValueError):
            metric(y1_column, y2_row)
        with pytest.raises(ValueError):
            metric(y1_row, y2_column)

        # NB: We do not test for y1_row, y2_row as these may be
        # interpreted as multilabel or multioutput data.
        if name not in (
            MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS
        ):
            with pytest.raises(ValueError):
                metric(y1_row, y2_row)


@pytest.mark.parametrize(
    "name", sorted(set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)
)
def test_classification_invariance_string_vs_numbers_labels(name):
    # Ensure that classification metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20,))
    y2 = random_state.randint(0, 2, size=(20,))

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    with ignore_warnings():
        metric = CLASSIFICATION_METRICS[name]
        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)

        assert_array_equal(
            measure_with_number,
            measure_with_str,
            err_msg="{0} failed string vs number invariance test".format(name),
        )

        measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O"))
        assert_array_equal(
            measure_with_number,
            measure_with_strobj,
            err_msg="{0} failed string object vs number invariance test".format(name),
        )

        if name in METRICS_WITH_LABELS:
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(
                measure_with_number,
                measure_with_str,
                err_msg="{0} failed string vs number  invariance test".format(name),
            )

            measure_with_strobj = metric_str(y1_str.astype("O"), y2_str.astype("O"))
            assert_array_equal(
                measure_with_number,
                measure_with_strobj,
                err_msg="{0} failed string vs number  invariance test".format(name),
            )


@pytest.mark.parametrize("name", THRESHOLDED_METRICS)
def test_thresholded_invariance_string_vs_numbers_labels(name):
    # Ensure that thresholded metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20,))
    y2 = random_state.randint(0, 2, size=(20,))

    y1_str = np.array(["eggs", "spam"])[y1]

    pos_label_str = "spam"

    with ignore_warnings():
        metric = THRESHOLDED_METRICS[name]
        if name not in METRIC_UNDEFINED_BINARY:
            # Ugly, but handle case with a pos_label and label
            metric_str = metric
            if name in METRICS_WITH_POS_LABEL:
                metric_str = partial(metric_str, pos_label=pos_label_str)

            measure_with_number = metric(y1, y2)
            measure_with_str = metric_str(y1_str, y2)
            assert_array_equal(
                measure_with_number,
                measure_with_str,
                err_msg="{0} failed string vs number invariance test".format(name),
            )

            measure_with_strobj = metric_str(y1_str.astype("O"), y2)
            assert_array_equal(
                measure_with_number,
                measure_with_strobj,
                err_msg="{0} failed string object vs number invariance test".format(
                    name
                ),
            )
        else:
            # TODO those metrics doesn't support string label yet
            with pytest.raises(ValueError):
                metric(y1_str, y2)
            with pytest.raises(ValueError):
                metric(y1_str.astype("O"), y2)


invalids_nan_inf = [
    ([0, 1], [np.inf, np.inf]),
    ([0, 1], [np.nan, np.nan]),
    ([0, 1], [np.nan, np.inf]),
    ([0, 1], [np.inf, 1]),
    ([0, 1], [np.nan, 1]),
]


@pytest.mark.parametrize(
    "metric", chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values())
)
@pytest.mark.parametrize("y_true, y_score", invalids_nan_inf)
def test_regression_thresholded_inf_nan_input(metric, y_true, y_score):
    # Reshape since coverage_error only accepts 2D arrays.
    if metric == coverage_error:
        y_true = [y_true]
        y_score = [y_score]
    with pytest.raises(ValueError, match=r"contains (NaN|infinity)"):
        metric(y_true, y_score)


@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values())
@pytest.mark.parametrize(
    "y_true, y_score",
    invalids_nan_inf +
    # Add an additional case for classification only
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/6809
    [
        ([np.nan, 1, 2], [1, 2, 3]),
        ([np.inf, 1, 2], [1, 2, 3]),
    ],  # type: ignore
)
def test_classification_inf_nan_input(metric, y_true, y_score):
    """check that classification metrics raise a message mentioning the
    occurrence of non-finite values in the target vectors."""
    if not np.isfinite(y_true).all():
        input_name = "y_true"
        if np.isnan(y_true).any():
            unexpected_value = "NaN"
        else:
            unexpected_value = "infinity or a value too large"
    else:
        input_name = "y_pred"
        if np.isnan(y_score).any():
            unexpected_value = "NaN"
        else:
            unexpected_value = "infinity or a value too large"

    err_msg = f"Input {input_name} contains {unexpected_value}"

    with pytest.raises(ValueError, match=err_msg):
        metric(y_true, y_score)


@pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values())
def test_classification_binary_continuous_input(metric):
    """check that classification metrics raise a message of mixed type data
    with continuous/binary target vectors."""
    y_true, y_score = ["a", "b", "a"], [0.1, 0.2, 0.3]
    err_msg = (
        "Classification metrics can't handle a mix of binary and continuous targets"
    )
    with pytest.raises(ValueError, match=err_msg):
        metric(y_true, y_score)


@ignore_warnings
def check_single_sample(name):
    # Non-regression test: scores should work with a single sample.
    # This is important for leave-one-out cross validation.
    # Score functions tested are those that formerly called np.squeeze,
    # which turns an array of size 1 into a 0-d array (!).
    metric = ALL_METRICS[name]

    # assert that no exception is thrown
    if name in METRICS_REQUIRE_POSITIVE_Y:
        values = [1, 2]
    else:
        values = [0, 1]
    for i, j in product(values, repeat=2):
        metric([i], [j])


@ignore_warnings
def check_single_sample_multioutput(name):
    metric = ALL_METRICS[name]
    for i, j, k, l in product([0, 1], repeat=4):
        metric(np.array([[i, j]]), np.array([[k, l]]))


@pytest.mark.parametrize(
    "name",
    sorted(
        set(ALL_METRICS)
        # Those metrics are not always defined with one sample
        # or in multiclass classification
        - METRIC_UNDEFINED_BINARY_MULTICLASS
        - set(THRESHOLDED_METRICS)
    ),
)
def test_single_sample(name):
    check_single_sample(name)


@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS | MULTILABELS_METRICS))
def test_single_sample_multioutput(name):
    check_single_sample_multioutput(name)


@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS))
def test_multioutput_number_of_output_differ(name):
    y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]])
    y_pred = np.array([[0, 0], [1, 0], [0, 0]])

    metric = ALL_METRICS[name]
    with pytest.raises(ValueError):
        metric(y_true, y_pred)


@pytest.mark.parametrize("name", sorted(MULTIOUTPUT_METRICS))
def test_multioutput_regression_invariance_to_dimension_shuffling(name):
    # test invariance to dimension shuffling
    random_state = check_random_state(0)
    y_true = random_state.uniform(0, 2, size=(20, 5))
    y_pred = random_state.uniform(0, 2, size=(20, 5))

    metric = ALL_METRICS[name]
    error = metric(y_true, y_pred)

    for _ in range(3):
        perm = random_state.permutation(y_true.shape[1])
        assert_allclose(
            metric(y_true[:, perm], y_pred[:, perm]),
            error,
            err_msg="%s is not dimension shuffling invariant" % (name),
        )


@ignore_warnings
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_multilabel_representation_invariance(coo_container):
    # Generate some data
    n_classes = 4
    n_samples = 50

    _, y1 = make_multilabel_classification(
        n_features=1,
        n_classes=n_classes,
        random_state=0,
        n_samples=n_samples,
        allow_unlabeled=True,
    )
    _, y2 = make_multilabel_classification(
        n_features=1,
        n_classes=n_classes,
        random_state=1,
        n_samples=n_samples,
        allow_unlabeled=True,
    )

    # To make sure at least one empty label is present
    y1 = np.vstack([y1, [[0] * n_classes]])
    y2 = np.vstack([y2, [[0] * n_classes]])

    y1_sparse_indicator = coo_container(y1)
    y2_sparse_indicator = coo_container(y2)

    y1_list_array_indicator = list(y1)
    y2_list_array_indicator = list(y2)

    y1_list_list_indicator = [list(a) for a in y1_list_array_indicator]
    y2_list_list_indicator = [list(a) for a in y2_list_array_indicator]

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = "tmp"
            metric.__name__ = name

        measure = metric(y1, y2)

        # Check representation invariance
        assert_allclose(
            metric(y1_sparse_indicator, y2_sparse_indicator),
            measure,
            err_msg=(
                "%s failed representation invariance between "
                "dense and sparse indicator formats."
            )
            % name,
        )
        assert_almost_equal(
            metric(y1_list_list_indicator, y2_list_list_indicator),
            measure,
            err_msg=(
                "%s failed representation invariance  "
                "between dense array and list of list "
                "indicator formats."
            )
            % name,
        )
        assert_almost_equal(
            metric(y1_list_array_indicator, y2_list_array_indicator),
            measure,
            err_msg=(
                "%s failed representation invariance  "
                "between dense and list of array "
                "indicator formats."
            )
            % name,
        )


@pytest.mark.parametrize("name", sorted(MULTILABELS_METRICS))
def test_raise_value_error_multilabel_sequences(name):
    # make sure the multilabel-sequence format raises ValueError
    multilabel_sequences = [
        [[1], [2], [0, 1]],
        [(), (2), (0, 1)],
        [[]],
        [()],
        np.array([[], [1, 2]], dtype="object"),
    ]

    metric = ALL_METRICS[name]
    for seq in multilabel_sequences:
        with pytest.raises(ValueError):
            metric(seq, seq)


@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION))
def test_normalize_option_binary_classification(name):
    # Test in the binary case
    n_classes = 2
    n_samples = 20
    random_state = check_random_state(0)

    y_true = random_state.randint(0, n_classes, size=(n_samples,))
    y_pred = random_state.randint(0, n_classes, size=(n_samples,))
    y_score = random_state.normal(size=y_true.shape)

    metrics = ALL_METRICS[name]
    pred = y_score if name in THRESHOLDED_METRICS else y_pred
    measure_normalized = metrics(y_true, pred, normalize=True)
    measure_not_normalized = metrics(y_true, pred, normalize=False)

    assert_array_less(
        -1.0 * measure_normalized,
        0,
        err_msg="We failed to test correctly the normalize option",
    )

    assert_allclose(
        measure_normalized,
        measure_not_normalized / n_samples,
        err_msg=f"Failed with {name}",
    )


@pytest.mark.parametrize("name", sorted(METRICS_WITH_NORMALIZE_OPTION))
def test_normalize_option_multiclass_classification(name):
    # Test in the multiclass case
    n_classes = 4
    n_samples = 20
    random_state = check_random_state(0)

    y_true = random_state.randint(0, n_classes, size=(n_samples,))
    y_pred = random_state.randint(0, n_classes, size=(n_samples,))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    metrics = ALL_METRICS[name]
    pred = y_score if name in THRESHOLDED_METRICS else y_pred
    measure_normalized = metrics(y_true, pred, normalize=True)
    measure_not_normalized = metrics(y_true, pred, normalize=False)

    assert_array_less(
        -1.0 * measure_normalized,
        0,
        err_msg="We failed to test correctly the normalize option",
    )

    assert_allclose(
        measure_normalized,
        measure_not_normalized / n_samples,
        err_msg=f"Failed with {name}",
    )


@pytest.mark.parametrize(
    "name", sorted(METRICS_WITH_NORMALIZE_OPTION.intersection(MULTILABELS_METRICS))
)
def test_normalize_option_multilabel_classification(name):
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100
    random_state = check_random_state(0)

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(
        n_features=1,
        n_classes=n_classes,
        random_state=0,
        allow_unlabeled=True,
        n_samples=n_samples,
    )
    _, y_pred = make_multilabel_classification(
        n_features=1,
        n_classes=n_classes,
        random_state=1,
        allow_unlabeled=True,
        n_samples=n_samples,
    )

    y_score = random_state.uniform(size=y_true.shape)

    # To make sure at least one empty label is present
    y_true += [0] * n_classes
    y_pred += [0] * n_classes

    metrics = ALL_METRICS[name]
    pred = y_score if name in THRESHOLDED_METRICS else y_pred
    measure_normalized = metrics(y_true, pred, normalize=True)
    measure_not_normalized = metrics(y_true, pred, normalize=False)

    assert_array_less(
        -1.0 * measure_normalized,
        0,
        err_msg="We failed to test correctly the normalize option",
    )

    assert_allclose(
        measure_normalized,
        measure_not_normalized / n_samples,
        err_msg=f"Failed with {name}",
    )


@ignore_warnings
def _check_averaging(
    metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel
):
    n_samples, n_classes = y_true_binarize.shape

    # No averaging
    label_measure = metric(y_true, y_pred, average=None)
    assert_allclose(
        label_measure,
        [
            metric(y_true_binarize[:, i], y_pred_binarize[:, i])
            for i in range(n_classes)
        ],
    )

    # Micro measure
    micro_measure = metric(y_true, y_pred, average="micro")
    assert_allclose(
        micro_measure, metric(y_true_binarize.ravel(), y_pred_binarize.ravel())
    )

    # Macro measure
    macro_measure = metric(y_true, y_pred, average="macro")
    assert_allclose(macro_measure, np.mean(label_measure))

    # Weighted measure
    weights = np.sum(y_true_binarize, axis=0, dtype=int)

    if np.sum(weights) != 0:
        weighted_measure = metric(y_true, y_pred, average="weighted")
        assert_allclose(weighted_measure, np.average(label_measure, weights=weights))
    else:
        weighted_measure = metric(y_true, y_pred, average="weighted")
        assert_allclose(weighted_measure, 0)

    # Sample measure
    if is_multilabel:
        sample_measure = metric(y_true, y_pred, average="samples")
        assert_allclose(
            sample_measure,
            np.mean(
                [
                    metric(y_true_binarize[i], y_pred_binarize[i])
                    for i in range(n_samples)
                ]
            ),
        )

    with pytest.raises(ValueError):
        metric(y_true, y_pred, average="unknown")
    with pytest.raises(ValueError):
        metric(y_true, y_pred, average="garbage")


def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score):
    is_multilabel = type_of_target(y_true).startswith("multilabel")

    metric = ALL_METRICS[name]

    if name in METRICS_WITH_AVERAGING:
        _check_averaging(
            metric, y_true, y_pred, y_true_binarize, y_pred_binarize, is_multilabel
        )
    elif name in THRESHOLDED_METRICS_WITH_AVERAGING:
        _check_averaging(
            metric, y_true, y_score, y_true_binarize, y_score, is_multilabel
        )
    else:
        raise ValueError("Metric is not recorded as having an average option")


@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING))
def test_averaging_multiclass(name):
    n_samples, n_classes = 50, 3
    random_state = check_random_state(0)
    y_true = random_state.randint(0, n_classes, size=(n_samples,))
    y_pred = random_state.randint(0, n_classes, size=(n_samples,))
    y_score = random_state.uniform(size=(n_samples, n_classes))

    lb = LabelBinarizer().fit(y_true)
    y_true_binarize = lb.transform(y_true)
    y_pred_binarize = lb.transform(y_pred)

    check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)


@pytest.mark.parametrize(
    "name", sorted(METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING)
)
def test_averaging_multilabel(name):
    n_samples, n_classes = 40, 5
    _, y = make_multilabel_classification(
        n_features=1,
        n_classes=n_classes,
        random_state=5,
        n_samples=n_samples,
        allow_unlabeled=False,
    )
    y_true = y[:20]
    y_pred = y[20:]
    y_score = check_random_state(0).normal(size=(20, n_classes))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)


@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING))
def test_averaging_multilabel_all_zeroes(name):
    y_true = np.zeros((20, 3))
    y_pred = np.zeros((20, 3))
    y_score = np.zeros((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)


def test_averaging_binary_multilabel_all_zeroes():
    y_true = np.zeros((20, 3))
    y_pred = np.zeros((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred
    # Test _average_binary_score for weight.sum() == 0
    binary_metric = lambda y_true, y_score, average="macro": _average_binary_score(
        precision_score, y_true, y_score, average
    )
    _check_averaging(
        binary_metric,
        y_true,
        y_pred,
        y_true_binarize,
        y_pred_binarize,
        is_multilabel=True,
    )


@pytest.mark.parametrize("name", sorted(METRICS_WITH_AVERAGING))
def test_averaging_multilabel_all_ones(name):
    y_true = np.ones((20, 3))
    y_pred = np.ones((20, 3))
    y_score = np.ones((20, 3))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, y_score)


@ignore_warnings
def check_sample_weight_invariance(name, metric, y1, y2):
    rng = np.random.RandomState(0)
    sample_weight = rng.randint(1, 10, size=len(y1))

    # top_k_accuracy_score always lead to a perfect score for k > 1 in the
    # binary case
    metric = partial(metric, k=1) if name == "top_k_accuracy_score" else metric

    # check that unit weights gives the same score as no weight
    unweighted_score = metric(y1, y2, sample_weight=None)

    assert_allclose(
        unweighted_score,
        metric(y1, y2, sample_weight=np.ones(shape=len(y1))),
        err_msg="For %s sample_weight=None is not equivalent to sample_weight=ones"
        % name,
    )

    # check that the weighted and unweighted scores are unequal
    weighted_score = metric(y1, y2, sample_weight=sample_weight)

    # use context manager to supply custom error message
    with pytest.raises(AssertionError):
        assert_allclose(unweighted_score, weighted_score)
        raise ValueError(
            "Unweighted and weighted scores are unexpectedly "
            "almost equal (%s) and (%s) "
            "for %s" % (unweighted_score, weighted_score, name)
        )

    # check that sample_weight can be a list
    weighted_score_list = metric(y1, y2, sample_weight=sample_weight.tolist())
    assert_allclose(
        weighted_score,
        weighted_score_list,
        err_msg=(
            "Weighted scores for array and list "
            "sample_weight input are not equal (%s != %s) for %s"
        )
        % (weighted_score, weighted_score_list, name),
    )

    # check that integer weights is the same as repeated samples
    repeat_weighted_score = metric(
        np.repeat(y1, sample_weight, axis=0),
        np.repeat(y2, sample_weight, axis=0),
        sample_weight=None,
    )
    assert_allclose(
        weighted_score,
        repeat_weighted_score,
        err_msg="Weighting %s is not equal to repeating samples" % name,
    )

    # check that ignoring a fraction of the samples is equivalent to setting
    # the corresponding weights to zero
    sample_weight_subset = sample_weight[1::2]
    sample_weight_zeroed = np.copy(sample_weight)
    sample_weight_zeroed[::2] = 0
    y1_subset = y1[1::2]
    y2_subset = y2[1::2]
    weighted_score_subset = metric(
        y1_subset, y2_subset, sample_weight=sample_weight_subset
    )
    weighted_score_zeroed = metric(y1, y2, sample_weight=sample_weight_zeroed)
    assert_allclose(
        weighted_score_subset,
        weighted_score_zeroed,
        err_msg=(
            "Zeroing weights does not give the same result as "
            "removing the corresponding samples (%s != %s) for %s"
        )
        % (weighted_score_zeroed, weighted_score_subset, name),
    )

    if not name.startswith("unnormalized"):
        # check that the score is invariant under scaling of the weights by a
        # common factor
        for scaling in [2, 0.3]:
            assert_allclose(
                weighted_score,
                metric(y1, y2, sample_weight=sample_weight * scaling),
                err_msg="%s sample_weight is not invariant under scaling" % name,
            )

    # Check that if number of samples in y_true and sample_weight are not
    # equal, meaningful error is raised.
    error_message = (
        r"Found input variables with inconsistent numbers of "
        r"samples: \[{}, {}, {}\]".format(
            _num_samples(y1), _num_samples(y2), _num_samples(sample_weight) * 2
        )
    )
    with pytest.raises(ValueError, match=error_message):
        metric(y1, y2, sample_weight=np.hstack([sample_weight, sample_weight]))


@pytest.mark.parametrize(
    "name",
    sorted(
        set(ALL_METRICS).intersection(set(REGRESSION_METRICS))
        - METRICS_WITHOUT_SAMPLE_WEIGHT
    ),
)
def test_regression_sample_weight_invariance(name):
    n_samples = 50
    random_state = check_random_state(0)
    # regression
    y_true = random_state.random_sample(size=(n_samples,))
    y_pred = random_state.random_sample(size=(n_samples,))
    metric = ALL_METRICS[name]
    check_sample_weight_invariance(name, metric, y_true, y_pred)


@pytest.mark.parametrize(
    "name",
    sorted(
        set(ALL_METRICS)
        - set(REGRESSION_METRICS)
        - METRICS_WITHOUT_SAMPLE_WEIGHT
        - METRIC_UNDEFINED_BINARY
    ),
)
def test_binary_sample_weight_invariance(name):
    # binary
    n_samples = 50
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(n_samples,))
    y_pred = random_state.randint(0, 2, size=(n_samples,))
    y_score = random_state.random_sample(size=(n_samples,))
    metric = ALL_METRICS[name]
    if name in THRESHOLDED_METRICS:
        check_sample_weight_invariance(name, metric, y_true, y_score)
    else:
        check_sample_weight_invariance(name, metric, y_true, y_pred)


@pytest.mark.parametrize(
    "name",
    sorted(
        set(ALL_METRICS)
        - set(REGRESSION_METRICS)
        - METRICS_WITHOUT_SAMPLE_WEIGHT
        - METRIC_UNDEFINED_BINARY_MULTICLASS
    ),
)
def test_multiclass_sample_weight_invariance(name):
    # multiclass
    n_samples = 50
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples,))
    y_pred = random_state.randint(0, 5, size=(n_samples,))
    y_score = random_state.random_sample(size=(n_samples, 5))
    metric = ALL_METRICS[name]
    if name in THRESHOLDED_METRICS:
        # softmax
        temp = np.exp(-y_score)
        y_score_norm = temp / temp.sum(axis=-1).reshape(-1, 1)
        check_sample_weight_invariance(name, metric, y_true, y_score_norm)
    else:
        check_sample_weight_invariance(name, metric, y_true, y_pred)


@pytest.mark.parametrize(
    "name",
    sorted(
        (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS)
        - METRICS_WITHOUT_SAMPLE_WEIGHT
    ),
)
def test_multilabel_sample_weight_invariance(name):
    # multilabel indicator
    random_state = check_random_state(0)
    _, ya = make_multilabel_classification(
        n_features=1, n_classes=10, random_state=0, n_samples=50, allow_unlabeled=False
    )
    _, yb = make_multilabel_classification(
        n_features=1, n_classes=10, random_state=1, n_samples=50, allow_unlabeled=False
    )
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.uniform(size=y_true.shape)

    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
    y_score /= y_score.sum(axis=1, keepdims=True)

    metric = ALL_METRICS[name]
    if name in THRESHOLDED_METRICS:
        check_sample_weight_invariance(name, metric, y_true, y_score)
    else:
        check_sample_weight_invariance(name, metric, y_true, y_pred)


@ignore_warnings
def test_no_averaging_labels():
    # test labels argument when not using averaging
    # in multi-class and multi-label cases
    y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]])
    y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]])
    y_true_multiclass = np.array([0, 1, 2])
    y_pred_multiclass = np.array([0, 2, 3])
    labels = np.array([3, 0, 1, 2])
    _, inverse_labels = np.unique(labels, return_inverse=True)

    for name in METRICS_WITH_AVERAGING:
        for y_true, y_pred in [
            [y_true_multiclass, y_pred_multiclass],
            [y_true_multilabel, y_pred_multilabel],
        ]:
            if name not in MULTILABELS_METRICS and y_pred.ndim > 1:
                continue

            metric = ALL_METRICS[name]

            score_labels = metric(y_true, y_pred, labels=labels, average=None)
            score = metric(y_true, y_pred, average=None)
            assert_array_equal(score_labels, score[inverse_labels])


@pytest.mark.parametrize(
    "name", sorted(MULTILABELS_METRICS - {"unnormalized_multilabel_confusion_matrix"})
)
def test_multilabel_label_permutations_invariance(name):
    random_state = check_random_state(0)
    n_samples, n_classes = 20, 4

    y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
    y_score = random_state.randint(0, 2, size=(n_samples, n_classes))

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)

    for perm in permutations(range(n_classes), n_classes):
        y_score_perm = y_score[:, perm]
        y_true_perm = y_true[:, perm]

        current_score = metric(y_true_perm, y_score_perm)
        assert_almost_equal(score, current_score)


@pytest.mark.parametrize(
    "name", sorted(THRESHOLDED_MULTILABEL_METRICS | MULTIOUTPUT_METRICS)
)
def test_thresholded_multilabel_multioutput_permutations_invariance(name):
    random_state = check_random_state(0)
    n_samples, n_classes = 20, 4
    y_true = random_state.randint(0, 2, size=(n_samples, n_classes))
    y_score = random_state.uniform(size=y_true.shape)

    # Some metrics (e.g. log_loss) require y_score to be probabilities (sum to 1)
    y_score /= y_score.sum(axis=1, keepdims=True)

    # Makes sure all samples have at least one label. This works around errors
    # when running metrics where average="sample"
    y_true[y_true.sum(1) == 4, 0] = 0
    y_true[y_true.sum(1) == 0, 0] = 1

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)

    for perm in permutations(range(n_classes), n_classes):
        y_score_perm = y_score[:, perm]
        y_true_perm = y_true[:, perm]

        current_score = metric(y_true_perm, y_score_perm)
        if metric == mean_absolute_percentage_error:
            assert np.isfinite(current_score)
            assert current_score > 1e6
            # Here we are not comparing the values in case of MAPE because
            # whenever y_true value is exactly zero, the MAPE value doesn't
            # signify anything. Thus, in this case we are just expecting
            # very large finite value.
        else:
            assert_almost_equal(score, current_score)


@pytest.mark.parametrize(
    "name", sorted(set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)
)
def test_thresholded_metric_permutation_invariance(name):
    n_samples, n_classes = 100, 3
    random_state = check_random_state(0)

    y_score = random_state.rand(n_samples, n_classes)
    temp = np.exp(-y_score)
    y_score = temp / temp.sum(axis=-1).reshape(-1, 1)
    y_true = random_state.randint(0, n_classes, size=n_samples)

    metric = ALL_METRICS[name]
    score = metric(y_true, y_score)
    for perm in permutations(range(n_classes), n_classes):
        inverse_perm = np.zeros(n_classes, dtype=int)
        inverse_perm[list(perm)] = np.arange(n_classes)
        y_score_perm = y_score[:, inverse_perm]
        y_true_perm = np.take(perm, y_true)

        current_score = metric(y_true_perm, y_score_perm)
        assert_almost_equal(score, current_score)


@pytest.mark.parametrize("metric_name", CLASSIFICATION_METRICS)
def test_metrics_consistent_type_error(metric_name):
    # check that an understable message is raised when the type between y_true
    # and y_pred mismatch
    rng = np.random.RandomState(42)
    y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=object)
    y2 = rng.randint(0, 2, size=y1.size)

    err_msg = "Labels in y_true and y_pred should be of the same type."
    with pytest.raises(TypeError, match=err_msg):
        CLASSIFICATION_METRICS[metric_name](y1, y2)


@pytest.mark.parametrize(
    "metric, y_pred_threshold",
    [
        (average_precision_score, True),
        (brier_score_loss, True),
        (f1_score, False),
        (partial(fbeta_score, beta=1), False),
        (jaccard_score, False),
        (precision_recall_curve, True),
        (precision_score, False),
        (recall_score, False),
        (roc_curve, True),
    ],
)
@pytest.mark.parametrize("dtype_y_str", [str, object])
def test_metrics_pos_label_error_str(metric, y_pred_threshold, dtype_y_str):
    # check that the error message if `pos_label` is not specified and the
    # targets is made of strings.
    rng = np.random.RandomState(42)
    y1 = np.array(["spam"] * 3 + ["eggs"] * 2, dtype=dtype_y_str)
    y2 = rng.randint(0, 2, size=y1.size)

    if not y_pred_threshold:
        y2 = np.array(["spam", "eggs"], dtype=dtype_y_str)[y2]

    err_msg_pos_label_None = (
        "y_true takes value in {'eggs', 'spam'} and pos_label is not "
        "specified: either make y_true take value in {0, 1} or {-1, 1} or "
        "pass pos_label explicit"
    )
    err_msg_pos_label_1 = (
        r"pos_label=1 is not a valid label. It should be one of " r"\['eggs', 'spam'\]"
    )

    pos_label_default = signature(metric).parameters["pos_label"].default

    err_msg = err_msg_pos_label_1 if pos_label_default == 1 else err_msg_pos_label_None
    with pytest.raises(ValueError, match=err_msg):
        metric(y1, y2)


def check_array_api_metric(
    metric, array_namespace, device, dtype_name, y_true_np, y_pred_np, sample_weight
):
    xp = _array_api_for_tests(array_namespace, device)

    y_true_xp = xp.asarray(y_true_np, device=device)
    y_pred_xp = xp.asarray(y_pred_np, device=device)

    metric_np = metric(y_true_np, y_pred_np, sample_weight=sample_weight)

    if sample_weight is not None:
        sample_weight = xp.asarray(sample_weight, device=device)

    with config_context(array_api_dispatch=True):
        metric_xp = metric(y_true_xp, y_pred_xp, sample_weight=sample_weight)

        assert_allclose(
            _convert_to_numpy(xp.asarray(metric_xp), xp),
            metric_np,
            atol=_atol_for_type(dtype_name),
        )


def check_array_api_binary_classification_metric(
    metric, array_namespace, device, dtype_name
):
    y_true_np = np.array([0, 0, 1, 1])
    y_pred_np = np.array([0, 1, 0, 1])

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=None,
    )

    sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name)

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=sample_weight,
    )


def check_array_api_multiclass_classification_metric(
    metric, array_namespace, device, dtype_name
):
    y_true_np = np.array([0, 1, 2, 3])
    y_pred_np = np.array([0, 1, 0, 2])

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=None,
    )

    sample_weight = np.array([0.0, 0.1, 2.0, 1.0], dtype=dtype_name)

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=sample_weight,
    )


def check_array_api_regression_metric(metric, array_namespace, device, dtype_name):
    y_true_np = np.array([[1, 3], [1, 2]], dtype=dtype_name)
    y_pred_np = np.array([[1, 4], [1, 1]], dtype=dtype_name)

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=None,
    )

    sample_weight = np.array([0.1, 2.0], dtype=dtype_name)

    check_array_api_metric(
        metric,
        array_namespace,
        device,
        dtype_name,
        y_true_np=y_true_np,
        y_pred_np=y_pred_np,
        sample_weight=sample_weight,
    )


array_api_metric_checkers = {
    accuracy_score: [
        check_array_api_binary_classification_metric,
        check_array_api_multiclass_classification_metric,
    ],
    zero_one_loss: [
        check_array_api_binary_classification_metric,
        check_array_api_multiclass_classification_metric,
    ],
    r2_score: [check_array_api_regression_metric],
}


def yield_metric_checker_combinations(metric_checkers=array_api_metric_checkers):
    for metric, checkers in metric_checkers.items():
        for checker in checkers:
            yield metric, checker


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("metric, check_func", yield_metric_checker_combinations())
def test_array_api_compliance(metric, array_namespace, device, dtype_name, check_func):
    check_func(metric, array_namespace, device, dtype_name)