From a25d3356d40e29302363dde1e63580da50b85eea Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 12 Jan 2022 18:24:21 +0800 Subject: [PATCH 01/14] init Signed-off-by: Weichen Xu --- mlflow/models/__init__.py | 4 +- mlflow/models/evaluation/__init__.py | 6 +- mlflow/models/evaluation/base.py | 135 +++++++++++++++++---------- tests/models/test_evaluation.py | 135 ++++++++++++++++----------- 4 files changed, 174 insertions(+), 106 deletions(-) diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index 2ee517f135068..e26a07fd1c4cb 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -24,14 +24,14 @@ from .model import Model from .flavor_backend import FlavorBackend from ..utils.environment import infer_pip_requirements -from .evaluation import evaluate, EvaluationDataset +from .evaluation import evaluate, _EvaluationDataset __all__ = [ "Model", "FlavorBackend", "infer_pip_requirements", "evaluate", - "EvaluationDataset", + "_EvaluationDataset", ] diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index cf64e897f622f..df7ce48fd7238 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,19 +1,21 @@ from mlflow.models.evaluation.base import ( ModelEvaluator, - EvaluationDataset, + _EvaluationDataset, EvaluationResult, EvaluationMetrics, EvaluationArtifact, evaluate, list_evaluators, + get_last_failed_evaluator, ) __all__ = [ "ModelEvaluator", - "EvaluationDataset", + "_EvaluationDataset", "EvaluationResult", "EvaluationMetrics", "EvaluationArtifact", "evaluate", "list_evaluators", + "get_last_failed_evaluator", ] diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 074da7537e7d6..6f20eee5399f7 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -221,16 +221,16 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + if len(data) < _EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: - head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + head_rows = data[: _EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + tail_rows = data[-_EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) -class EvaluationDataset: +class _EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the :py:func:`mlflow.models.evaluation.evaluate()` @@ -240,7 +240,7 @@ class EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__(self, data, labels, name=None, path=None, feature_names=None): + def __init__(self, data, *, targets=None, label_col=None, name=None, path=None, feature_names=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. @@ -250,20 +250,23 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must be a spark DataFrame contains a feature column of "Vector" type, and a label column. 
- :param labels: One of the following: - - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. - - The string name of a column from `data` that contains evaluation labels, if `data` - is a DataFrame. + :param targets: (Optional) A numpy array or list of evaluation labels, if `data` is also a + numpy array or list. + + :param label_col: (Optional) The string name of a column from `data` that contains + evaluation labels, if `data` is a DataFrame. :param name: (Optional) The name of the dataset (must not contain "). :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) - :param feature_names: (Optional) A list of the feature names attached to the numpy array - input data. The argument is only useful in the case the input data is numpy array. - For pandas DataFrame input case, the pandas column name will be used as feature name. - The feature names will be shown in model explainability plots. + :param feature_names: (Optional) If `data` argument is a feature data numpy array or list, + `feature_names` argument is a list of the feature names for each feature. If None, then + the `feature_names` will be generated using the format "feature_{feature_index}". + if `data` argument is a pandas dataframe or a spark dataframe, `feature_names` argument + is a list of the column names of the feature columns in the dataframe. If None, then + all columns except the label column will be regarded as feature columns. """ import numpy as np import pandas as pd @@ -291,11 +294,15 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): except ImportError: supported_dataframe_types = (pd.DataFrame,) + if feature_names is not None and len(set(feature_names)) < len(list(feature_names)): + raise ValueError('`feature_names` argument must be a list containing unique feature ' + 'names.') + if isinstance(data, (np.ndarray, list)): - if not isinstance(labels, (np.ndarray, list)): + if not isinstance(targets, (np.ndarray, list)): raise ValueError( "If data is a numpy array or list of evaluation features, " - "labels must be a numpy array or list of evaluation labels" + "`targets` argument must be a numpy array or list of evaluation labels." ) if isinstance(data, list): data = np.array(data) @@ -309,7 +316,7 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): ) self._features_data = data - self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) + self._labels_data = targets if isinstance(targets, np.ndarray) else np.array(targets) if len(self._features_data) != len(self._labels_data): raise ValueError( @@ -329,30 +336,29 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): for i in range(num_features) ] elif isinstance(data, supported_dataframe_types): - if not isinstance(labels, str): + if not isinstance(label_col, str): raise ValueError( - "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " - "string name of a column from `data` that contains evaluation labels" + "If data is a Pandas DataFrame or Spark DataFrame, `label_col` argument must " + "be the name of the column which contains evaluation labels in the `data` " + "dataframe." ) if isinstance(data, spark_df_type): _logger.warning( f"Specified Spark DataFrame is too large for model evaluation. Only " - f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + f"the first {_EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." 
"If you want evaluate on the whole spark dataframe, please manually call " "`spark_dataframe.toPandas()`." ) - data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + data = data.limit(_EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() - self._features_data = data.drop(labels, axis=1, inplace=False) - self._labels_data = data[labels].to_numpy() + self._labels_data = data[label_col].to_numpy() if feature_names is not None: - raise ValueError( - "If `data` argument is pandas/spark dataframe, you cannot specify the " - "`feature_names` argument, instead, the column names of the input " - "dataframe will be used as the feature names." - ) - self._feature_names = list(self._features_data.columns) + self._features_data = data[list(feature_names)] + self._feature_names = feature_names + else: + self._features_data = data.drop(label_col, axis=1, inplace=False) + self._feature_names = list(self._features_data.columns) else: raise ValueError( "The data argument must be a numpy array, a list or a Pandas DataFrame, or " @@ -446,6 +452,27 @@ def _log_dataset_tag(self, client, run_id, model_uuid): tags=[RunTag("mlflow.datasets", dataset_metadata_str)], ) + def __hash__(self): + return hash(self.hash) + + def __eq__(self, other): + import numpy as np + import pandas as pd + + if not isinstance(other, _EvaluationDataset): + return False + + if isinstance(self._features_data, np.ndarray): + is_features_data_equal = np.array_equal(self._features_data, other._features_data) + else: + is_features_data_equal = self._features_data.equals(other._features_data) + + return is_features_data_equal and \ + np.array_equal(self._labels_data, other._labels_data) and \ + self.name == other.name and \ + self.path == other.path and \ + self._feature_names == other._feature_names + class ModelEvaluator(metaclass=ABCMeta): @abstractmethod @@ -496,7 +523,7 @@ def list_evaluators(): @contextmanager -def _start_run_or_reuse_active_run(run_id): +def _start_run_or_reuse_active_run(): """ A manager context return: - If there's an active run, return the active run id. @@ -506,13 +533,9 @@ def _start_run_or_reuse_active_run(run_id): active_run = mlflow.active_run() if not active_run: # Note `mlflow.start_run` throws if `run_id` is not found. - with mlflow.start_run(run_id=run_id) as run: + with mlflow.start_run() as run: yield run.info.run_id else: - if run_id and active_run.info.run_id != run_id: - raise ValueError( - "An active run exists, you cannot specify another run_id when evaluating." - ) yield active_run.info.run_id @@ -589,7 +612,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) _last_failed_evaluator = None -def _get_last_failed_evaluator(): +def get_last_failed_evaluator(): """ Return the evaluator name of the last failed evaluator when calling `evalaute`. This can be used to check which evaluator fail when `evaluate` API fail. 
@@ -598,7 +621,7 @@ def _get_last_failed_evaluator(): def _evaluate( - *, model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map + *, model, model_type, dataset, run_id, evaluator_name_list, evaluator_name_to_conf_map ): """ The public API "evaluate" will verify argument first, and then pass normalized arguments @@ -612,7 +635,7 @@ def _evaluate( client = mlflow.tracking.MlflowClient() model_uuid = model.metadata.model_uuid - dataset._log_dataset_tag(client, actual_run_id, model_uuid) + dataset._log_dataset_tag(client, run_id, model_uuid) eval_results = [] for evaluator_name in evaluator_name_list: @@ -630,7 +653,7 @@ def _evaluate( model=model, model_type=model_type, dataset=dataset, - run_id=actual_run_id, + run_id=run_id, evaluator_config=config, ) eval_results.append(result) @@ -653,11 +676,15 @@ def _evaluate( @experimental def evaluate( - *, model: Union[str, "mlflow.pyfunc.PyFuncModel"], + data, + *, model_type: str, - dataset: "mlflow.models.evaluation.EvaluationDataset", - run_id=None, + targets=None, + label_col=None, + dataset_name=None, + dataset_path=None, + feature_names: list=None, evaluators=None, evaluator_config=None, ) -> "mlflow.models.evaluation.EvaluationResult": @@ -669,8 +696,7 @@ def evaluate( :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. - :param dataset: An instance of :py:class:`mlflow.models.evaluation.EvaluationDataset` - containing features labels (optional) for model evaluation. + :param data: :param run_id: The ID of the MLflow Run to which to log results. If unspecified, behavior depends on the specified `evaluator`. When `run_id` is unspecified, the default evaluator logs @@ -681,7 +707,8 @@ def evaluate( capable of evaluating the specified model on the specified dataset are used. The default evaluator can be referred to by the name 'default'. If this argument is unspecified, then - fetch all evaluators from the registry. + fetch all evaluators from the registry. To get all available + evaluators, call :py:func:`mlflow.models.evaluation.list_evaluators` :param evaluator_config: A dictionary of additional configurations to supply to the evaluator. If multiple evaluators are specified, each configuration should be supplied as @@ -712,6 +739,15 @@ def evaluate( false_positives/false_negatives/true_positives/recall/precision/roc_auc, precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. + The logged mlflow metric keys are constructed using the format: "{metric_name}_on_{dataset_name}". + + The metrics/artifacts listed above will be logged into the current active mlflow run, + if no active run exists, a new mlflow run will be created for logging these metrics/artifacts. + + Besides the metrics and artifacts, the value of the tag `mlflow.datasets` will be logged + or appended. The content of the tag value includes the dataset metadata (name/path/hash) and + the model uuid. 
+ The available `evaluator_config` options for the default evaluator include: - **log_model_explainability**: A boolean value specifying whether or not to log model @@ -769,12 +805,17 @@ def evaluate( evaluator_name_to_conf_map, ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) - with _start_run_or_reuse_active_run(run_id) as actual_run_id: + dataset = _EvaluationDataset( + data, targets=targets, label_col=label_col, name=dataset_name, + path=dataset_path, feature_names=feature_names + ) + + with _start_run_or_reuse_active_run() as run_id: return _evaluate( model=model, model_type=model_type, dataset=dataset, - actual_run_id=actual_run_id, + run_id=run_id, evaluator_name_list=evaluator_name_list, evaluator_name_to_conf_map=evaluator_name_to_conf_map, ) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 0555ca82a9a47..37ebd4857f208 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -3,7 +3,7 @@ from mlflow.models.evaluation import ( evaluate, - EvaluationDataset, + _EvaluationDataset, EvaluationResult, ModelEvaluator, EvaluationArtifact, @@ -99,27 +99,41 @@ def spark_session(): def iris_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") + constructor_args = { + 'data': eval_X, 'targets': eval_y, 'name': "iris_dataset" + } + ds = _EvaluationDataset(**constructor_args) + ds._constructor_args = constructor_args + return ds @pytest.fixture(scope="module") def diabetes_dataset(): X, y = get_diabetes_dataset() eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="diabetes_dataset") + constructor_args = {'data': eval_X, 'targets': eval_y, 'name': "diabetes_dataset"} + ds = _EvaluationDataset(**constructor_args) + ds._constructor_args = constructor_args + return ds @pytest.fixture(scope="module") def diabetes_spark_dataset(): spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) - return EvaluationDataset(data=spark_df, labels="label", name="diabetes_spark_dataset") + constructor_args = {'data': spark_df, 'label_col': "label", 'name': "diabetes_spark_dataset"} + ds = _EvaluationDataset(**constructor_args) + ds._constructor_args = constructor_args + return ds @pytest.fixture(scope="module") def breast_cancer_dataset(): X, y = get_breast_cancer_dataset() eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="breast_cancer_dataset") + constructor_args = {'data': eval_X, 'targets': eval_y, 'name': "breast_cancer_dataset"} + ds = _EvaluationDataset(**constructor_args) + ds._constructor_args = constructor_args + return ds @pytest.fixture @@ -200,8 +214,7 @@ def iris_pandas_df_dataset(): "y": eval_y, } ) - labels = "y" - return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") + return _EvaluationDataset(data=data, label_col="y", name="iris_pandas_df_dataset") def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): @@ -220,10 +233,11 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas with mlflow.start_run() as run: eval_result = evaluate( - model=classifier_model, + classifier_model, + iris_dataset._constructor_args['data'], model_type="classifier", - dataset=iris_dataset, - run_id=None, + targets=iris_dataset._constructor_args['targets'], + dataset_name=iris_dataset.name, evaluators="dummy_evaluator", ) @@ -292,10 +306,11 @@ def 
test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): for model in [regressor_model, linear_regressor_model_uri]: with mlflow.start_run() as run: eval_result = evaluate( - model=model, + model, + diabetes_dataset._constructor_args['data'], model_type="regressor", - dataset=diabetes_dataset, - run_id=None, + targets=diabetes_dataset._constructor_args['targets'], + dataset_name=diabetes_dataset.name, evaluators="dummy_evaluator", ) _, saved_metrics, _, _ = get_run_data(run.info.run_id) @@ -305,12 +320,22 @@ def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): def test_dataset_name(): X, y = get_iris() - d1 = EvaluationDataset(data=X, labels=y, name="a1") + d1 = _EvaluationDataset(data=X, targets=y, name="a1") assert d1.name == "a1" - d2 = EvaluationDataset(data=X, labels=y) + d2 = _EvaluationDataset(data=X, targets=y) assert d2.name == d2.hash +def test_dataset_metadata(): + X, y = get_iris() + d1 = _EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1") + assert d1._metadata == { + 'hash': '6bdf4e119bf1a37e7907dfd9f0e68733', + 'name': 'a1', + 'path': '/path/to/a1' + } + + def test_gen_md5_for_arraylike_obj(): def get_md5(data): md5_gen = hashlib.md5() @@ -334,54 +359,63 @@ def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_datas assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" -def test_datasset_with_pandas_dataframe(): - data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]}) - eval_dataset = EvaluationDataset(data=data, labels="label") +def test_dataset_with_pandas_dataframe(): + data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "f3": [5, 6], "label": [0, 1]}) + eval_dataset = _EvaluationDataset(data=data, label_col="label") - assert list(eval_dataset.features_data.columns) == ["f1", "f2"] + assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"] assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) assert np.array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4]) + assert np.array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6]) assert np.array_equal(eval_dataset.labels_data, [0, 1]) + eval_dataset2 = _EvaluationDataset(data=data, label_col="label", feature_names=["f3", "f2"]) + assert list(eval_dataset2.features_data.columns) == ["f3", "f2"] + assert np.array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4]) + assert np.array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6]) -def test_datasset_with_array_data(): + +def test_dataset_with_array_data(): features = [[1, 2], [3, 4]] labels = [0, 1] for input_data in [features, np.array(features)]: - eval_dataset1 = EvaluationDataset(data=input_data, labels=labels) + eval_dataset1 = _EvaluationDataset(data=input_data, targets=labels) assert np.array_equal(eval_dataset1.features_data, features) assert np.array_equal(eval_dataset1.labels_data, labels) assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] + assert _EvaluationDataset(data=input_data, targets=labels, feature_names=['a', 'b']) \ + .feature_names == ['a', 'b'] + with pytest.raises(ValueError, match="all element must has the same length"): - EvaluationDataset(data=[[1, 2], [3, 4, 5]], labels=labels) + _EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels) -def test_autogen_feature_names(): +def test_dataset_autogen_feature_names(): labels = [0] - eval_dataset2 = EvaluationDataset(data=[list(range(9))], labels=labels) + eval_dataset2 = _EvaluationDataset(data=[list(range(9))], targets=labels) 
assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] - eval_dataset2 = EvaluationDataset(data=[list(range(10))], labels=labels) + eval_dataset2 = _EvaluationDataset(data=[list(range(10))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] - eval_dataset2 = EvaluationDataset(data=[list(range(99))], labels=labels) + eval_dataset2 = _EvaluationDataset(data=[list(range(99))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] - eval_dataset2 = EvaluationDataset(data=[list(range(100))], labels=labels) + eval_dataset2 = _EvaluationDataset(data=[list(range(100))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] with pytest.raises( ValueError, match="features example rows must be the same length with labels array" ): - EvaluationDataset(data=[[1, 2], [3, 4]], labels=[1, 2, 3]) + _EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3]) -def test_spark_df_dataset(spark_session): +def test_dataset_from_spark_df(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) - with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): - dataset = EvaluationDataset(spark_df, "y") + with mock.patch.object(_EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): + dataset = _EvaluationDataset(spark_df, label_col="y") assert list(dataset.features_data.columns) == ["f1", "f2"] assert list(dataset.features_data["f1"]) == [1.0] * 5 assert list(dataset.features_data["f2"]) == [2.0] * 5 @@ -472,10 +506,11 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas match="The model could not be evaluated by any of the registered evaluators", ): evaluate( - model=multiclass_logistic_regressor_model_uri, + multiclass_logistic_regressor_model_uri, + data=iris_dataset._constructor_args['data'], model_type="classifier", - dataset=iris_dataset, - run_id=None, + targets=iris_dataset._constructor_args['targets'], + dataset_name=iris_dataset.name, evaluators="test_evaluator1", evaluator_config=evaluator1_config, ) @@ -491,10 +526,11 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( - model=classifier_model, + classifier_model, + iris_dataset._constructor_args['data'], model_type="classifier", - dataset=iris_dataset, - run_id=None, + targets=iris_dataset._constructor_args['targets'], + dataset_name=iris_dataset.name, evaluators="test_evaluator1", evaluator_config=evaluator1_config, ) @@ -544,10 +580,11 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval_result = evaluate( - model=classifier_model, + classifier_model, + iris_dataset._constructor_args['data'], model_type="classifier", - dataset=iris_dataset, - run_id=None, + targets=iris_dataset._constructor_args['targets'], + dataset_name=iris_dataset.name, evaluators=evaluators, evaluator_config={ "test_evaluator1": evaluator1_config, @@ -586,30 +623,18 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, def test_start_run_or_reuse_active_run(): - with _start_run_or_reuse_active_run(run_id=None) as run_id: - assert mlflow.active_run().info.run_id == run_id - - with 
mlflow.start_run() as run: - pass - previous_run_id = run.info.run_id - - with _start_run_or_reuse_active_run(run_id=previous_run_id) as run_id: - assert previous_run_id == run_id + with _start_run_or_reuse_active_run() as run_id: assert mlflow.active_run().info.run_id == run_id with mlflow.start_run() as run: active_run_id = run.info.run_id - with _start_run_or_reuse_active_run(run_id=None) as run_id: + with _start_run_or_reuse_active_run() as run_id: assert run_id == active_run_id - with _start_run_or_reuse_active_run(run_id=active_run_id) as run_id: + with _start_run_or_reuse_active_run() as run_id: assert run_id == active_run_id - with pytest.raises(ValueError, match="An active run exists"): - with _start_run_or_reuse_active_run(run_id=previous_run_id): - pass - def test_normalize_evaluators_and_evaluator_config_args(): from mlflow.models.evaluation.default_evaluator import DefaultEvaluator From d9088b956524acb338fd109d40d4b8e8b0251ea8 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 12 Jan 2022 19:18:20 +0800 Subject: [PATCH 02/14] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 33 +++++++++++++-------- tests/models/test_default_evaluator.py | 30 ++++++++++++------- tests/models/test_evaluation.py | 41 +++++++++++++------------- 3 files changed, 61 insertions(+), 43 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 6f20eee5399f7..550a91a5de0e3 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -225,7 +225,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: _EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - tail_rows = data[-_EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] + tail_rows = data[-_EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) @@ -240,7 +240,9 @@ class _EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__(self, data, *, targets=None, label_col=None, name=None, path=None, feature_names=None): + def __init__( + self, data, *, targets=None, label_col=None, name=None, path=None, feature_names=None + ): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. @@ -295,8 +297,9 @@ def __init__(self, data, *, targets=None, label_col=None, name=None, path=None, supported_dataframe_types = (pd.DataFrame,) if feature_names is not None and len(set(feature_names)) < len(list(feature_names)): - raise ValueError('`feature_names` argument must be a list containing unique feature ' - 'names.') + raise ValueError( + "`feature_names` argument must be a list containing unique feature " "names." 
+ ) if isinstance(data, (np.ndarray, list)): if not isinstance(targets, (np.ndarray, list)): @@ -467,11 +470,13 @@ def __eq__(self, other): else: is_features_data_equal = self._features_data.equals(other._features_data) - return is_features_data_equal and \ - np.array_equal(self._labels_data, other._labels_data) and \ - self.name == other.name and \ - self.path == other.path and \ - self._feature_names == other._feature_names + return ( + is_features_data_equal + and np.array_equal(self._labels_data, other._labels_data) + and self.name == other.name + and self.path == other.path + and self._feature_names == other._feature_names + ) class ModelEvaluator(metaclass=ABCMeta): @@ -684,7 +689,7 @@ def evaluate( label_col=None, dataset_name=None, dataset_path=None, - feature_names: list=None, + feature_names: list = None, evaluators=None, evaluator_config=None, ) -> "mlflow.models.evaluation.EvaluationResult": @@ -806,8 +811,12 @@ def evaluate( ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) dataset = _EvaluationDataset( - data, targets=targets, label_col=label_col, name=dataset_name, - path=dataset_path, feature_names=feature_names + data, + targets=targets, + label_col=label_col, + name=dataset_name, + path=dataset_path, + feature_names=feature_names, ) with _start_run_or_reuse_active_run() as run_id: diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 872f599ecf717..b1cacb6c72d2a 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -40,9 +40,11 @@ def assert_dict_equal(d1, d2, rtol): def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): with mlflow.start_run() as run: result = evaluate( - model=linear_regressor_model_uri, + linear_regressor_model_uri, + diabetes_dataset._constructor_args["data"], model_type="regressor", - dataset=diabetes_dataset, + targets=diabetes_dataset._constructor_args["targets"], + dataset_name=diabetes_dataset.name, evaluators="default", ) @@ -81,9 +83,11 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset): with mlflow.start_run() as run: result = evaluate( - model=multiclass_logistic_regressor_model_uri, + multiclass_logistic_regressor_model_uri, + iris_dataset._constructor_args["data"], model_type="classifier", - dataset=iris_dataset, + targets=iris_dataset._constructor_args["targets"], + dataset_name=iris_dataset.name, evaluators="default", ) @@ -132,9 +136,11 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: result = evaluate( - model=binary_logistic_regressor_model_uri, + binary_logistic_regressor_model_uri, + breast_cancer_dataset._constructor_args["data"], model_type="classifier", - dataset=breast_cancer_dataset, + targets=breast_cancer_dataset._constructor_args["targets"], + dataset_name=breast_cancer_dataset.name, evaluators="default", ) @@ -184,9 +190,11 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset): with mlflow.start_run() as run: result = evaluate( - model=spark_linear_regressor_model_uri, + spark_linear_regressor_model_uri, + diabetes_spark_dataset._constructor_args["data"], 
model_type="regressor", - dataset=diabetes_spark_dataset, + label_col=diabetes_spark_dataset._constructor_args["label_col"], + dataset_name=diabetes_spark_dataset.name, evaluators="default", evaluator_config={"log_model_explainability": True}, ) @@ -222,9 +230,11 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: result = evaluate( - model=svm_model_uri, + svm_model_uri, + breast_cancer_dataset._constructor_args["data"], model_type="classifier", - dataset=breast_cancer_dataset, + targets=breast_cancer_dataset._constructor_args["targets"], + dataset_name=breast_cancer_dataset.name, evaluators="default", ) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 37ebd4857f208..07386b280e4d7 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -99,9 +99,7 @@ def spark_session(): def iris_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - constructor_args = { - 'data': eval_X, 'targets': eval_y, 'name': "iris_dataset" - } + constructor_args = {"data": eval_X, "targets": eval_y, "name": "iris_dataset"} ds = _EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -111,7 +109,7 @@ def iris_dataset(): def diabetes_dataset(): X, y = get_diabetes_dataset() eval_X, eval_y = X[0::3], y[0::3] - constructor_args = {'data': eval_X, 'targets': eval_y, 'name': "diabetes_dataset"} + constructor_args = {"data": eval_X, "targets": eval_y, "name": "diabetes_dataset"} ds = _EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -120,7 +118,7 @@ def diabetes_dataset(): @pytest.fixture(scope="module") def diabetes_spark_dataset(): spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) - constructor_args = {'data': spark_df, 'label_col': "label", 'name': "diabetes_spark_dataset"} + constructor_args = {"data": spark_df, "label_col": "label", "name": "diabetes_spark_dataset"} ds = _EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -130,7 +128,7 @@ def diabetes_spark_dataset(): def breast_cancer_dataset(): X, y = get_breast_cancer_dataset() eval_X, eval_y = X[0::3], y[0::3] - constructor_args = {'data': eval_X, 'targets': eval_y, 'name': "breast_cancer_dataset"} + constructor_args = {"data": eval_X, "targets": eval_y, "name": "breast_cancer_dataset"} ds = _EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -234,9 +232,9 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas with mlflow.start_run() as run: eval_result = evaluate( classifier_model, - iris_dataset._constructor_args['data'], + iris_dataset._constructor_args["data"], model_type="classifier", - targets=iris_dataset._constructor_args['targets'], + targets=iris_dataset._constructor_args["targets"], dataset_name=iris_dataset.name, evaluators="dummy_evaluator", ) @@ -307,9 +305,9 @@ def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): with mlflow.start_run() as run: eval_result = evaluate( model, - diabetes_dataset._constructor_args['data'], + diabetes_dataset._constructor_args["data"], model_type="regressor", - targets=diabetes_dataset._constructor_args['targets'], + targets=diabetes_dataset._constructor_args["targets"], dataset_name=diabetes_dataset.name, evaluators="dummy_evaluator", ) @@ -330,9 +328,9 @@ def test_dataset_metadata(): 
X, y = get_iris() d1 = _EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1") assert d1._metadata == { - 'hash': '6bdf4e119bf1a37e7907dfd9f0e68733', - 'name': 'a1', - 'path': '/path/to/a1' + "hash": "6bdf4e119bf1a37e7907dfd9f0e68733", + "name": "a1", + "path": "/path/to/a1", } @@ -385,8 +383,9 @@ def test_dataset_with_array_data(): assert np.array_equal(eval_dataset1.labels_data, labels) assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] - assert _EvaluationDataset(data=input_data, targets=labels, feature_names=['a', 'b']) \ - .feature_names == ['a', 'b'] + assert _EvaluationDataset( + data=input_data, targets=labels, feature_names=["a", "b"] + ).feature_names == ["a", "b"] with pytest.raises(ValueError, match="all element must has the same length"): _EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels) @@ -507,9 +506,9 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas ): evaluate( multiclass_logistic_regressor_model_uri, - data=iris_dataset._constructor_args['data'], + data=iris_dataset._constructor_args["data"], model_type="classifier", - targets=iris_dataset._constructor_args['targets'], + targets=iris_dataset._constructor_args["targets"], dataset_name=iris_dataset.name, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -527,9 +526,9 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas with mlflow.start_run() as run: eval1_result = evaluate( classifier_model, - iris_dataset._constructor_args['data'], + iris_dataset._constructor_args["data"], model_type="classifier", - targets=iris_dataset._constructor_args['targets'], + targets=iris_dataset._constructor_args["targets"], dataset_name=iris_dataset.name, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -581,9 +580,9 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, with mlflow.start_run() as run: eval_result = evaluate( classifier_model, - iris_dataset._constructor_args['data'], + iris_dataset._constructor_args["data"], model_type="classifier", - targets=iris_dataset._constructor_args['targets'], + targets=iris_dataset._constructor_args["targets"], dataset_name=iris_dataset.name, evaluators=evaluators, evaluator_config={ From e367960a3b096ec258cc495c7aed06f542ef2d2d Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 12 Jan 2022 19:33:48 +0800 Subject: [PATCH 03/14] update Signed-off-by: Weichen Xu --- mlflow/models/__init__.py | 3 +-- mlflow/models/evaluation/__init__.py | 4 ---- mlflow/models/evaluation/base.py | 16 ++++------------ mlflow/models/evaluation/default_evaluator.py | 3 +-- tests/models/test_evaluation.py | 9 ++++----- .../mlflow_test_plugin/dummy_evaluator.py | 10 +++++----- 6 files changed, 15 insertions(+), 30 deletions(-) diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index e26a07fd1c4cb..c40f249b23cc0 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -24,14 +24,13 @@ from .model import Model from .flavor_backend import FlavorBackend from ..utils.environment import infer_pip_requirements -from .evaluation import evaluate, _EvaluationDataset +from .evaluation import evaluate __all__ = [ "Model", "FlavorBackend", "infer_pip_requirements", "evaluate", - "_EvaluationDataset", ] diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index df7ce48fd7238..5e098b1a7bb24 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,8 +1,6 @@ 
from mlflow.models.evaluation.base import ( ModelEvaluator, - _EvaluationDataset, EvaluationResult, - EvaluationMetrics, EvaluationArtifact, evaluate, list_evaluators, @@ -11,9 +9,7 @@ __all__ = [ "ModelEvaluator", - "_EvaluationDataset", "EvaluationResult", - "EvaluationMetrics", "EvaluationArtifact", "evaluate", "list_evaluators", diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 550a91a5de0e3..3a1a30f8370fe 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from typing import Dict, Union, Any import mlflow import hashlib import json @@ -22,14 +22,6 @@ _logger = logging.getLogger(__name__) -class EvaluationMetrics(dict): - """ - A dictionary of model evaluation metrics. - """ - - pass - - class EvaluationArtifact(metaclass=ABCMeta): """ A model evaluation artifact containing an artifact uri and content. @@ -99,7 +91,7 @@ def __init__(self, metrics, artifacts): def load(cls, path): """Load the evaluation results from the specified local filesystem path""" with open(os.path.join(path, "metrics.json"), "r") as fp: - metrics = EvaluationMetrics(json.load(fp)) + metrics = json.load(fp) with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: artifacts_metadata = json.load(fp) @@ -140,7 +132,7 @@ def save(self, path): artifact.save(os.path.join(artifacts_dir, artifact_name)) @property - def metrics(self) -> "mlflow.models.evaluation.EvaluationMetrics": + def metrics(self) -> Dict[str, Any]: """ A dictionary mapping scalar metric names to scalar metric values """ @@ -671,7 +663,7 @@ def _evaluate( "verify that the model type and other configs are set correctly." ) - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + merged_eval_result = EvaluationResult(dict(), dict()) for eval_result in eval_results: merged_eval_result.metrics.update(eval_result.metrics) merged_eval_result.artifacts.update(eval_result.artifacts) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 6ee604aed8988..578e961f131f3 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -1,7 +1,6 @@ import mlflow from mlflow.models.evaluation.base import ( ModelEvaluator, - EvaluationMetrics, EvaluationResult, ) from mlflow.entities.metric import Metric @@ -652,7 +651,7 @@ def evaluate( self.X = dataset.features_data self.y = dataset.labels_data - self.metrics = EvaluationMetrics() + self.metrics = dict() self.artifacts = {} infered_model_type = _infer_model_type_by_labels(self.y) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 07386b280e4d7..958f7629ec1d6 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -3,13 +3,12 @@ from mlflow.models.evaluation import ( evaluate, - _EvaluationDataset, EvaluationResult, ModelEvaluator, EvaluationArtifact, - EvaluationMetrics, ) from mlflow.models.evaluation.base import ( + _EvaluationDataset, _normalize_evaluators_and_evaluator_config_args as _normalize_config, ) import hashlib @@ -491,7 +490,7 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas ): evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4} evaluator1_return_value = EvaluationResult( - metrics=EvaluationMetrics({"m1": 5, "m2": 6}), + metrics={"m1": 5, "m2": 6}, artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")}, ) with 
mock.patch.object( @@ -557,10 +556,10 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, evaluator1_config = {"eval1_confg": 3} evaluator2_config = {"eval2_confg": 4} evaluator1_return_value = EvaluationResult( - metrics=EvaluationMetrics({"m1": 5}), artifacts={"a1": FakeArtifact1(uri="uri1")} + metrics={"m1": 5}, artifacts={"a1": FakeArtifact1(uri="uri1")} ) evaluator2_return_value = EvaluationResult( - metrics=EvaluationMetrics({"m2": 6}), artifacts={"a2": FakeArtifact2(uri="uri2")} + metrics={"m2": 6}, artifacts={"a2": FakeArtifact2(uri="uri2")} ) # evaluators = None is the case evaluators unspecified, it should fetch all registered diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index c88bd21d09321..69f4a08daf754 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,7 +1,6 @@ import mlflow from mlflow.models.evaluation import ( ModelEvaluator, - EvaluationMetrics, EvaluationArtifact, EvaluationResult, ) @@ -52,7 +51,7 @@ def evaluate( if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) - metrics = EvaluationMetrics(accuracy_score=accuracy_score) + metrics = {'accuracy_score': accuracy_score} self._log_metrics(run_id, metrics, dataset.name) confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv" @@ -69,9 +68,10 @@ def evaluate( elif model_type == "regressor": mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) - metrics = EvaluationMetrics( - mean_absolute_error=mean_absolute_error, mean_squared_error=mean_squared_error - ) + metrics = { + 'mean_absolute_error': mean_absolute_error, + 'mean_squared_error': mean_squared_error, + } self._log_metrics(run_id, metrics, dataset.name) artifacts = {} else: From 308568fe8ba9e51147d88825c7a1b756f7e25d6f Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 12 Jan 2022 20:03:55 +0800 Subject: [PATCH 04/14] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 59 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3a1a30f8370fe..2b1bc59ca4538 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -236,31 +236,7 @@ def __init__( self, data, *, targets=None, label_col=None, name=None, path=None, feature_names=None ): """ - :param data: One of the following: - - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or a spark DataFrame, - containing evaluation features and labels. All columns will be regarded as feature - columns except the "labels" column. - Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must - be a spark DataFrame contains a feature column of "Vector" type, and a label column. - - :param targets: (Optional) A numpy array or list of evaluation labels, if `data` is also a - numpy array or list. - - :param label_col: (Optional) The string name of a column from `data` that contains - evaluation labels, if `data` is a DataFrame. - - :param name: (Optional) The name of the dataset (must not contain "). 
- - :param path: (Optional) the path to a serialized DataFrame (must not contain "). - (e.g. a delta table, parquet file) - - :param feature_names: (Optional) If `data` argument is a feature data numpy array or list, - `feature_names` argument is a list of the feature names for each feature. If None, then - the `feature_names` will be generated using the format "feature_{feature_index}". - if `data` argument is a pandas dataframe or a spark dataframe, `feature_names` argument - is a list of the column names of the feature columns in the dataframe. If None, then - all columns except the label column will be regarded as feature columns. + The values of the constructor arguments comes from the `evaluate` call. """ import numpy as np import pandas as pd @@ -691,14 +667,35 @@ def evaluate( :param model: A pyfunc model instance, or a URI referring to such a model. + :param data: One of the following: + - A numpy array or list of evaluation features, excluding labels. + - A Pandas DataFrame, or a spark DataFrame, + containing evaluation features and labels. All columns will be regarded as feature + columns except the "labels" column. + Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must + be a spark DataFrame contains a feature column of "Vector" type, and a label column. + :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. - :param data: - :param run_id: The ID of the MLflow Run to which to log results. If - unspecified, behavior depends on the specified `evaluator`. - When `run_id` is unspecified, the default evaluator logs - results to the current active run, creating a new active run if - one does not exist. + + :param targets: (Optional) A numpy array or list of evaluation labels, if `data` is also a + numpy array or list. + + :param label_col: (Optional) The string name of a column from `data` that contains + evaluation labels, if `data` is a DataFrame. + + :param dataset_name: (Optional) The name of the dataset, must not contain double quotes (“). +. + :param dataset_path: (Optional) the path to a serialized DataFrame + (e.g. a delta table, parquet file), must not contain double quotes (“). + + :param feature_names: (Optional) If `data` argument is a feature data numpy array or list, + `feature_names` argument is a list of the feature names for each feature. If None, then + the `feature_names` will be generated using the format "feature_{feature_index}". + if `data` argument is a pandas dataframe or a spark dataframe, `feature_names` argument + is a list of the column names of the feature columns in the dataframe. If None, then + all columns except the label column will be regarded as feature columns. + :param evaluators: The name of the evaluator to use for model evaluations, or a list of evaluator names. 
If unspecified, all evaluators capable of evaluating the specified model on the specified From 37b3c693fb258741bba34320c0cab050e2c25622 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 12 Jan 2022 20:12:31 +0800 Subject: [PATCH 05/14] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 2b1bc59ca4538..8c3f0e2b00e2c 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -670,10 +670,9 @@ def evaluate( :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. - A Pandas DataFrame, or a spark DataFrame, - containing evaluation features and labels. All columns will be regarded as feature - columns except the "labels" column. - Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must - be a spark DataFrame contains a feature column of "Vector" type, and a label column. + containing evaluation features and labels. If `feature_names` argument not specified, + all columns will be regarded as feature columns, otherwise column names which match + `feature_names` will be regarded as feature columns. :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. From 165932069944e09124a9c3a966b8df54462d27e9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 11:39:50 +0800 Subject: [PATCH 06/14] address comments Signed-off-by: Weichen Xu --- mlflow/models/__init__.py | 5 +- mlflow/models/evaluation/__init__.py | 3 +- mlflow/models/evaluation/base.py | 112 +++++++++++++-------------- 3 files changed, 57 insertions(+), 63 deletions(-) diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index c40f249b23cc0..f2037bd06e61a 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -24,13 +24,16 @@ from .model import Model from .flavor_backend import FlavorBackend from ..utils.environment import infer_pip_requirements -from .evaluation import evaluate +from .evaluation import evaluate, EvaluationArtifact, EvaluationResult, list_evaluators __all__ = [ "Model", "FlavorBackend", "infer_pip_requirements", "evaluate", + "EvaluationArtifact", + "EvaluationResult", + "list_evaluators", ] diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 5e098b1a7bb24..1212a00a61ca5 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -4,7 +4,7 @@ EvaluationArtifact, evaluate, list_evaluators, - get_last_failed_evaluator, + _get_last_failed_evaluator, ) __all__ = [ @@ -13,5 +13,4 @@ "EvaluationArtifact", "evaluate", "list_evaluators", - "get_last_failed_evaluator", ] diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 8c3f0e2b00e2c..e840cf38d3111 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -233,7 +233,7 @@ class _EvaluationDataset: SPARK_DATAFRAME_LIMIT = 10000 def __init__( - self, data, *, targets=None, label_col=None, name=None, path=None, feature_names=None + self, data, *, targets, name=None, path=None, feature_names=None ): """ The values of the constructor arguments comes from the `evaluate` call. 
@@ -266,7 +266,7 @@ def __init__( if feature_names is not None and len(set(feature_names)) < len(list(feature_names)): raise ValueError( - "`feature_names` argument must be a list containing unique feature " "names." + "`feature_names` argument must be a list containing unique feature names." ) if isinstance(data, (np.ndarray, list)): @@ -307,28 +307,28 @@ def __init__( for i in range(num_features) ] elif isinstance(data, supported_dataframe_types): - if not isinstance(label_col, str): + if not isinstance(targets, str): raise ValueError( - "If data is a Pandas DataFrame or Spark DataFrame, `label_col` argument must " + "If data is a Pandas DataFrame or Spark DataFrame, `targets` argument must " "be the name of the column which contains evaluation labels in the `data` " "dataframe." ) if isinstance(data, spark_df_type): _logger.warning( - f"Specified Spark DataFrame is too large for model evaluation. Only " + "Specified Spark DataFrame is too large for model evaluation. Only " f"the first {_EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." "If you want evaluate on the whole spark dataframe, please manually call " "`spark_dataframe.toPandas()`." ) data = data.limit(_EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() - self._labels_data = data[label_col].to_numpy() + self._labels_data = data[targets].to_numpy() if feature_names is not None: self._features_data = data[list(feature_names)] self._feature_names = feature_names else: - self._features_data = data.drop(label_col, axis=1, inplace=False) + self._features_data = data.drop(targets, axis=1, inplace=False) self._feature_names = list(self._features_data.columns) else: raise ValueError( @@ -451,13 +451,11 @@ class ModelEvaluator(metaclass=ABCMeta): @abstractmethod def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool: """ - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). + :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). :param evaluator_config: A dictionary of additional configurations for the evaluator. - :param kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. + :param kwargs: For forwards compatibility, a placeholder for additional arguments + that may be added to the evaluation interface in the future. :return: True if the evaluator can evaluate the specified model on the specified dataset. False otherwise. """ @@ -469,16 +467,14 @@ def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kw The abstract API to log metrics and artifacts, and return evaluation results. :param model: A pyfunc model instance. - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). + :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). :param dataset: An instance of :py:class:`mlflow.models.evaluation.EvaluationDataset` containing features and labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. :param evaluator_config: A dictionary of additional configurations for the evaluator. - :param kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. + :param kwargs: For forwards compatibility, a placeholder for additional arguments that + may be added to the evaluation interface in the future. 
:return: An :py:class:`mlflow.models.evaluation.EvaluationResult` instance containing evaluation results. """ @@ -585,7 +581,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) _last_failed_evaluator = None -def get_last_failed_evaluator(): +def _get_last_failed_evaluator(): """ Return the evaluator name of the last failed evaluator when calling `evalaute`. This can be used to check which evaluator fail when `evaluate` API fail. @@ -653,8 +649,7 @@ def evaluate( data, *, model_type: str, - targets=None, - label_col=None, + targets, dataset_name=None, dataset_path=None, feature_names: list = None, @@ -667,45 +662,46 @@ def evaluate( :param model: A pyfunc model instance, or a URI referring to such a model. - :param data: One of the following: - - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or a spark DataFrame, - containing evaluation features and labels. If `feature_names` argument not specified, - all columns will be regarded as feature columns, otherwise column names which match - `feature_names` will be regarded as feature columns. + :param data: One of the following: A numpy array or list of evaluation features, excluding + labels. Or a Pandas DataFrame, or a spark DataFrame, containing evaluation + features and labels. If `feature_names` argument not specified, all columns are + regarded as feature columns, otherwise column names which match `feature_names` + are regarded as feature columns. :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. - :param targets: (Optional) A numpy array or list of evaluation labels, if `data` is also a - numpy array or list. - - :param label_col: (Optional) The string name of a column from `data` that contains - evaluation labels, if `data` is a DataFrame. + :param targets: If `data` is also a numpy array or list, A numpy array or list of evaluation + labels. If `data` is a DataFrame, the string name of a column from `data` + that contains evaluation labels. :param dataset_name: (Optional) The name of the dataset, must not contain double quotes (“). -. - :param dataset_path: (Optional) the path to a serialized DataFrame - (e.g. a delta table, parquet file), must not contain double quotes (“). - - :param feature_names: (Optional) If `data` argument is a feature data numpy array or list, - `feature_names` argument is a list of the feature names for each feature. If None, then - the `feature_names` will be generated using the format "feature_{feature_index}". - if `data` argument is a pandas dataframe or a spark dataframe, `feature_names` argument - is a list of the column names of the feature columns in the dataframe. If None, then - all columns except the label column will be regarded as feature columns. - - :param evaluators: The name of the evaluator to use for model evaluations, or - a list of evaluator names. If unspecified, all evaluators - capable of evaluating the specified model on the specified - dataset are used. The default evaluator can be referred to - by the name 'default'. If this argument is unspecified, then - fetch all evaluators from the registry. To get all available - evaluators, call :py:func:`mlflow.models.evaluation.list_evaluators` - :param evaluator_config: A dictionary of additional configurations to supply - to the evaluator. If multiple evaluators are - specified, each configuration should be supplied as - a nested dictionary whose key is the evaluator name. 
+ the name is logged to the `mlflow.datasets` tag. If not specified, the + dataset hash is used as the dataset name. + + :param dataset_path: (Optional) The path to a serialized DataFrame (e.g. a delta table, + parquet file), must not contain double quotes (“). If specified, + the path is logged to the `mlflow.datasets` tag. + + :param feature_names: (Optional) If the `data` argument is a feature data numpy array or list, + `feature_names` is a list of the feature names for each feature, if + `None`, then the `feature_names` are generated using the format + `feature_{feature_index}`. If the `data` argument is a Pandas DataFrame + or a Spark DataFrame, `feature_names` is a list of the names of the + feature columns in the DataFrame. If `None`, then all columns except + the label column are regarded as feature columns. + + :param evaluators: The name of the evaluator to use for model evaluations, or a list of + evaluator names. If unspecified, all evaluators capable of evaluating the + specified model on the specified dataset are used. The default evaluator + can be referred to by the name 'default'. If this argument is unspecified, + then fetch all evaluators from the registry. To get all available + evaluators, call :py:func:`mlflow.models.list_evaluators` + + :param evaluator_config: A dictionary of additional configurations to supply to the evaluator. + If multiple evaluators are specified, each configuration should be + supplied as a nested dictionary whose key is the evaluator name. + :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing evaluation results. @@ -737,9 +733,9 @@ def evaluate( The metrics/artifacts listed above will be logged into the current active mlflow run, if no active run exists, a new mlflow run will be created for logging these metrics/artifacts. - Besides the metrics and artifacts, the value of the tag `mlflow.datasets` will be logged - or appended. The content of the tag value includes the dataset metadata (name/path/hash) and - the model uuid. + Additionally, information about the specified dataset - hash, name (if specified), path + (if specified), and the UUID of the model that evaluated it - is logged to the + `mlflow.datasets` tag. The available `evaluator_config` options for the default evaluator include: @@ -758,9 +754,6 @@ def evaluate( Limitations of evaluation dataset: - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be scalar value columns, other object types (nd.array/list/etc.) are not supported yet. - - If the mlflow model to be evaluated is a pyspark ML model, then the input data must - be a spark DataFrame or pandas DataFrame contains a feature column with values of type - "pyspark.ml.linalg.Vector", and a label column. - For classifier, evaluation dataset labels must contains all distinct values, the dataset labels data will be used to infer the number of classes. 
For binary classifier, the negative label value must be 0 or -1 or False, and the positive label value must be @@ -801,7 +794,6 @@ def evaluate( dataset = _EvaluationDataset( data, targets=targets, - label_col=label_col, name=dataset_name, path=dataset_path, feature_names=feature_names, From 77c5e5177a50789a5c86e15624a9e275ff8af379 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 11:42:53 +0800 Subject: [PATCH 07/14] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index e840cf38d3111..2f9724ad820c1 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -728,10 +728,11 @@ def evaluate( false_positives/false_negatives/true_positives/recall/precision/roc_auc, precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. - The logged mlflow metric keys are constructed using the format: "{metric_name}_on_{dataset_name}". + The logged mlflow metric keys are constructed using the format: `{metric_name}_on_{dataset_name}`. + Any preexisting metrics with the same name are overwritten. - The metrics/artifacts listed above will be logged into the current active mlflow run, - if no active run exists, a new mlflow run will be created for logging these metrics/artifacts. + The metrics/artifacts listed above are logged to the active MLflow run. + If no active run exists, a new MLflow run is created for logging these metrics and artifacts. Additionally, information about the specified dataset - hash, name (if specified), path (if specified), and the UUID of the model that evaluated it - is logged to the From fbdecff5530c9c37c259b02089e4641730818d72 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 12:01:12 +0800 Subject: [PATCH 08/14] add shap limitation on value type Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 12 ++++++------ mlflow/models/evaluation/default_evaluator.py | 17 +++++++++++++---- tests/models/test_default_evaluator.py | 2 +- tests/models/test_evaluation.py | 10 +++++----- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 2f9724ad820c1..3259d4dce71a4 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -648,8 +648,8 @@ def evaluate( model: Union[str, "mlflow.pyfunc.PyFuncModel"], data, *, - model_type: str, targets, + model_type: str, dataset_name=None, dataset_path=None, feature_names: list = None, @@ -753,14 +753,14 @@ def evaluate( ROC curve and Precision-Recall curve. Limitations of evaluation dataset: - - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be - scalar value columns, other object types (nd.array/list/etc.) are not supported yet. - For classifier, evaluation dataset labels must contains all distinct values, the dataset - labels data will be used to infer the number of classes. For binary classifier, the + labels data will be used to infer the number of classes. + - For binary classifier, the negative label value must be 0 or -1 or False, and the positive label value must be 1 or True. - For multiclass classifier, if logging explainability insights enabled, the label values - must be number type. 
+ - If logging explainability insights enabled, the label values + must be number type, and all feature values must be number type and each feature column + must only contain scaler values. Limitations of metrics/artifacts computation: - For classifier, some metrics and plot computation require model provides diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 578e961f131f3..d91386216b700 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -332,15 +332,24 @@ def _log_model_explainability(self): ) return - if self.model_type == "classifier" and not all( - [isinstance(label, (numbers.Number, np.bool_)) for label in self.label_list] - ): + if not (np.issubdtype(self.y.dtype, np.number) or self.y.dtype == np.bool_): + # Note: python bool type inherits number type but np.bool_ does not inherit np.number. _logger.warning( "Skip logging model explainability insights because it requires all label " - "values to be Number type." + "values to be number type or bool type." ) return + feature_dtypes = list(self.X.dtypes) if isinstance(self.X, pd.DataFrame) else [self.X.dtype] + for feature_dtype in feature_dtypes: + if not np.issubdtype(feature_dtype, np.number): + _logger.warning( + "Skip logging model explainability insights because it requires all feature " + "values to be number type, and each feature column must only contain scaler " + "values." + ) + return + try: import shap import matplotlib.pyplot as pyplot diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index b1cacb6c72d2a..f12a5c698a036 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -193,7 +193,7 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab spark_linear_regressor_model_uri, diabetes_spark_dataset._constructor_args["data"], model_type="regressor", - label_col=diabetes_spark_dataset._constructor_args["label_col"], + targets=diabetes_spark_dataset._constructor_args["targets"], dataset_name=diabetes_spark_dataset.name, evaluators="default", evaluator_config={"log_model_explainability": True}, diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 958f7629ec1d6..bfbf18307f43f 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -117,7 +117,7 @@ def diabetes_dataset(): @pytest.fixture(scope="module") def diabetes_spark_dataset(): spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) - constructor_args = {"data": spark_df, "label_col": "label", "name": "diabetes_spark_dataset"} + constructor_args = {"data": spark_df, "targets": "label", "name": "diabetes_spark_dataset"} ds = _EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -211,7 +211,7 @@ def iris_pandas_df_dataset(): "y": eval_y, } ) - return _EvaluationDataset(data=data, label_col="y", name="iris_pandas_df_dataset") + return _EvaluationDataset(data=data, targets="y", name="iris_pandas_df_dataset") def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): @@ -358,7 +358,7 @@ def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_datas def test_dataset_with_pandas_dataframe(): data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "f3": [5, 6], "label": [0, 1]}) - eval_dataset = _EvaluationDataset(data=data, label_col="label") + eval_dataset = _EvaluationDataset(data=data, 
targets="label") assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"] assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) @@ -366,7 +366,7 @@ def test_dataset_with_pandas_dataframe(): assert np.array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6]) assert np.array_equal(eval_dataset.labels_data, [0, 1]) - eval_dataset2 = _EvaluationDataset(data=data, label_col="label", feature_names=["f3", "f2"]) + eval_dataset2 = _EvaluationDataset(data=data, targets="label", feature_names=["f3", "f2"]) assert list(eval_dataset2.features_data.columns) == ["f3", "f2"] assert np.array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4]) assert np.array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6]) @@ -413,7 +413,7 @@ def test_dataset_autogen_feature_names(): def test_dataset_from_spark_df(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) with mock.patch.object(_EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): - dataset = _EvaluationDataset(spark_df, label_col="y") + dataset = _EvaluationDataset(spark_df, targets="y") assert list(dataset.features_data.columns) == ["f1", "f2"] assert list(dataset.features_data["f1"]) == [1.0] * 5 assert list(dataset.features_data["f2"]) == [2.0] * 5 From 953612fbaae3e2ee3cb24cf156acdba26bc5326b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 12:23:55 +0800 Subject: [PATCH 09/14] fix format Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +--- .../mlflow_test_plugin/dummy_evaluator.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3259d4dce71a4..e229e76d96986 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -232,9 +232,7 @@ class _EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__( - self, data, *, targets, name=None, path=None, feature_names=None - ): + def __init__(self, data, *, targets, name=None, path=None, feature_names=None): """ The values of the constructor arguments comes from the `evaluate` call. 
""" diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 69f4a08daf754..1722e92146bfe 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -51,7 +51,7 @@ def evaluate( if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) - metrics = {'accuracy_score': accuracy_score} + metrics = {"accuracy_score": accuracy_score} self._log_metrics(run_id, metrics, dataset.name) confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv" @@ -69,8 +69,8 @@ def evaluate( mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) metrics = { - 'mean_absolute_error': mean_absolute_error, - 'mean_squared_error': mean_squared_error, + "mean_absolute_error": mean_absolute_error, + "mean_squared_error": mean_squared_error, } self._log_metrics(run_id, metrics, dataset.name) artifacts = {} From 3572198f204fb3d69a3181eb7c83e21838467de5 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 15:45:38 +0800 Subject: [PATCH 10/14] update Signed-off-by: Weichen Xu --- .../mlflow.models.evaluation.base.rst | 7 ------ mlflow/models/evaluation/__init__.py | 1 - mlflow/models/evaluation/base.py | 22 +++++++++---------- mlflow/models/evaluation/default_evaluator.py | 4 ++-- 4 files changed, 12 insertions(+), 22 deletions(-) delete mode 100644 docs/source/python_api/mlflow.models.evaluation.base.rst diff --git a/docs/source/python_api/mlflow.models.evaluation.base.rst b/docs/source/python_api/mlflow.models.evaluation.base.rst deleted file mode 100644 index 501303a39735a..0000000000000 --- a/docs/source/python_api/mlflow.models.evaluation.base.rst +++ /dev/null @@ -1,7 +0,0 @@ -mlflow.models.evaluation.base -============================= - -.. automodule:: mlflow.models.evaluation.base - :members: - :undoc-members: - :show-inheritance: diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 1212a00a61ca5..aacce2391145b 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -4,7 +4,6 @@ EvaluationArtifact, evaluate, list_evaluators, - _get_last_failed_evaluator, ) __all__ = [ diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index e229e76d96986..63e25fc4802e1 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -139,7 +139,7 @@ def metrics(self) -> Dict[str, Any]: return self._metrics @property - def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: + def artifacts(self) -> Dict[str, "mlflow.models.EvaluationArtifact"]: """ A dictionary mapping standardized artifact names (e.g. "roc_data") to artifact content and location information @@ -225,7 +225,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): class _EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the - :py:func:`mlflow.models.evaluation.evaluate()` + :py:func:`mlflow.models.evaluate()` API. 
""" @@ -426,7 +426,6 @@ def __hash__(self): def __eq__(self, other): import numpy as np - import pandas as pd if not isinstance(other, _EvaluationDataset): return False @@ -466,14 +465,14 @@ def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kw :param model: A pyfunc model instance. :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). - :param dataset: An instance of :py:class:`mlflow.models.evaluation.EvaluationDataset` + :param dataset: An instance of `mlflow.models.evaluation.base._EvaluationDataset` containing features and labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. :param evaluator_config: A dictionary of additional configurations for the evaluator. :param kwargs: For forwards compatibility, a placeholder for additional arguments that may be added to the evaluation interface in the future. - :return: An :py:class:`mlflow.models.evaluation.EvaluationResult` instance containing + :return: An :py:class:`mlflow.models.EvaluationResult` instance containing evaluation results. """ raise NotImplementedError() @@ -653,7 +652,7 @@ def evaluate( feature_names: list = None, evaluators=None, evaluator_config=None, -) -> "mlflow.models.evaluation.EvaluationResult": +): """ Evaluate a PyFunc model on the specified dataset using one or more specified evaluators, and log resulting metrics & artifacts to MLflow Tracking. @@ -700,7 +699,7 @@ def evaluate( If multiple evaluators are specified, each configuration should be supplied as a nested dictionary whose key is the evaluator name. - :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing + :return: An :py:class:`mlflow.models.EvaluationResult` instance containing evaluation results. The default evaluator supports the 'regressor' and 'classifer' model types. @@ -726,8 +725,8 @@ def evaluate( false_positives/false_negatives/true_positives/recall/precision/roc_auc, precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. - The logged mlflow metric keys are constructed using the format: `{metric_name}_on_{dataset_name}`. - Any preexisting metrics with the same name are overwritten. + The logged mlflow metric keys are constructed using the format: + `{metric_name}_on_{dataset_name}`. Any preexisting metrics with the same name are overwritten. The metrics/artifacts listed above are logged to the active MLflow run. If no active run exists, a new MLflow run is created for logging these metrics and artifacts. @@ -756,9 +755,6 @@ def evaluate( - For binary classifier, the negative label value must be 0 or -1 or False, and the positive label value must be 1 or True. - - If logging explainability insights enabled, the label values - must be number type, and all feature values must be number type and each feature column - must only contain scaler values. Limitations of metrics/artifacts computation: - For classifier, some metrics and plot computation require model provides @@ -772,6 +768,8 @@ def evaluate( support multi-class classifier, in this case, default evaluator will fallback to use shap Exact or Permutation explainer. - Logging model explainability insights is not currently supported for PySpark models. + - The evaluation dataset label values must be number type, and all feature values must be + number type and each feature column must only contain scalar values. 
""" from mlflow.pyfunc import PyFuncModel diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d91386216b700..fb4c33efbff40 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -325,7 +325,7 @@ def _log_model_explainability(self): # but spark model input dataframe contains Vector type feature column # which shap explainer does not support. # To support this, we need expand the Vector type feature column into - # multiple scaler feature columns and pass it to shap explainer. + # multiple scalar feature columns and pass it to shap explainer. _logger.warning( "Logging model explainability insights is not currently supported for PySpark " "models." @@ -345,7 +345,7 @@ def _log_model_explainability(self): if not np.issubdtype(feature_dtype, np.number): _logger.warning( "Skip logging model explainability insights because it requires all feature " - "values to be number type, and each feature column must only contain scaler " + "values to be number type, and each feature column must only contain scalar " "values." ) return From 99146cf5c3c95259a6e6079a67aa7a48042a06a1 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 13 Jan 2022 16:22:08 +0800 Subject: [PATCH 11/14] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 63e25fc4802e1..4fcd98b5c70ff 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -665,13 +665,13 @@ def evaluate( regarded as feature columns, otherwise column names which match `feature_names` are regarded as feature columns. - :param model_type: A string describing the model type. The default evaluator - supports "regressor" and "classifier" as model types. - :param targets: If `data` is also a numpy array or list, A numpy array or list of evaluation labels. If `data` is a DataFrame, the string name of a column from `data` that contains evaluation labels. + :param model_type: A string describing the model type. The default evaluator + supports "regressor" and "classifier" as model types. + :param dataset_name: (Optional) The name of the dataset, must not contain double quotes (“). the name is logged to the `mlflow.datasets` tag. If not specified, the dataset hash is used as the dataset name. From 8cd1182bdf39b06e94818c3f31ee2edb077173aa Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 14 Jan 2022 12:08:44 +0800 Subject: [PATCH 12/14] update Signed-off-by: Weichen Xu --- .../python_api/mlflow.models.evaluation.rst | 8 - mlflow/models/evaluation/__init__.py | 2 + mlflow/models/evaluation/artifacts.py | 4 +- mlflow/models/evaluation/base.py | 193 +++++++++--------- mlflow/models/evaluation/default_evaluator.py | 8 +- tests/models/test_evaluation.py | 50 ++--- .../mlflow_test_plugin/dummy_evaluator.py | 4 +- 7 files changed, 131 insertions(+), 138 deletions(-) delete mode 100644 docs/source/python_api/mlflow.models.evaluation.rst diff --git a/docs/source/python_api/mlflow.models.evaluation.rst b/docs/source/python_api/mlflow.models.evaluation.rst deleted file mode 100644 index 4034f6a5d0c4c..0000000000000 --- a/docs/source/python_api/mlflow.models.evaluation.rst +++ /dev/null @@ -1,8 +0,0 @@ -mlflow.models.evaluation -======================== - -.. 
automodule:: mlflow.models.evaluation - :members: - :undoc-members: - :show-inheritance: - diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index aacce2391145b..094fd4d98f3ff 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,5 +1,6 @@ from mlflow.models.evaluation.base import ( ModelEvaluator, + EvaluationDataset, EvaluationResult, EvaluationArtifact, evaluate, @@ -8,6 +9,7 @@ __all__ = [ "ModelEvaluator", + "EvaluationDataset", "EvaluationResult", "EvaluationArtifact", "evaluate", diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py index 6343d6e15aa48..3765df56cde04 100644 --- a/mlflow/models/evaluation/artifacts.py +++ b/mlflow/models/evaluation/artifacts.py @@ -4,7 +4,7 @@ class ImageEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): + def _save(self, output_artifact_path): self._content.save(output_artifact_path) def _load_content_from_file(self, local_artifact_path): @@ -15,7 +15,7 @@ def _load_content_from_file(self, local_artifact_path): class CsvEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): + def _save(self, output_artifact_path): self._content.to_csv(output_artifact_path, index=False) def _load_content_from_file(self, local_artifact_path): diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 4fcd98b5c70ff..918538a118b00 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -39,7 +39,7 @@ def _load_content_from_file(self, local_artifact_path): """ pass - def load(self, local_artifact_path=None): + def _load(self, local_artifact_path=None): """ If `local_artifact_path` is None, download artifact from the artifact uri, otherwise load artifact content from specified path. @@ -56,7 +56,7 @@ def load(self, local_artifact_path=None): return self._content @abstractmethod - def save(self, output_artifact_path): + def _save(self, output_artifact_path): """Save artifact content into specified path.""" pass @@ -66,7 +66,7 @@ def content(self): The content of the artifact (representation varies) """ if self._content is None: - self.load() + self._load() return self._content @property @@ -79,8 +79,8 @@ def uri(self) -> str: class EvaluationResult: """ - Represent an return value of `mlflow.evaluate()` API. Contains metrics dict and - artifact dict. + Represents the model evaluation outputs of a `mlflow.evaluate()` API call, containing + both scalar metrics and output artifacts such as performance plots. 
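As a rough usage sketch of the result object (assuming `result` is the return value of an `mlflow.models.evaluate` call and that the metric key shown exists for the model type being evaluated):

    from mlflow.models.evaluation import EvaluationResult

    # Scalar metrics are exposed as a plain dict keyed by metric name.
    print(result.metrics.get("accuracy_score"))

    # Artifacts map artifact names to EvaluationArtifact objects exposing `uri` and `content`.
    for name, artifact in result.artifacts.items():
        print(name, artifact.uri)

    # Results can be written to a local directory and reloaded later.
    result.save("/tmp/eval_result")
    reloaded = EvaluationResult.load("/tmp/eval_result")
    print(reloaded.metrics)
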
""" def __init__(self, metrics, artifacts): @@ -104,7 +104,7 @@ def load(cls, path): uri = meta["uri"] ArtifactCls = _get_class_from_string(meta["class_name"]) artifact = ArtifactCls(uri=uri) - artifact.load(os.path.join(artifacts_dir, artifact_name)) + artifact._load(os.path.join(artifacts_dir, artifact_name)) artifacts[artifact_name] = artifact return EvaluationResult(metrics=metrics, artifacts=artifacts) @@ -129,7 +129,7 @@ def save(self, path): os.mkdir(artifacts_dir) for artifact_name, artifact in self.artifacts.items(): - artifact.save(os.path.join(artifacts_dir, artifact_name)) + artifact._save(os.path.join(artifacts_dir, artifact_name)) @property def metrics(self) -> Dict[str, Any]: @@ -213,16 +213,16 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) - if len(data) < _EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: - head_rows = data[: _EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - tail_rows = data[-_EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) -class _EvaluationDataset: +class EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the :py:func:`mlflow.models.evaluate()` @@ -314,11 +314,11 @@ def __init__(self, data, *, targets, name=None, path=None, feature_names=None): if isinstance(data, spark_df_type): _logger.warning( "Specified Spark DataFrame is too large for model evaluation. Only " - f"the first {_EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." "If you want evaluate on the whole spark dataframe, please manually call " "`spark_dataframe.toPandas()`." ) - data = data.limit(_EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() self._labels_data = data[targets].to_numpy() @@ -427,7 +427,7 @@ def __hash__(self): def __eq__(self, other): import numpy as np - if not isinstance(other, _EvaluationDataset): + if not isinstance(other, EvaluationDataset): return False if isinstance(self._features_data, np.ndarray): @@ -659,13 +659,16 @@ def evaluate( :param model: A pyfunc model instance, or a URI referring to such a model. - :param data: One of the following: A numpy array or list of evaluation features, excluding - labels. Or a Pandas DataFrame, or a spark DataFrame, containing evaluation - features and labels. If `feature_names` argument not specified, all columns are - regarded as feature columns, otherwise column names which match `feature_names` - are regarded as feature columns. + :param data: One of the following: - :param targets: If `data` is also a numpy array or list, A numpy array or list of evaluation + - A numpy array or list of evaluation features, excluding labels. + + - A Pandas DataFrame or Spark DataFrame, containing evaluation features and + labels. If `feature_names` argument not specified, all columns are regarded + as feature columns. Otherwise, only column names present in `feature_names` + are regarded as feature columns. 
+ + :param targets: If `data` is a numpy array or list, a numpy array or list of evaluation labels. If `data` is a DataFrame, the string name of a column from `data` that contains evaluation labels. @@ -676,24 +679,23 @@ def evaluate( the name is logged to the `mlflow.datasets` tag. If not specified, the dataset hash is used as the dataset name. - :param dataset_path: (Optional) The path to a serialized DataFrame (e.g. a delta table, - parquet file), must not contain double quotes (“). If specified, - the path is logged to the `mlflow.datasets` tag. + :param dataset_path: (Optional) The path where the data is stored. Must not contain double + quotes (“). If specified, the path is logged to the `mlflow.datasets` + tag for lineage tracking purposes. :param feature_names: (Optional) If the `data` argument is a feature data numpy array or list, - `feature_names` is a list of the feature names for each feature, if + `feature_names` is a list of the feature names for each feature. If `None`, then the `feature_names` are generated using the format `feature_{feature_index}`. If the `data` argument is a Pandas DataFrame or a Spark DataFrame, `feature_names` is a list of the names of the feature columns in the DataFrame. If `None`, then all columns except the label column are regarded as feature columns. - :param evaluators: The name of the evaluator to use for model evaluations, or a list of + :param evaluators: The name of the evaluator to use for model evaluation, or a list of evaluator names. If unspecified, all evaluators capable of evaluating the specified model on the specified dataset are used. The default evaluator - can be referred to by the name 'default'. If this argument is unspecified, - then fetch all evaluators from the registry. To get all available - evaluators, call :py:func:`mlflow.models.list_evaluators` + can be referred to by the name 'default'. To see all available evaluators, + call :py:func:`mlflow.models.list_evaluators`. :param evaluator_config: A dictionary of additional configurations to supply to the evaluator. If multiple evaluators are specified, each configuration should be @@ -702,74 +704,71 @@ def evaluate( :return: An :py:class:`mlflow.models.EvaluationResult` instance containing evaluation results. - The default evaluator supports the 'regressor' and 'classifer' model types. - - For both the 'regressor' and 'classifer' types, the default evaluator will generate model - summary plots and feature importance plots generated by shap explainer. - - For regressor model, the default evaluator will additionally log: - - - **metrics**: example_count, mean_absolute_error, mean_squared_error, root_mean_squared_error, - sum_on_label, mean_on_label, r2_score, max_error, mean_absolute_percentage_error. - - For binary classifier, the default evaluator will additionally log: - - - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, - precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc. - - **artifacts**: lift curve plot, precision-recall plot, ROC plot. - - For multiclass classifier, the default evaluator will additionally log: - - - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss - - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics includes true_negatives/ - false_positives/false_negatives/true_positives/recall/precision/roc_auc, - precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. 
- - The logged mlflow metric keys are constructed using the format: - `{metric_name}_on_{dataset_name}`. Any preexisting metrics with the same name are overwritten. - - The metrics/artifacts listed above are logged to the active MLflow run. - If no active run exists, a new MLflow run is created for logging these metrics and artifacts. - - Additionally, information about the specified dataset - hash, name (if specified), path - (if specified), and the UUID of the model that evaluated it - is logged to the - `mlflow.datasets` tag. - - The available `evaluator_config` options for the default evaluator include: - - - **log_model_explainability**: A boolean value specifying whether or not to log model - explainability insights, default value is True. - - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model - explainability. Supported algorithm includes: 'exact', 'permutation', 'partition'. - If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best - Explainer based on the model. - - **explainability_nsamples**: The number of sample rows to use for computing model - explainability insights. Default value is 2000. - - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**: - For multiclass classifier, specify the max number of classes which allow logging per-class - ROC curve and Precision-Recall curve. - - Limitations of evaluation dataset: - - For classifier, evaluation dataset labels must contains all distinct values, the dataset - labels data will be used to infer the number of classes. - - For binary classifier, the - negative label value must be 0 or -1 or False, and the positive label value must be - 1 or True. - - Limitations of metrics/artifacts computation: - - For classifier, some metrics and plot computation require model provides - "predict probability" function. Currently, for sklearn model, we will extract "predict_proba" - method from the raw model to achieve this, for other model, it will skip logging - metrics/artifacts which require probability prediction. - - Limitations of default evaluator logging model explainability insights: - - The `shap.Explainer` "auto" algorithm will choose Linear explainer for linear model, - and choose Tree explainer for tree model. But the shap Linear/Tree explainer does not - support multi-class classifier, in this case, default evaluator will fallback to use - shap Exact or Permutation explainer. - - Logging model explainability insights is not currently supported for PySpark models. - - The evaluation dataset label values must be number type, and all feature values must be - number type and each feature column must only contain scalar values. + Default Evaluator behavior: + - The default evaluator supports the 'regressor' and 'classifer' model types. + - For both the 'regressor' and 'classifer' types, the default evaluator will generate model + summary plots and feature importance plots generated by shap explainer. + - For regressor model, the default evaluator will additionally log: + - **metrics**: example_count, mean_absolute_error, mean_squared_error, + root_mean_squared_error, sum_on_label, mean_on_label, r2_score, max_error, + mean_absolute_percentage_error. + + - For binary classifier, the default evaluator will additionally log: + - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, + precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc. 
+ - **artifacts**: lift curve plot, precision-recall plot, ROC plot. + + - For multiclass classifier, the default evaluator will additionally log: + - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss + - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics includes + true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc, + precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. + + - The logged Mlflow metric keys are constructed using the format: + `{metric_name}_on_{dataset_name}`. Any preexisting metrics with the same name are + overwritten. + + - The metrics/artifacts listed above are logged to the active MLflow run. + If no active run exists, a new MLflow run is created for logging these metrics and + artifacts. + + - Additionally, information about the specified dataset - hash, name (if specified), path + (if specified), and the UUID of the model that evaluated it - is logged to the + `mlflow.datasets` tag. + + - The available `evaluator_config` options for the default evaluator include: + - **log_model_explainability**: A boolean value specifying whether or not to log model + explainability insights, default value is True. + - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model + explainability. Supported algorithm includes: 'exact', 'permutation', 'partition'. + If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best + Explainer based on the model. + - **explainability_nsamples**: The number of sample rows to use for computing model + explainability insights. Default value is 2000. + - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**: + For multiclass classifier, specify the max number of classes which allow logging + per-class ROC curve and Precision-Recall curve. + + - Limitations of evaluation dataset: + - For classification tasks, dataset labels are used to infer the total number of classes. + - For binary classification tasks, the negative label value must be 0 or -1 or False, and + - the positive label value must be 1 or True. + + - Limitations of metrics/artifacts computation: + - For classification tasks, some metric and artifact computations require the model to + output class probabilities. Currently, for scikit-learn models, the default evaluator + calls the "predict_proba" method on the underlying model to obtain probabilities. For + other model types, the default evaluator does not compute metrics/artifacts that require + probability outputs. + + - Limitations of default evaluator logging model explainability insights: + - The `shap.Explainer` "auto" algorithm will use the Linear explainer for linear models + and the Tree explainer for tree models. Because SHAP's Linear and Tree explainers + do not support multi-class classification, the default evaluator will fall back to using + the Exact or Permutation explainers for multi-class classification tasks. + - Logging model explainability insights is not currently supported for PySpark models. + - The evaluation dataset label values must be numeric or boolean, all feature values + must be numeric, and each feature column must only contain scalar values. 
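Taken together, a hedged end-to-end sketch of the documented behavior (the scikit-learn model, dataset name, and config values below are illustrative, not part of this change):

    import mlflow
    import mlflow.sklearn
    from mlflow.models import evaluate
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    data = X.assign(label=y)  # numeric features plus a 0/1 label column

    model = LogisticRegression(max_iter=1000).fit(X, y)

    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "model")
        result = evaluate(
            mlflow.get_artifact_uri("model"),
            data,  # evaluating on the training frame only to keep the sketch short
            targets="label",
            model_type="classifier",
            dataset_name="breast_cancer_dataset",
            evaluators="default",
            evaluator_config={
                "log_model_explainability": True,
                "explainability_nsamples": 500,
            },
        )
        print(result.metrics)
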
""" from mlflow.pyfunc import PyFuncModel @@ -788,7 +787,7 @@ def evaluate( evaluator_name_to_conf_map, ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) - dataset = _EvaluationDataset( + dataset = EvaluationDataset( data, targets=targets, name=dataset_name, diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index fb4c33efbff40..a61aff411bca0 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -301,7 +301,7 @@ def _log_image_artifact( mlflow.log_artifact(artifact_file_local_path) artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) - artifact.load(artifact_file_local_path) + artifact._load(artifact_file_local_path) self.artifacts[artifact_name] = artifact def _log_pandas_df_artifact(self, pandas_df, artifact_name): @@ -313,7 +313,7 @@ def _log_pandas_df_artifact(self, pandas_df, artifact_name): uri=mlflow.get_artifact_uri(artifact_file_name), content=pandas_df, ) - artifact.load(artifact_file_local_path) + artifact._load(artifact_file_local_path) self.artifacts[artifact_name] = artifact def _log_model_explainability(self): @@ -336,7 +336,7 @@ def _log_model_explainability(self): # Note: python bool type inherits number type but np.bool_ does not inherit np.number. _logger.warning( "Skip logging model explainability insights because it requires all label " - "values to be number type or bool type." + "values to be numeric or boolean." ) return @@ -345,7 +345,7 @@ def _log_model_explainability(self): if not np.issubdtype(feature_dtype, np.number): _logger.warning( "Skip logging model explainability insights because it requires all feature " - "values to be number type, and each feature column must only contain scalar " + "values to be numeric, and each feature column must only contain scalar " "values." 
) return diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index bfbf18307f43f..219c095e50c25 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -8,7 +8,7 @@ EvaluationArtifact, ) from mlflow.models.evaluation.base import ( - _EvaluationDataset, + EvaluationDataset, _normalize_evaluators_and_evaluator_config_args as _normalize_config, ) import hashlib @@ -99,7 +99,7 @@ def iris_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] constructor_args = {"data": eval_X, "targets": eval_y, "name": "iris_dataset"} - ds = _EvaluationDataset(**constructor_args) + ds = EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -109,7 +109,7 @@ def diabetes_dataset(): X, y = get_diabetes_dataset() eval_X, eval_y = X[0::3], y[0::3] constructor_args = {"data": eval_X, "targets": eval_y, "name": "diabetes_dataset"} - ds = _EvaluationDataset(**constructor_args) + ds = EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -118,7 +118,7 @@ def diabetes_dataset(): def diabetes_spark_dataset(): spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) constructor_args = {"data": spark_df, "targets": "label", "name": "diabetes_spark_dataset"} - ds = _EvaluationDataset(**constructor_args) + ds = EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -128,7 +128,7 @@ def breast_cancer_dataset(): X, y = get_breast_cancer_dataset() eval_X, eval_y = X[0::3], y[0::3] constructor_args = {"data": eval_X, "targets": eval_y, "name": "breast_cancer_dataset"} - ds = _EvaluationDataset(**constructor_args) + ds = EvaluationDataset(**constructor_args) ds._constructor_args = constructor_args return ds @@ -211,7 +211,7 @@ def iris_pandas_df_dataset(): "y": eval_y, } ) - return _EvaluationDataset(data=data, targets="y", name="iris_pandas_df_dataset") + return EvaluationDataset(data=data, targets="y", name="iris_pandas_df_dataset") def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): @@ -249,7 +249,7 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas confusion_matrix_artifact = eval_result.artifacts[artifact_name] assert np.array_equal(confusion_matrix_artifact.content, expected_artifact) assert confusion_matrix_artifact.uri == get_artifact_uri(run.info.run_id, artifact_name) - assert np.array_equal(confusion_matrix_artifact.load(saved_artifact_path), expected_artifact) + assert np.array_equal(confusion_matrix_artifact._load(saved_artifact_path), expected_artifact) with TempDir() as temp_dir: temp_dir_path = temp_dir.path() @@ -278,7 +278,7 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas ) new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri) - new_confusion_matrix_artifact.load() + new_confusion_matrix_artifact._load() assert np.array_equal( confusion_matrix_artifact.content, new_confusion_matrix_artifact.content, @@ -317,15 +317,15 @@ def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): def test_dataset_name(): X, y = get_iris() - d1 = _EvaluationDataset(data=X, targets=y, name="a1") + d1 = EvaluationDataset(data=X, targets=y, name="a1") assert d1.name == "a1" - d2 = _EvaluationDataset(data=X, targets=y) + d2 = EvaluationDataset(data=X, targets=y) assert d2.name == d2.hash def test_dataset_metadata(): X, y = get_iris() - d1 = _EvaluationDataset(data=X, targets=y, name="a1", 
path="/path/to/a1") + d1 = EvaluationDataset(data=X, targets=y, name="a1", path="/path/to/a1") assert d1._metadata == { "hash": "6bdf4e119bf1a37e7907dfd9f0e68733", "name": "a1", @@ -358,7 +358,7 @@ def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_datas def test_dataset_with_pandas_dataframe(): data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "f3": [5, 6], "label": [0, 1]}) - eval_dataset = _EvaluationDataset(data=data, targets="label") + eval_dataset = EvaluationDataset(data=data, targets="label") assert list(eval_dataset.features_data.columns) == ["f1", "f2", "f3"] assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) @@ -366,7 +366,7 @@ def test_dataset_with_pandas_dataframe(): assert np.array_equal(eval_dataset.features_data.f3.to_numpy(), [5, 6]) assert np.array_equal(eval_dataset.labels_data, [0, 1]) - eval_dataset2 = _EvaluationDataset(data=data, targets="label", feature_names=["f3", "f2"]) + eval_dataset2 = EvaluationDataset(data=data, targets="label", feature_names=["f3", "f2"]) assert list(eval_dataset2.features_data.columns) == ["f3", "f2"] assert np.array_equal(eval_dataset2.features_data.f2.to_numpy(), [3, 4]) assert np.array_equal(eval_dataset2.features_data.f3.to_numpy(), [5, 6]) @@ -377,43 +377,43 @@ def test_dataset_with_array_data(): labels = [0, 1] for input_data in [features, np.array(features)]: - eval_dataset1 = _EvaluationDataset(data=input_data, targets=labels) + eval_dataset1 = EvaluationDataset(data=input_data, targets=labels) assert np.array_equal(eval_dataset1.features_data, features) assert np.array_equal(eval_dataset1.labels_data, labels) assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] - assert _EvaluationDataset( + assert EvaluationDataset( data=input_data, targets=labels, feature_names=["a", "b"] ).feature_names == ["a", "b"] with pytest.raises(ValueError, match="all element must has the same length"): - _EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels) + EvaluationDataset(data=[[1, 2], [3, 4, 5]], targets=labels) def test_dataset_autogen_feature_names(): labels = [0] - eval_dataset2 = _EvaluationDataset(data=[list(range(9))], targets=labels) + eval_dataset2 = EvaluationDataset(data=[list(range(9))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] - eval_dataset2 = _EvaluationDataset(data=[list(range(10))], targets=labels) + eval_dataset2 = EvaluationDataset(data=[list(range(10))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] - eval_dataset2 = _EvaluationDataset(data=[list(range(99))], targets=labels) + eval_dataset2 = EvaluationDataset(data=[list(range(99))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] - eval_dataset2 = _EvaluationDataset(data=[list(range(100))], targets=labels) + eval_dataset2 = EvaluationDataset(data=[list(range(100))], targets=labels) assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] with pytest.raises( ValueError, match="features example rows must be the same length with labels array" ): - _EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3]) + EvaluationDataset(data=[[1, 2], [3, 4]], targets=[1, 2, 3]) def test_dataset_from_spark_df(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) - with mock.patch.object(_EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): - dataset = _EvaluationDataset(spark_df, targets="y") + with 
mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): + dataset = EvaluationDataset(spark_df, targets="y") assert list(dataset.features_data.columns) == ["f1", "f2"] assert list(dataset.features_data["f1"]) == [1.0] * 5 assert list(dataset.features_data["f2"]) == [2.0] * 5 @@ -469,7 +469,7 @@ def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kw class FakeArtifact1(EvaluationArtifact): - def save(self, output_artifact_path): + def _save(self, output_artifact_path): raise RuntimeError() def _load_content_from_file(self, local_artifact_path): @@ -477,7 +477,7 @@ def _load_content_from_file(self, local_artifact_path): class FakeArtifact2(EvaluationArtifact): - def save(self, output_artifact_path): + def _save(self, output_artifact_path): raise RuntimeError() def _load_content_from_file(self, local_artifact_path): diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 1722e92146bfe..6ab0f4f701fe6 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -13,7 +13,7 @@ class Array2DEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): + def _save(self, output_artifact_path): pd.DataFrame(self._content).to_csv(output_artifact_path, index=False) def _load_content_from_file(self, local_artifact_path): @@ -60,7 +60,7 @@ def evaluate( content=confusion_matrix, ) confusion_matrix_csv_buff = io.StringIO() - confusion_matrix_artifact.save(confusion_matrix_csv_buff) + confusion_matrix_artifact._save(confusion_matrix_csv_buff) client.log_text( run_id, confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name ) From 9a40df359d24acd75af81ed96f54eec5dca16692 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 14 Jan 2022 12:11:44 +0800 Subject: [PATCH 13/14] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 918538a118b00..079fb7403ac82 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -676,8 +676,8 @@ def evaluate( supports "regressor" and "classifier" as model types. :param dataset_name: (Optional) The name of the dataset, must not contain double quotes (“). - the name is logged to the `mlflow.datasets` tag. If not specified, the - dataset hash is used as the dataset name. + The name is logged to the `mlflow.datasets` tag for lineage tracking + purposes. If not specified, the dataset hash is used as the dataset name. :param dataset_path: (Optional) The path where the data is stored. Must not contain double quotes (“). If specified, the path is logged to the `mlflow.datasets` From c3ae6238431383c833e80a2c8e9c4bd3f9cbe4a3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 14 Jan 2022 12:38:33 +0800 Subject: [PATCH 14/14] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 9 +++++---- mlflow/models/evaluation/default_evaluator.py | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 079fb7403ac82..4cdb3ba91be59 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -745,14 +745,15 @@ def evaluate( Explainer based on the model. 
- **explainability_nsamples**: The number of sample rows to use for computing model explainability insights. Default value is 2000. - - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**: - For multiclass classifier, specify the max number of classes which allow logging - per-class ROC curve and Precision-Recall curve. + - **max_classes_for_multiclass_roc_pr**: + For multiclass classification tasks, the maximum number of classes for which to log + the per-class ROC curve and Precision-Recall curve. If the number of classes is + larger than the configured maximum, these curves are not logged. - Limitations of evaluation dataset: - For classification tasks, dataset labels are used to infer the total number of classes. - For binary classification tasks, the negative label value must be 0 or -1 or False, and - - the positive label value must be 1 or True. + the positive label value must be 1 or True. - Limitations of metrics/artifacts computation: - For classification tasks, some metric and artifact computations require the model to diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index a61aff411bca0..dc2e6d12709c9 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -497,17 +497,16 @@ def _log_multiclass_classifier(self): log_roc_pr_curve = False if self.y_probs is not None: - max_num_classes_for_logging_curve = self.evaluator_config.get( - "max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier", 10 + max_classes_for_multiclass_roc_pr = self.evaluator_config.get( + "max_classes_for_multiclass_roc_pr", 10 ) - if self.num_classes <= max_num_classes_for_logging_curve: + if self.num_classes <= max_classes_for_multiclass_roc_pr: log_roc_pr_curve = True else: _logger.warning( - f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip " + f"The classifier num_classes > {max_classes_for_multiclass_roc_pr}, skip " f"logging ROC curve and Precision-Recall curve. You can add evaluator config " - f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' " - f"to increase the threshold." + f"'max_classes_for_multiclass_roc_pr' to increase the threshold." ) if log_roc_pr_curve:
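Finally, a short sketch of passing the renamed option for a classifier with more than the default of 10 classes (the synthetic data and model below are illustrative only):

    import numpy as np
    import pandas as pd
    import mlflow
    import mlflow.sklearn
    from mlflow.models import evaluate
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(600, 5)), columns=[f"f{i}" for i in range(5)])
    y = rng.integers(0, 12, size=600)  # 12 classes, above the default threshold of 10
    data = X.assign(label=y)

    model = LogisticRegression(max_iter=500).fit(X, y)

    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "model")
        result = evaluate(
            mlflow.get_artifact_uri("model"),
            data,
            targets="label",
            model_type="classifier",
            dataset_name="synthetic_multiclass",
            evaluator_config={
                # Raise the threshold so per-class ROC/PR curves are still logged for 12 classes.
                "max_classes_for_multiclass_roc_pr": 20,
            },
        )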