From 964f5ab75098c55f028f8acfeeae05df35ea68d5 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 10 Jan 2022 20:17:19 +0800 Subject: [PATCH] Evaluation Default evaluator (#5092) * init Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * rename module Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * revert black change Signed-off-by: Weichen Xu * change module path Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * lazy load pyspark Signed-off-by: Weichen Xu * revert export Signed-off-by: Weichen Xu * fix curcit import Signed-off-by: Weichen Xu * update tests Signed-off-by: Weichen Xu * fix conftest.py Signed-off-by: Weichen Xu * Revert "fix conftest.py" This reverts commit 2ea29c62bfffc5461bf77f3da15b5c00f51de19b. * fix tests Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * default evaluator Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * update import Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * update hash algo Signed-off-by: Weichen Xu * update import Signed-off-by: Weichen Xu * address comment Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * fix lint Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * add more tests Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * fix lint Signed-off-by: Weichen Xu * update shap explainer Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * remove scikitplot dep Signed-off-by: Weichen Xu * add pr curve Signed-off-by: Weichen Xu * add shap.summary_plot Signed-off-by: Weichen Xu * log explainer Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * improve explainer code Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update shap init Signed-off-by: Weichen Xu * update explainer creating Signed-off-by: Weichen Xu * update predict_proba Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * add multi-class metrics artifacts Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * add log_loss metric Signed-off-by: Weichen Xu * lazy load pyspark Signed-off-by: Weichen Xu 
* address ben comments Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * prevent show shap logo, add tests Signed-off-by: Weichen Xu * support spark model Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * add shap version check Signed-off-by: Weichen Xu * update docs, loose classifier label limit Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * multiclass classifier merge metrics/plots Signed-off-by: Weichen Xu * zfill feature name Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * add config max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * update tests Signed-off-by: Weichen Xu * improve label handling Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * black Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * increase plot dpi Signed-off-by: Weichen Xu * fix test fixture Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * use matplot rc_context Signed-off-by: Weichen Xu * fix shap import Signed-off-by: Weichen Xu * refactor EvaluationDataset Signed-off-by: Weichen Xu * limit user specify shap algos Signed-off-by: Weichen Xu * clean Signed-off-by: Weichen Xu * update evaluation dataset Signed-off-by: Weichen Xu * use svg fig Signed-off-by: Weichen Xu * revert svg Signed-off-by: Weichen Xu * curve dashline, legend display ap/roc, legend move out Signed-off-by: Weichen Xu * linewidth 1 Signed-off-by: Weichen Xu * keyword arguments for evaluate, fix tests Signed-off-by: Weichen Xu * mark abc.abstractmethod, kw args for ModelEvaluator methods Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 23 + mlflow/models/evaluation/base.py | 597 +++++++++++----- mlflow/models/evaluation/default_evaluator.py | 672 ++++++++++++++++++ .../models/evaluation/evaluator_registry.py | 3 + mlflow/models/evaluation/lift_curve.py | 165 +++++ mlflow/models/utils.py | 19 + mlflow/utils/string_utils.py | 10 + requirements/small-requirements.txt | 3 + tests/models/test_default_evaluator.py | 491 +++++++++++++ tests/models/test_evaluation.py | 386 +++++++--- .../mlflow_test_plugin/dummy_evaluator.py | 9 +- 11 files changed, 2091 insertions(+), 287 deletions(-) create mode 100644 mlflow/models/evaluation/artifacts.py create mode 100644 mlflow/models/evaluation/default_evaluator.py create mode 100644 mlflow/models/evaluation/lift_curve.py create mode 100644 tests/models/test_default_evaluator.py diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py new file mode 100644 index 0000000000000..6343d6e15aa48 --- /dev/null +++ b/mlflow/models/evaluation/artifacts.py @@ -0,0 +1,23 @@ +import pandas as pd + +from mlflow.models.evaluation.base import EvaluationArtifact + + +class ImageEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.save(output_artifact_path) + + def _load_content_from_file(self, local_artifact_path): + from PIL.Image import open as open_image + + self._content = open_image(local_artifact_path) + return self._content + + +class CsvEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): + self._content = pd.read_csv(local_artifact_path) 
+ return self._content diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1c88ecd821c89..074da7537e7d6 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -10,8 +10,13 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string +from mlflow.utils.annotations import experimental import logging import struct +import sys +import math +from collections import OrderedDict +from abc import ABCMeta, abstractmethod _logger = logging.getLogger(__name__) @@ -25,7 +30,7 @@ class EvaluationMetrics(dict): pass -class EvaluationArtifact: +class EvaluationArtifact(metaclass=ABCMeta): """ A model evaluation artifact containing an artifact uri and content. """ @@ -34,12 +39,13 @@ def __init__(self, uri, content=None): self._uri = uri self._content = content + @abstractmethod def _load_content_from_file(self, local_artifact_path): """ Abstract interface to load the content from local artifact file path, and return the loaded content. """ - raise NotImplementedError() + pass def load(self, local_artifact_path=None): """ @@ -57,9 +63,10 @@ def load(self, local_artifact_path=None): self._content = self._load_content_from_file(local_artifact_file) return self._content + @abstractmethod def save(self, output_artifact_path): """Save artifact content into specified path.""" - raise NotImplementedError() + pass @property def content(self): @@ -151,6 +158,78 @@ def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: _cached_mlflow_client = None +def _hash_uint64_ndarray_as_bytes(array): + assert len(array.shape) == 1 + # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings + return struct.pack(f">{array.size}Q", *array) + + +def _hash_ndarray_as_bytes(nd_array): + from pandas.util import hash_array + import numpy as np + + return _hash_uint64_ndarray_as_bytes( + hash_array(nd_array.flatten(order="C")) + ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) + + +def _hash_array_like_obj_as_bytes(data): + """ + Helper method to convert pandas dataframe/numpy array/list into bytes for + MD5 calculation purpose. + """ + from pandas.util import hash_pandas_object + import numpy as np + import pandas as pd + + if isinstance(data, pd.DataFrame): + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. 
+ if "pyspark" in sys.modules: + from pyspark.ml.linalg import Vector as spark_vector_type + else: + spark_vector_type = None + + def _hash_array_like_element_as_bytes(v): + if spark_vector_type is not None: + if isinstance(v, spark_vector_type): + return _hash_ndarray_as_bytes(v.toArray()) + if isinstance(v, np.ndarray): + return _hash_ndarray_as_bytes(v) + if isinstance(v, list): + return _hash_ndarray_as_bytes(np.array(v)) + return v + + data = data.applymap(_hash_array_like_element_as_bytes) + return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data)) + elif isinstance(data, np.ndarray): + return _hash_ndarray_as_bytes(data) + elif isinstance(data, list): + return _hash_ndarray_as_bytes(np.array(data)) + else: + raise ValueError("Unsupported data type.") + + +def _gen_md5_for_arraylike_obj(md5_gen, data): + """ + Helper method to generate MD5 hash array-like object, the MD5 will calculate over: + - array length + - first NUM_SAMPLE_ROWS_FOR_HASH rows content + - last NUM_SAMPLE_ROWS_FOR_HASH rows content + """ + import numpy as np + + len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) + md5_gen.update(len_bytes) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(_hash_array_like_obj_as_bytes(data)) + else: + head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) + md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) + + class EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the @@ -161,13 +240,15 @@ class EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__(self, data, labels, name=None, path=None): + def __init__(self, data, labels, name=None, path=None, feature_names=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. - A Pandas DataFrame, or a spark DataFrame, containing evaluation features and labels. All columns will be regarded as feature columns except the "labels" column. + Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must + be a spark DataFrame contains a feature column of "Vector" type, and a label column. :param labels: One of the following: - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. @@ -178,147 +259,131 @@ def __init__(self, data, labels, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) + + :param feature_names: (Optional) A list of the feature names attached to the numpy array + input data. The argument is only useful in the case the input data is numpy array. + For pandas DataFrame input case, the pandas column name will be used as feature name. + The feature names will be shown in model explainability plots. 
""" import numpy as np import pandas as pd - try: - from pyspark.sql import DataFrame as SparkDataFrame - - supported_dataframe_types = (pd.DataFrame, SparkDataFrame) - except ImportError: - supported_dataframe_types = (pd.DataFrame,) - if name is not None and '"' in name: raise ValueError(f'Dataset name cannot include a double quote (") but got {name}') if path is not None and '"' in path: raise ValueError(f'Dataset path cannot include a double quote (") but got {path}') + self._user_specified_name = name + self._path = path + self._hash = None + + try: + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. + if "pyspark" in sys.modules: + from pyspark.sql import DataFrame as SparkDataFrame + + supported_dataframe_types = (pd.DataFrame, SparkDataFrame) + spark_df_type = SparkDataFrame + else: + supported_dataframe_types = (pd.DataFrame,) + spark_df_type = None + except ImportError: + supported_dataframe_types = (pd.DataFrame,) + if isinstance(data, (np.ndarray, list)): if not isinstance(labels, (np.ndarray, list)): raise ValueError( "If data is a numpy array or list of evaluation features, " "labels must be a numpy array or list of evaluation labels" ) + if isinstance(data, list): + data = np.array(data) + + if len(data.shape) != 2: + raise ValueError( + "If the `data` argument is a numpy array, it must be a 2 dimension array " + "and second dimension represent the number of features. If the `data` " + "argument is a list, each of its element must be a feature array of " + "numpy array or list and all element must has the same length." + ) + + self._features_data = data + self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) + + if len(self._features_data) != len(self._labels_data): + raise ValueError( + "The input features example rows must be the same length with labels array." + ) + + num_features = data.shape[1] + + if feature_names is not None: + feature_names = list(feature_names) + if num_features != len(feature_names): + raise ValueError("feature name list must be the same length with feature data.") + self._feature_names = feature_names + else: + self._feature_names = [ + f"feature_{str(i + 1).zfill(math.ceil((math.log10(num_features + 1))))}" + for i in range(num_features) + ] elif isinstance(data, supported_dataframe_types): if not isinstance(labels, str): raise ValueError( "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " "string name of a column from `data` that contains evaluation labels" ) + if isinstance(data, spark_df_type): + _logger.warning( + f"Specified Spark DataFrame is too large for model evaluation. Only " + f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + "If you want evaluate on the whole spark dataframe, please manually call " + "`spark_dataframe.toPandas()`." + ) + data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + + self._features_data = data.drop(labels, axis=1, inplace=False) + self._labels_data = data[labels].to_numpy() + + if feature_names is not None: + raise ValueError( + "If `data` argument is pandas/spark dataframe, you cannot specify the " + "`feature_names` argument, instead, the column names of the input " + "dataframe will be used as the feature names." + ) + self._feature_names = list(self._features_data.columns) else: raise ValueError( "The data argument must be a numpy array, a list or a Pandas DataFrame, or " "spark DataFrame if pyspark package installed." 
) - self._user_specified_name = name - self._original_data = data - self._data = None - self.labels = labels - self.path = path - self._hash = None + # generate dataset hash + md5_gen = hashlib.md5() + _gen_md5_for_arraylike_obj(md5_gen, self._features_data) + _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) + md5_gen.update(",".join(self._feature_names).encode("UTF-8")) - @property - def data(self): - """ - Return original data if data is numpy array or pandas dataframe, - For spark dataframe, will only return the first SPARK_DATAFRAME_LIMIT rows as pandas - dataframe and emit warning. - """ - if self._data is not None: - return self._data - - try: - from pyspark.sql import DataFrame as SparkDataFrame - - spark_df_type = SparkDataFrame - except ImportError: - spark_df_type = None + self._hash = md5_gen.hexdigest() - if spark_df_type and isinstance(self._original_data, spark_df_type): - self._data = self._original_data.limit( - EvaluationDataset.SPARK_DATAFRAME_LIMIT - ).toPandas() - _logger.warning( - f"Specified Spark DataFrame is too large for model evaluation. Only " - f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." - ) - else: - self._data = self._original_data - - return self._data - - def _extract_features_and_labels(self): - """ - Extract features data and labels data. - For spark dataframe, will only extract the first SPARK_DATAFRAME_LIMIT rows data - and emit warning. - """ - import numpy as np - - if isinstance(self.data, np.ndarray): - return self.data, self.labels - else: - return ( - self.data.drop(self.labels, axis=1, inplace=False), - self.data[self.labels].to_numpy(), - ) - - @staticmethod - def _convert_uint64_ndarray_to_bytes(array): - assert len(array.shape) == 1 - # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings - return struct.pack(f">{array.size}Q", *array) + @property + def feature_names(self): + return self._feature_names - @staticmethod - def _array_like_obj_to_bytes(data): + @property + def features_data(self): """ - Helper method to convert pandas dataframe/numpy array/list into bytes for - MD5 calculation purpose. + return features data as a numpy array or a pandas DataFrame. 
""" - from pandas.util import hash_pandas_object, hash_array - import numpy as np - import pandas as pd + return self._features_data - if isinstance(data, pd.DataFrame): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) - elif isinstance(data, np.ndarray): - return EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_array(data.flatten(order="C")) - ) - elif isinstance(data, list): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_array(np.array(data))) - else: - raise ValueError("Unsupported data type.") - - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): + @property + def labels_data(self): """ - Helper method to generate MD5 hash array-like object, the MD5 will calculate over: - - array length - - first NUM_SAMPLE_ROWS_FOR_HASH rows content - - last NUM_SAMPLE_ROWS_FOR_HASH rows content + return labels data as a numpy array """ - import numpy as np - - len_bytes = EvaluationDataset._convert_uint64_ndarray_to_bytes( - np.array([len(data)], dtype="uint64") - ) - md5_gen.update(len_bytes) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) - else: - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - ) - ) - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] - ) - ) + return self._labels_data @property def name(self): @@ -329,25 +394,17 @@ def name(self): return self._user_specified_name if self._user_specified_name is not None else self.hash @property - def hash(self): + def path(self): """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function + Dataset path """ - import numpy as np - import pandas as pd + return self._path - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - column_names = ",".join(self.data.columns) - meta_str = f"columns={column_names}\nlabels={self.labels}" - md5_gen.update(meta_str.encode("UTF-8")) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - self._hash = md5_gen.hexdigest() + @property + def hash(self): + """ + Dataset hash, includes hash on first 20 rows and last 20 rows. + """ return self._hash @property @@ -363,7 +420,7 @@ def _metadata(self): metadata["path"] = self.path return metadata - def _log_dataset_tag(self, client, run_id): + def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will append current dataset metadata into existing tag content. 
@@ -374,10 +431,14 @@ def _log_dataset_tag(self, client, run_id): dataset_metadata_list = json.loads(existing_dataset_metadata_str) for metadata in dataset_metadata_list: - if metadata["hash"] == self.hash and metadata["name"] == self._user_specified_name: + if ( + metadata["hash"] == self.hash + and metadata["name"] == self.name + and metadata["model"] == model_uuid + ): break else: - dataset_metadata_list.append(self._metadata) + dataset_metadata_list.append({**self._metadata, "model": model_uuid}) dataset_metadata_str = json.dumps(dataset_metadata_list, separators=(",", ":")) client.log_batch( @@ -386,8 +447,9 @@ def _log_dataset_tag(self, client, run_id): ) -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: +class ModelEvaluator(metaclass=ABCMeta): + @abstractmethod + def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). @@ -401,15 +463,8 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() - def evaluate( - self, - model, - model_type, - dataset, - run_id, - evaluator_config, - **kwargs, - ): + @abstractmethod + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): """ The abstract API to log metrics and artifacts, and return evaluation results. @@ -456,12 +511,149 @@ def _start_run_or_reuse_active_run(run_id): else: if run_id and active_run.info.run_id != run_id: raise ValueError( - "An active run exists, you cannot specify another run_id when " "evaluating." + "An active run exists, you cannot specify another run_id when evaluating." ) yield active_run.info.run_id +def _normalize_evaluators_and_evaluator_config_args( + evaluators, + evaluator_config, +): + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map): + return isinstance(_evaluator_name_to_conf_map, dict) and all( + k in _evaluator_name_list and isinstance(v, dict) + for k, v in _evaluator_name_to_conf_map.items() + ) + + if evaluators is None: + evaluator_name_list = list(_model_evaluation_registry._registry.keys()) + if len(evaluator_name_list) > 1: + _logger.warning( + f"Multiple registered evaluators are found {evaluator_name_list} and " + "they will all be used in evaluation if they support the specified model type. " + "If you want to evaluate with one evaluator, specify the `evaluator` argument " + "and optionally specify the `evaluator_config` argument." + ) + if evaluator_config is not None: + conf_dict_value_error = ValueError( + "If `evaluators` argument is None, all available evaluators will be used. " + "If only the default evaluator is available, the `evaluator_config` argument is " + "interpreted as the config dictionary for the default evaluator. Otherwise, the " + "`evaluator_config` argument must be a dictionary mapping each evaluator's name " + "to its own evaluator config dictionary." 
+            )
+            if evaluator_name_list == ["default"]:
+                if not isinstance(evaluator_config, dict):
+                    raise conf_dict_value_error
+                elif "default" not in evaluator_config:
+                    evaluator_name_to_conf_map = {"default": evaluator_config}
+                else:
+                    evaluator_name_to_conf_map = evaluator_config
+            else:
+                if not check_nesting_config_dict(evaluator_name_list, evaluator_config):
+                    raise conf_dict_value_error
+                evaluator_name_to_conf_map = evaluator_config
+        else:
+            evaluator_name_to_conf_map = {}
+    elif isinstance(evaluators, str):
+        if not (evaluator_config is None or isinstance(evaluator_config, dict)):
+            raise ValueError(
+                "If `evaluators` argument is the name of an evaluator, evaluator_config must be "
+                "None or a dict containing config items for the evaluator."
+            )
+        evaluator_name_list = [evaluators]
+        evaluator_name_to_conf_map = {evaluators: evaluator_config}
+    elif isinstance(evaluators, list):
+        if evaluator_config is not None:
+            if not check_nesting_config_dict(evaluators, evaluator_config):
+                raise ValueError(
+                    "If `evaluators` argument is an evaluator name list, evaluator_config "
+                    "must be a dict containing a mapping from evaluator name to individual "
+                    "evaluator config dict."
+                )
+        # Use `OrderedDict.fromkeys` to deduplicate elements but preserve their order.
+        evaluator_name_list = list(OrderedDict.fromkeys(evaluators))
+        evaluator_name_to_conf_map = evaluator_config or {}
+    else:
+        raise ValueError(
+            "`evaluators` argument must be None, an evaluator name string, or a list of "
+            "evaluator names."
+        )
+
+    return evaluator_name_list, evaluator_name_to_conf_map
+
+
+_last_failed_evaluator = None
+
+
+def _get_last_failed_evaluator():
+    """
+    Return the evaluator name of the last failed evaluator when calling `evaluate`.
+    This can be used to check which evaluator failed when the `evaluate` API fails.
+    """
+    return _last_failed_evaluator
+
+
+def _evaluate(
+    *, model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map
+):
+    """
+    The public API "evaluate" verifies the arguments first, and then passes the normalized
+    arguments to this _evaluate method.
+    """
+    # import _model_evaluation_registry inside the function to avoid circular imports
+    from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
+
+    global _last_failed_evaluator
+    _last_failed_evaluator = None
+
+    client = mlflow.tracking.MlflowClient()
+    model_uuid = model.metadata.model_uuid
+    dataset._log_dataset_tag(client, actual_run_id, model_uuid)
+
+    eval_results = []
+    for evaluator_name in evaluator_name_list:
+        config = evaluator_name_to_conf_map.get(evaluator_name) or {}
+        try:
+            evaluator = _model_evaluation_registry.get_evaluator(evaluator_name)
+        except MlflowException:
+            _logger.warning(f"Evaluator '{evaluator_name}' is not registered.")
+            continue
+
+        _last_failed_evaluator = evaluator_name
+        if evaluator.can_evaluate(model_type=model_type, evaluator_config=config):
+            _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.")
+            result = evaluator.evaluate(
+                model=model,
+                model_type=model_type,
+                dataset=dataset,
+                run_id=actual_run_id,
+                evaluator_config=config,
+            )
+            eval_results.append(result)
+
+    _last_failed_evaluator = None
+
+    if len(eval_results) == 0:
+        raise ValueError(
+            "The model could not be evaluated by any of the registered evaluators, please "
+            "verify that the model type and other configs are set correctly."
+        )
+
+    merged_eval_result = EvaluationResult(EvaluationMetrics(), dict())
+    for eval_result in eval_results:
+        merged_eval_result.metrics.update(eval_result.metrics)
+        merged_eval_result.artifacts.update(eval_result.artifacts)
+
+    return merged_eval_result
+
+
+@experimental
 def evaluate(
+    *,
     model: Union[str, "mlflow.pyfunc.PyFuncModel"],
     model_type: str,
     dataset: "mlflow.models.evaluation.EvaluationDataset",
@@ -496,33 +688,72 @@ def evaluate(
                              a nested dictionary whose key is the evaluator name.
     :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing
              evaluation results.
+
+    The default evaluator supports the 'regressor' and 'classifier' model types.
+
+    For both the 'regressor' and 'classifier' types, the default evaluator will generate model
+    summary plots and feature importance plots using the shap explainer.
+
+    For regressor models, the default evaluator will additionally log:
+
+    - **metrics**: example_count, mean_absolute_error, mean_squared_error, root_mean_squared_error,
+      sum_on_label, mean_on_label, r2_score, max_error, mean_absolute_percentage_error.
+
+    For binary classifiers, the default evaluator will additionally log:
+
+    - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall,
+      precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc.
+    - **artifacts**: lift curve plot, precision-recall plot, ROC plot.
+
+    For multiclass classifiers, the default evaluator will additionally log:
+
+    - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss.
+    - **artifacts**: A CSV file for "per_class_metrics" (the per-class metrics include
+      true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc/
+      precision_recall_auc), a merged precision-recall curves plot, and a merged ROC curves plot.
+
+    The available `evaluator_config` options for the default evaluator include:
+
+    - **log_model_explainability**: A boolean value specifying whether or not to log model
+      explainability insights. Default value is True.
+    - **explainability_algorithm**: A string specifying the SHAP Explainer algorithm to use for
+      model explainability. Supported algorithms include: 'exact', 'permutation', 'partition'.
+      If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best
+      Explainer based on the model.
+    - **explainability_nsamples**: The number of sample rows to use for computing model
+      explainability insights. Default value is 2000.
+    - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**:
+      For multiclass classifiers, the maximum number of classes for which per-class ROC curves
+      and Precision-Recall curves are logged. Default value is 10.
+
+    Limitations of the evaluation dataset:
+    - If the input dataset is a pandas DataFrame, the feature columns must be scalar value
+      columns; other object types (np.ndarray/list/etc.) are not supported yet.
+    - If the mlflow model to be evaluated is a pyspark ML model, then the input data must
+      be a spark DataFrame or a pandas DataFrame containing a feature column with values of type
+      "pyspark.ml.linalg.Vector", and a label column.
+    - For classifiers, the evaluation dataset labels must contain all distinct values (the
+      label data is used to infer the number of classes). For binary classifiers, the
+      negative label value must be 0, -1, or False, and the positive label value must be
+      1 or True.
+      For multiclass classifiers, if logging explainability insights is enabled, the label
+      values must be numeric.
+
+    Limitations of metrics/artifacts computation:
+    - For classifiers, some metric and plot computations require the model to provide a
+      "predict probability" function. Currently, for sklearn models, the "predict_proba" method
+      is extracted from the raw model to achieve this; for other models, metrics/artifacts that
+      require probability predictions are skipped.
+
+    Limitations of the default evaluator logging model explainability insights:
+    - The `shap.Explainer` "auto" algorithm chooses the Linear explainer for linear models and
+      the Tree explainer for tree models. However, the shap Linear/Tree explainers do not
+      support multi-class classifiers; in this case, the default evaluator falls back to the
+      shap Exact or Permutation explainer.
+    - Logging model explainability insights is not currently supported for PySpark models.
     """
-    # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing
-    from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
     from mlflow.pyfunc import PyFuncModel

-    if not evaluators:
-        evaluators = list(_model_evaluation_registry._registry.keys())
-
-    if isinstance(evaluators, str):
-        evaluators = [evaluators]
-        if not (evaluator_config is None or isinstance(evaluator_config, dict)):
-            raise ValueError(
-                "If `evaluators` argument is a str, evaluator_config must be None or a dict."
-            )
-        evaluator_config = {evaluators[0]: evaluator_config}
-    elif isinstance(evaluators, list):
-        evaluators = set(evaluators)
-        if not (
-            isinstance(evaluator_config, dict)
-            and all(k in evaluators and isinstance(v, dict) for k, v in evaluator_config.items())
-        ):
-            raise ValueError(
-                "If `evaluators` argument is a evaluator name list, evaluator_config"
-                "must be a dict contains mapping from evaluator name to individual "
-                "evaluator config dict."
-            )
-
     if isinstance(model, str):
         model = mlflow.pyfunc.load_model(model)
     elif isinstance(model, PyFuncModel):
@@ -533,27 +764,17 @@ def evaluate(
             "an instance of `mlflow.pyfunc.PyFuncModel`."
) + ( + evaluator_name_list, + evaluator_name_to_conf_map, + ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) + with _start_run_or_reuse_active_run(run_id) as actual_run_id: - client = mlflow.tracking.MlflowClient() - dataset._log_dataset_tag(client, actual_run_id) - - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config.get(evaluator_name) or {} - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") - continue - - if evaluator.can_evaluate(model_type, config): - _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") - result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result + return _evaluate( + model=model, + model_type=model_type, + dataset=dataset, + actual_run_id=actual_run_id, + evaluator_name_list=evaluator_name_list, + evaluator_name_to_conf_map=evaluator_name_to_conf_map, + ) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py new file mode 100644 index 0000000000000..6ee604aed8988 --- /dev/null +++ b/mlflow/models/evaluation/default_evaluator.py @@ -0,0 +1,672 @@ +import mlflow +from mlflow.models.evaluation.base import ( + ModelEvaluator, + EvaluationMetrics, + EvaluationResult, +) +from mlflow.entities.metric import Metric +from mlflow.utils.file_utils import TempDir +from mlflow.utils.string_utils import truncate_str_from_middle +from mlflow.models.utils import plot_lines +from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact, CsvEvaluationArtifact + +from sklearn import metrics as sk_metrics +import math +from collections import namedtuple +import numbers +import pandas as pd +import numpy as np +import time +from functools import partial +import logging +from packaging.version import Version + +_logger = logging.getLogger(__name__) + +_DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 + + +def _infer_model_type_by_labels(labels): + distinct_labels = set(labels) + for v in distinct_labels: + if not isinstance(v, numbers.Number): + return "classifier" + if not float(v).is_integer(): + return "regressor" + if len(distinct_labels) > 1000 and len(distinct_labels) / len(labels) > 0.7: + return "regressor" + return "classifier" + + +def _extract_raw_model_and_predict_fn(model): + model_loader_module = model.metadata.flavors["python_function"]["loader_module"] + predict_fn = model.predict + predict_proba_fn = None + + try: + if model_loader_module == "mlflow.sklearn": + raw_model = model._model_impl + else: + raw_model = None + except Exception as e: + raw_model = None + _logger.warning( + f"Raw model resolution fails unexpectedly on PyFuncModel {model!r}, " + f"error message is {e}" + ) + + if raw_model: + predict_fn = raw_model.predict + predict_proba_fn = getattr(raw_model, "predict_proba", None) + + try: + import xgboost + + if isinstance(raw_model, xgboost.XGBModel): + # Because shap evaluation will pass evaluation data in ndarray format + # (without feature names), if set validate_features=True it will raise error. 
+ predict_fn = partial(predict_fn, validate_features=False) + if predict_proba_fn is not None: + predict_proba_fn = partial(predict_proba_fn, validate_features=False) + except ImportError: + pass + + return model_loader_module, raw_model, predict_fn, predict_proba_fn + + +def _gen_log_key(key, dataset_name): + return f"{key}_on_data_{dataset_name}" + + +def _get_regressor_metrics(y, y_pred): + return { + "example_count": len(y), + "mean_absolute_error": sk_metrics.mean_absolute_error(y, y_pred), + "mean_squared_error": sk_metrics.mean_squared_error(y, y_pred), + "root_mean_squared_error": math.sqrt(sk_metrics.mean_squared_error(y, y_pred)), + "sum_on_label": sum(y), + "mean_on_label": sum(y) / len(y), + "r2_score": sk_metrics.r2_score(y, y_pred), + "max_error": sk_metrics.max_error(y, y_pred), + "mean_absolute_percentage_error": sk_metrics.mean_absolute_percentage_error(y, y_pred), + } + + +def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, y_pred, y_probs): + y = np.array(y) + y_bin = np.where(y == positive_class, 1, 0) + y_pred_bin = None + y_prob_bin = None + if y_pred is not None: + y_pred = np.array(y_pred) + y_pred_bin = np.where(y_pred == positive_class, 1, 0) + + if y_probs is not None: + y_probs = np.array(y_probs) + y_prob_bin = y_probs[:, positive_class_index] + + return y_bin, y_pred_bin, y_prob_bin + + +def _get_classifier_per_class_metrics(y, y_pred): + """ + get classifier metrics which computing over a specific class. + For binary classifier, y/y_pred is for the positive class. + For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". + """ + metrics = {} + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + return metrics + + +def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): + """ + get classifier metrics which computing over all classes examples. + """ + metrics = {} + metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) + metrics["example_count"] = len(y) + + if not is_binomial: + metrics["f1_score_micro"] = sk_metrics.f1_score(y, y_pred, average="micro", labels=labels) + metrics["f1_score_macro"] = sk_metrics.f1_score(y, y_pred, average="macro", labels=labels) + + if y_probs is not None: + metrics["log_loss"] = sk_metrics.log_loss(y, y_probs, labels=labels) + + return metrics + + +def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): + per_class_metrics_list = [] + for positive_class_index, positive_class in enumerate(labels): + (y_bin, y_pred_bin, _,) = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, y_pred, None + ) + + per_class_metrics = {"positive_class": positive_class} + per_class_metrics.update(_get_classifier_per_class_metrics(y_bin, y_pred_bin)) + per_class_metrics_list.append(per_class_metrics) + + return pd.DataFrame(per_class_metrics_list) + + +_Curve = namedtuple("_Curve", ["plot_fn", "plot_fn_args", "auc"]) + + +def _gen_classifier_curve( + is_binomial, + y, + y_probs, + labels, + curve_type, +): + """ + Generate precision-recall curve or ROC curve for classifier. 
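+
+    For a multiclass classifier, each class is reduced to a one-vs-rest binary problem
+    (via `_get_binary_sum_up_label_pred_prob`) and one curve line is generated per class.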
+ :param is_binomial: True if it is binary classifier otherwise False + :param y: True label values + :param y_probs: if binary classifer, the predicted probability for positive class. + if multiclass classiifer, the predicted probabilities for all classes. + :param labels: The set of labels. + :param curve_type: "pr" or "roc" + :return: An instance of "_Curve" which includes attributes "plot_fn", "plot_fn_args", "auc". + """ + if curve_type == "roc": + + def gen_line_x_y_label_fn(_y, _y_prob): + fpr, tpr, _ = sk_metrics.roc_curve(_y, _y_prob) + auc = sk_metrics.auc(fpr, tpr) + return fpr, tpr, f"AUC={auc:.3f}" + + xlabel = "False Positive Rate" + ylabel = "True Positive Rate" + elif curve_type == "pr": + + def gen_line_x_y_label_fn(_y, _y_prob): + precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) + ap = np.mean(precision) + return recall, precision, f"AP={ap:.3f}" + + xlabel = "recall" + ylabel = "precision" + else: + assert False, "illegal curve type" + + if is_binomial: + x_data, y_data, line_label = gen_line_x_y_label_fn(y, y_probs) + data_series = [(line_label, x_data, y_data)] + auc = sk_metrics.auc(x_data, y_data) + else: + curve_list = [] + for positive_class_index, positive_class in enumerate(labels): + y_bin, _, y_prob_bin = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, None, y_probs + ) + + x_data, y_data, line_label = gen_line_x_y_label_fn(y_bin, y_prob_bin) + curve_list.append((positive_class, x_data, y_data, line_label)) + + data_series = [ + (f"label={positive_class},{line_label}", x_data, y_data) + for positive_class, x_data, y_data, line_label in curve_list + ] + auc = [sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list] + + def _do_plot(**kwargs): + import matplotlib.pyplot as pyplot + + _, ax = plot_lines(**kwargs) + dash_line_args = { + "color": "gray", + "alpha": 0.3, + "drawstyle": "default", + "linestyle": "dashed", + } + if curve_type == "pr": + ax.plot([0, 1], [1, 0], **dash_line_args) + elif curve_type == "roc": + ax.plot([0, 1], [0, 1], **dash_line_args) + + if is_binomial: + ax.legend(loc="best") + else: + ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + pyplot.subplots_adjust(right=0.6, bottom=0.25) + + return _Curve( + plot_fn=_do_plot, + plot_fn_args={ + "data_series": data_series, + "xlabel": xlabel, + "ylabel": ylabel, + "line_kwargs": {"drawstyle": "steps-post", "linewidth": 1}, + }, + auc=auc, + ) + + +_matplotlib_config = { + "figure.dpi": 288, + "figure.figsize": [6.0, 4.0], +} + + +# pylint: disable=attribute-defined-outside-init +class DefaultEvaluator(ModelEvaluator): + # pylint: disable=unused-argument + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): + return model_type in ["classifier", "regressor"] + + def _log_metrics(self): + """ + Helper method to log metrics into specified run. 
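+
+        Each metric key is namespaced per evaluation dataset via `_gen_log_key`, e.g. an
+        "accuracy" value computed on a dataset named "iris_dataset" is logged under the key
+        "accuracy_on_data_iris_dataset".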
+ """ + timestamp = int(time.time() * 1000) + self.client.log_batch( + self.run_id, + metrics=[ + Metric( + key=_gen_log_key(key, self.dataset_name), + value=value, + timestamp=timestamp, + step=0, + ) + for key, value in self.metrics.items() + ], + ) + + def _log_image_artifact( + self, + do_plot, + artifact_name, + ): + import matplotlib.pyplot as pyplot + + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png" + artifact_file_local_path = self.temp_dir.path(artifact_file_name) + + try: + pyplot.clf() + do_plot() + pyplot.savefig(artifact_file_local_path) + finally: + pyplot.close(pyplot.gcf()) + + mlflow.log_artifact(artifact_file_local_path) + artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) + artifact.load(artifact_file_local_path) + self.artifacts[artifact_name] = artifact + + def _log_pandas_df_artifact(self, pandas_df, artifact_name): + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".csv" + artifact_file_local_path = self.temp_dir.path(artifact_file_name) + pandas_df.to_csv(artifact_file_local_path, index=False) + mlflow.log_artifact(artifact_file_local_path) + artifact = CsvEvaluationArtifact( + uri=mlflow.get_artifact_uri(artifact_file_name), + content=pandas_df, + ) + artifact.load(artifact_file_local_path) + self.artifacts[artifact_name] = artifact + + def _log_model_explainability(self): + if not self.evaluator_config.get("log_model_explainability", True): + return + + if self.model_loader_module == "mlflow.spark": + # TODO: Shap explainer need to manipulate on each feature values, + # but spark model input dataframe contains Vector type feature column + # which shap explainer does not support. + # To support this, we need expand the Vector type feature column into + # multiple scaler feature columns and pass it to shap explainer. + _logger.warning( + "Logging model explainability insights is not currently supported for PySpark " + "models." + ) + return + + if self.model_type == "classifier" and not all( + [isinstance(label, (numbers.Number, np.bool_)) for label in self.label_list] + ): + _logger.warning( + "Skip logging model explainability insights because it requires all label " + "values to be Number type." + ) + return + + try: + import shap + import matplotlib.pyplot as pyplot + except ImportError: + _logger.warning( + "SHAP or matplotlib package is not installed, so model explainability insights " + "will not be logged." + ) + return + + if Version(shap.__version__) < Version("0.40"): + _logger.warning( + "Shap package version is lower than 0.40, Skip log model explainability." 
+ ) + return + + is_multinomial_classifier = self.model_type == "classifier" and self.num_classes > 2 + + sample_rows = self.evaluator_config.get( + "explainability_nsamples", _DEFAULT_SAMPLE_ROWS_FOR_SHAP + ) + algorithm = self.evaluator_config.get("explainability_algorithm", None) + + truncated_feature_names = [truncate_str_from_middle(f, 20) for f in self.feature_names] + for i, truncated_name in enumerate(truncated_feature_names): + if truncated_name != self.feature_names[i]: + # For duplicated truncated name, attach "(f_{feature_index})" at the end + truncated_feature_names[i] = f"{truncated_name}(f_{i + 1})" + + truncated_feature_name_map = { + f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names) + } + + sampled_X = shap.sample(self.X, sample_rows) + + if isinstance(sampled_X, pd.DataFrame): + # For some shap explainer, the plot will use the DataFrame column names instead of + # using feature_names argument value. So rename the dataframe column names. + sampled_X = sampled_X.rename(columns=truncated_feature_name_map, copy=False) + + if algorithm: + supported_algos = ["exact", "permutation", "partition"] + if algorithm not in supported_algos: + raise ValueError( + f"Specified explainer algorithm {algorithm} is unsupported. Currently only " + f"support {','.join(supported_algos)} algorithms." + ) + explainer = shap.Explainer( + self.predict_fn, + sampled_X, + feature_names=truncated_feature_names, + algorithm=algorithm, + ) + else: + if self.raw_model and not is_multinomial_classifier: + # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for + # raw model, this case shap plot doesn't support it well, so exclude the + # multinomial_classifier case here. + explainer = shap.Explainer( + self.raw_model, sampled_X, feature_names=truncated_feature_names + ) + else: + # fallback to default explainer + explainer = shap.Explainer( + self.predict_fn, sampled_X, feature_names=truncated_feature_names + ) + + _logger.info(f"Shap explainer {explainer.__class__.__name__} is used.") + + shap_values = explainer(sampled_X) + + try: + mlflow.shap.log_explainer( + explainer, artifact_path=_gen_log_key("explainer", self.dataset_name) + ) + except Exception as e: + # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", + # then fallback to shap explainer saver, and shap explainer will call `model.save` + # for sklearn model, there is no `.save` method, so error will happen. + _logger.warning(f"Log explainer failed. 
Reason: {str(e)}") + + def plot_beeswarm(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.plots.beeswarm(shap_values, show=False) + + self._log_image_artifact( + plot_beeswarm, + "shap_beeswarm_plot", + ) + + def plot_summary(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.summary_plot(shap_values, show=False) + + self._log_image_artifact( + plot_summary, + "shap_summary_plot", + ) + + def plot_feature_importance(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.plots.bar(shap_values, show=False) + + self._log_image_artifact( + plot_feature_importance, + "shap_feature_importance_plot", + ) + + def _log_binary_classifier(self): + self.metrics.update(_get_classifier_per_class_metrics(self.y, self.y_pred)) + + if self.y_probs is not None: + roc_curve = _gen_classifier_curve( + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="roc", + ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + self.metrics["roc_auc"] = roc_curve.auc + + pr_curve = _gen_classifier_curve( + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="pr", + ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") + self.metrics["precision_recall_auc"] = pr_curve.auc + + def _log_multiclass_classifier(self): + per_class_metrics_collection_df = _get_classifier_per_class_metrics_collection_df( + self.y, self.y_pred, self.label_list + ) + + log_roc_pr_curve = False + if self.y_probs is not None: + max_num_classes_for_logging_curve = self.evaluator_config.get( + "max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier", 10 + ) + if self.num_classes <= max_num_classes_for_logging_curve: + log_roc_pr_curve = True + else: + _logger.warning( + f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip " + f"logging ROC curve and Precision-Recall curve. You can add evaluator config " + f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' " + f"to increase the threshold." 
+ ) + + if log_roc_pr_curve: + roc_curve = _gen_classifier_curve( + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="roc", + ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + per_class_metrics_collection_df["roc_auc"] = roc_curve.auc + + pr_curve = _gen_classifier_curve( + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="pr", + ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") + per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc + + self._log_pandas_df_artifact(per_class_metrics_collection_df, "per_class_metrics") + + def _evaluate_classifier(self): + from mlflow.models.evaluation.lift_curve import plot_lift_curve + + self.label_list = np.unique(self.y) + self.num_classes = len(self.label_list) + + self.y_pred = self.predict_fn(self.X) + self.is_binomial = self.num_classes <= 2 + + if self.is_binomial: + if list(self.label_list) not in [[0, 1], [-1, 1]]: + raise ValueError( + "Binary classifier evaluation dataset positive class label must be 1 or True, " + "negative class label must be 0 or -1 or False, and dataset must contains " + "both positive and negative examples." + ) + _logger.info( + "The evaluation dataset is inferred as binary dataset, positive label is " + f"{self.label_list[1]}, negative label is {self.label_list[0]}." + ) + else: + _logger.info( + "The evaluation dataset is inferred as multiclass dataset, number of classes " + f"is inferred as {self.num_classes}" + ) + + if self.predict_proba_fn is not None: + self.y_probs = self.predict_proba_fn(self.X) + if self.is_binomial: + self.y_prob = self.y_probs[:, 1] + else: + self.y_prob = None + else: + self.y_probs = None + self.y_prob = None + + self.metrics.update( + _get_classifier_global_metrics( + self.is_binomial, self.y, self.y_pred, self.y_probs, self.label_list + ) + ) + + if self.is_binomial: + self._log_binary_classifier() + else: + self._log_multiclass_classifier() + + if self.is_binomial and self.y_probs is not None: + self._log_image_artifact( + lambda: plot_lift_curve(self.y, self.y_probs), + "lift_curve_plot", + ) + + # normalize the confusion matrix, keep consistent with sklearn autologging. 
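+        # With normalize="true", each row of the matrix (i.e. each true class) sums to 1,
+        # so the plotted cells show per-class rates rather than raw counts.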
+ confusion_matrix = sk_metrics.confusion_matrix( + self.y, self.y_pred, labels=self.label_list, normalize="true" + ) + + def plot_confusion_matrix(): + sk_metrics.ConfusionMatrixDisplay( + confusion_matrix=confusion_matrix, + display_labels=self.label_list, + ).plot(cmap="Blues") + + if hasattr(sk_metrics, "ConfusionMatrixDisplay"): + self._log_image_artifact( + plot_confusion_matrix, + "confusion_matrix", + ) + + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) + + def _evaluate_regressor(self): + self.y_pred = self.model.predict(self.X) + self.metrics.update(_get_regressor_metrics(self.y, self.y_pred)) + + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) + + def evaluate( + self, + *, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config, + **kwargs, + ): + import matplotlib + + with TempDir() as temp_dir, matplotlib.rc_context(_matplotlib_config): + self.client = mlflow.tracking.MlflowClient() + + self.temp_dir = temp_dir + self.model = model + self.model_type = model_type + self.dataset = dataset + self.run_id = run_id + self.evaluator_config = evaluator_config + self.dataset_name = dataset.name + self.feature_names = dataset.feature_names + + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) + self.model_loader_module = model_loader_module + self.raw_model = raw_model + self.predict_fn = predict_fn + self.predict_proba_fn = predict_proba_fn + + self.X = dataset.features_data + self.y = dataset.labels_data + self.metrics = EvaluationMetrics() + self.artifacts = {} + + infered_model_type = _infer_model_type_by_labels(self.y) + + if model_type != infered_model_type: + _logger.warning( + f"According to the evaluation dataset label values, the model type looks like " + f"{infered_model_type}, but you specified model type {model_type}. Please " + f"verify that you set the `model_type` and `dataset` arguments correctly." + ) + + if model_type == "classifier": + return self._evaluate_classifier() + elif model_type == "regressor": + return self._evaluate_regressor() + else: + raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index 9e7f027f0496e..1822e313460ea 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -48,6 +48,9 @@ def get_evaluator(self, evaluator_name): def register_evaluators(module): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + + module._model_evaluation_registry.register("default", DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py new file mode 100644 index 0000000000000..cbcba712a9329 --- /dev/null +++ b/mlflow/models/evaluation/lift_curve.py @@ -0,0 +1,165 @@ +import matplotlib.pyplot as plt + +import numpy as np + + +def _cumulative_gain_curve(y_true, y_score, pos_label=None): + """ + This method is copied from scikit-plot package. + + This function generates the points necessary to plot the Cumulative Gain + + Note: This implementation is restricted to the binary classification task. + + Args: + y_true (array-like, shape (n_samples)): True labels of the data. 
+
+        y_score (array-like, shape (n_samples)): Target scores, can either be
+            probability estimates of the positive class, confidence values, or
+            non-thresholded measure of decisions (as returned by
+            decision_function on some classifiers).
+
+        pos_label (int or str, default=None): Label considered as positive and
+            others are considered negative
+
+    Returns:
+        percentages (numpy.ndarray): An array containing the X-axis values for
+            plotting the Cumulative Gains chart.
+
+        gains (numpy.ndarray): An array containing the Y-axis values for one
+            curve of the Cumulative Gains chart.
+
+    Raises:
+        ValueError: If `y_true` is not composed of 2 classes. The Cumulative
+            Gain Chart is only relevant in binary classification.
+    """
+    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+    # ensure binary classification if pos_label is not specified
+    classes = np.unique(y_true)
+    if pos_label is None and not (
+        np.array_equal(classes, [0, 1])
+        or np.array_equal(classes, [-1, 1])
+        or np.array_equal(classes, [0])
+        or np.array_equal(classes, [-1])
+        or np.array_equal(classes, [1])
+    ):
+        raise ValueError("Data is not binary and pos_label is not specified")
+    elif pos_label is None:
+        pos_label = 1.0
+
+    # make y_true a boolean vector
+    y_true = y_true == pos_label
+
+    sorted_indices = np.argsort(y_score)[::-1]
+    y_true = y_true[sorted_indices]
+    gains = np.cumsum(y_true)
+
+    percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+    gains = gains / float(np.sum(y_true))
+    percentages = percentages / float(len(y_true))
+
+    gains = np.insert(gains, 0, [0])
+    percentages = np.insert(percentages, 0, [0])
+
+    return percentages, gains
+
+
+def plot_lift_curve(
+    y_true,
+    y_probas,
+    title="Lift Curve",
+    ax=None,
+    figsize=None,
+    title_fontsize="large",
+    text_fontsize="medium",
+):
+    """
+    This method is copied from scikit-plot package.
+
+    Generates the Lift Curve from labels and scores/probabilities
+
+    The lift curve is used to determine the effectiveness of a
+    binary classifier. A detailed explanation can be found at
+    http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html.
+    The implementation here works only for binary classification.
+
+    Args:
+        y_true (array-like, shape (n_samples)):
+            Ground truth (correct) target values.
+
+        y_probas (array-like, shape (n_samples, n_classes)):
+            Prediction probabilities for each class returned by a classifier.
+
+        title (string, optional): Title of the generated plot. Defaults to
+            "Lift Curve".
+
+        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
+            plot the learning curve. If None, the plot is drawn on a new set of
+            axes.
+
+        figsize (2-tuple, optional): Tuple denoting figure size of the plot
+            e.g. (6, 6). Defaults to ``None``.
+
+        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "large".
+
+        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "medium".
+
+    Returns:
+        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
+            drawn.
+
+    Example:
+        >>> lr = LogisticRegression()
+        >>> lr = lr.fit(X_train, y_train)
+        >>> y_probas = lr.predict_proba(X_test)
+        >>> plot_lift_curve(y_test, y_probas)
+
+        >>> plt.show()
+
+        .. image:: _static/examples/plot_lift_curve.png
+           :align: center
+           :alt: Lift Curve
+    """
+    y_true = np.array(y_true)
+    y_probas = np.array(y_probas)
+
+    classes = np.unique(y_true)
+    if len(classes) != 2:
+        raise ValueError(
+            "Cannot calculate Lift Curve for data with " "{} category/ies".format(len(classes))
+        )
+
+    # Compute Cumulative Gain Curves
+    percentages, gains1 = _cumulative_gain_curve(y_true, y_probas[:, 0], classes[0])
+    percentages, gains2 = _cumulative_gain_curve(y_true, y_probas[:, 1], classes[1])
+
+    percentages = percentages[1:]
+    gains1 = gains1[1:]
+    gains2 = gains2[1:]
+
+    gains1 = gains1 / percentages
+    gains2 = gains2 / percentages
+
+    if ax is None:
+        _, ax = plt.subplots(1, 1, figsize=figsize)
+
+    ax.set_title(title, fontsize=title_fontsize)
+
+    ax.plot(percentages, gains1, lw=3, label="Class {}".format(classes[0]))
+    ax.plot(percentages, gains2, lw=3, label="Class {}".format(classes[1]))
+
+    ax.plot([0, 1], [1, 1], "k--", lw=2, label="Baseline")
+
+    ax.set_xlabel("Percentage of sample", fontsize=text_fontsize)
+    ax.set_ylabel("Lift", fontsize=text_fontsize)
+    ax.tick_params(labelsize=text_fontsize)
+    ax.grid("on")
+    ax.legend(loc="best", fontsize=text_fontsize)
+
+    return ax
diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py
index bb862c4c923fe..dc9d9c1f5bede 100644
--- a/mlflow/models/utils.py
+++ b/mlflow/models/utils.py
@@ -234,3 +234,22 @@ def _read_sparse_matrix_from_json(path, example_type):
         return csc_matrix((data, indices, indptr), shape=shape)
     else:
         return csr_matrix((data, indices, indptr), shape=shape)
+
+
+def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs=None):
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots()
+
+    if line_kwargs is None:
+        line_kwargs = {}
+
+    for label, data_x, data_y in data_series:
+        ax.plot(data_x, data_y, label=label, **line_kwargs)
+
+    if legend_loc:
+        ax.legend(loc=legend_loc)
+
+    ax.set(xlabel=xlabel, ylabel=ylabel)
+
+    return fig, ax
diff --git a/mlflow/utils/string_utils.py b/mlflow/utils/string_utils.py
index e3b1d0b911301..4cce65499b7cb 100644
--- a/mlflow/utils/string_utils.py
+++ b/mlflow/utils/string_utils.py
@@ -12,3 +12,13 @@ def strip_suffix(original, suffix):
 
 def is_string_type(item):
     return isinstance(item, str)
+
+
+def truncate_str_from_middle(s, max_length):
+    assert max_length > 5
+    if len(s) <= max_length:
+        return s
+    else:
+        left_part_len = (max_length - 3) // 2
+        right_part_len = max_length - 3 - left_part_len
+        return f"{s[:left_part_len]}...{s[-right_part_len:]}"
diff --git a/requirements/small-requirements.txt b/requirements/small-requirements.txt
index 855d6a3052473..18424b787c6fc 100644
--- a/requirements/small-requirements.txt
+++ b/requirements/small-requirements.txt
@@ -13,3 +13,6 @@ moto!=2.0.7
 azure-storage-blob>=12.0.0
 azure-identity>=1.6.1
 databricks-cli@git+https://github.com/databricks/databricks-cli.git
+pillow
+matplotlib
+shap>=0.40
diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py
new file mode 100644
index 0000000000000..872f599ecf717
--- /dev/null
+++ b/tests/models/test_default_evaluator.py
@@ -0,0 +1,491 @@
+import numpy as np
+import json
+
+
+from mlflow.models.evaluation import evaluate
+from mlflow.models.evaluation.default_evaluator import (
+    _get_classifier_global_metrics,
+    _infer_model_type_by_labels,
+    _extract_raw_model_and_predict_fn,
+    _get_regressor_metrics,
+    _get_binary_sum_up_label_pred_prob,
+    _get_classifier_per_class_metrics,
+    _gen_classifier_curve,
+)
+import mlflow
+from
sklearn.linear_model import LogisticRegression + +# pylint: disable=unused-import +from tests.models.test_evaluation import ( + get_run_data, + linear_regressor_model_uri, + diabetes_dataset, + multiclass_logistic_regressor_model_uri, + iris_dataset, + binary_logistic_regressor_model_uri, + breast_cancer_dataset, + spark_linear_regressor_model_uri, + diabetes_spark_dataset, + svm_model_uri, +) +from mlflow.models.utils import plot_lines + + +def assert_dict_equal(d1, d2, rtol): + for k in d1: + assert k in d2 + assert np.isclose(d1[k], d2[k], rtol=rtol) + + +def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=linear_regressor_model_uri, + model_type="regressor", + dataset=diabetes_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(linear_regressor_model_uri) + + y = diabetes_dataset.labels_data + y_pred = model.predict(diabetes_dataset.features_data) + + expected_metrics = _get_regressor_metrics(y, y_pred) + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_diabetes_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_beeswarm_plot_on_data_diabetes_dataset.png", + "shap_feature_importance_plot_on_data_diabetes_dataset.png", + "shap_summary_plot_on_data_diabetes_dataset.png", + } + assert result.artifacts.keys() == { + "shap_beeswarm_plot", + "shap_feature_importance_plot", + "shap_summary_plot", + } + + +def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=multiclass_logistic_regressor_model_uri, + model_type="classifier", + dataset=iris_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) + + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = iris_dataset.labels_data + y_pred = predict_fn(iris_dataset.features_data) + y_probs = predict_proba_fn(iris_dataset.features_data) + + expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], metrics[metric_key + "_on_data_iris_dataset"], rtol=1e-3 + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**iris_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_beeswarm_plot_on_data_iris_dataset.png", + "per_class_metrics_on_data_iris_dataset.csv", + "roc_curve_plot_on_data_iris_dataset.png", + "precision_recall_curve_plot_on_data_iris_dataset.png", + "shap_feature_importance_plot_on_data_iris_dataset.png", + "explainer_on_data_iris_dataset", + "confusion_matrix_on_data_iris_dataset.png", + "shap_summary_plot_on_data_iris_dataset.png", + } + assert result.artifacts.keys() == { + "per_class_metrics", + "roc_curve_plot", + "precision_recall_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def 
test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=binary_logistic_regressor_model_uri, + model_type="classifier", + dataset=breast_cancer_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels_data + y_pred = predict_fn(breast_cancer_dataset.features_data) + y_probs = predict_proba_fn(breast_cancer_dataset.features_data) + + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + "lift_curve_plot_on_data_breast_cancer_dataset.png", + "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "precision_recall_curve_plot_on_data_breast_cancer_dataset.png", + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + "roc_curve_plot_on_data_breast_cancer_dataset.png", + } + assert result.artifacts.keys() == { + "roc_curve_plot", + "precision_recall_curve_plot", + "lift_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=spark_linear_regressor_model_uri, + model_type="regressor", + dataset=diabetes_spark_dataset, + evaluators="default", + evaluator_config={"log_model_explainability": True}, + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) + + X = diabetes_spark_dataset.features_data + y = diabetes_spark_dataset.labels_data + y_pred = model.predict(X) + + expected_metrics = _get_regressor_metrics(y, y_pred) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_diabetes_spark_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_spark_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == set() + assert result.artifacts == {} + + +def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=svm_model_uri, + model_type="classifier", + dataset=breast_cancer_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(svm_model_uri) + + _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels_data + y_pred = predict_fn(breast_cancer_dataset.features_data) + + expected_metrics = 
_get_classifier_global_metrics(True, y, y_pred, None, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + } + assert result.artifacts.keys() == { + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def test_infer_model_type_by_labels(): + assert _infer_model_type_by_labels(["a", "b"]) == "classifier" + assert _infer_model_type_by_labels([1, 2.5]) == "regressor" + assert _infer_model_type_by_labels(list(range(2000))) == "regressor" + assert _infer_model_type_by_labels([1, 2, 3]) == "classifier" + + +def test_extract_raw_model_and_predict_fn(binary_logistic_regressor_model_uri): + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) + assert model_loader_module == "mlflow.sklearn" + assert isinstance(raw_model, LogisticRegression) + assert predict_fn == raw_model.predict + assert predict_proba_fn == raw_model.predict_proba + + +def test_get_regressor_metrics(): + y = [1.1, 2.1, -3.5] + y_pred = [1.5, 2.0, -3.0] + + metrics = _get_regressor_metrics(y, y_pred) + expected_metrics = { + "example_count": 3, + "mean_absolute_error": 0.3333333333333333, + "mean_squared_error": 0.13999999999999999, + "root_mean_squared_error": 0.3741657386773941, + "sum_on_label": -0.2999999999999998, + "mean_on_label": -0.09999999999999994, + "r2_score": 0.976457399103139, + "max_error": 0.5, + "mean_absolute_percentage_error": 0.18470418470418468, + } + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_get_binary_sum_up_label_pred_prob(): + y = [0, 1, 2] + y_pred = [0, 2, 1] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35]] + + results = [] + for idx, label in enumerate([0, 1, 2]): + y_bin, y_pred_bin, y_prob_bin = _get_binary_sum_up_label_pred_prob( + idx, label, y, y_pred, y_probs + ) + results.append((list(y_bin), list(y_pred_bin), list(y_prob_bin))) + + print(results) + assert results == [ + ([1, 0, 0], [1, 0, 0], [0.7, 0.2, 0.25]), + ([0, 1, 0], [0, 0, 1], [0.1, 0.3, 0.4]), + ([0, 0, 1], [0, 1, 0], [0.2, 0.5, 0.35]), + ] + + +def test_get_classifier_per_class_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + + expected_metrics = { + "true_negatives": 3, + "false_positives": 2, + "false_negatives": 1, + "true_positives": 4, + "recall": 0.8, + "precision": 0.6666666666666666, + "f1_score": 0.7272727272727272, + } + metrics = _get_classifier_per_class_metrics(y, y_pred) + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_multiclass_get_classifier_global_metrics(): + y = [0, 1, 2, 1, 2] + y_pred = [0, 2, 1, 1, 0] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + metrics = _get_classifier_global_metrics( + is_binomial=False, y=y, y_pred=y_pred, 
y_probs=y_probs, labels=[0, 1, 2] + ) + expected_metrics = { + "accuracy": 0.4, + "example_count": 5, + "f1_score_micro": 0.4, + "f1_score_macro": 0.38888888888888884, + "log_loss": 1.1658691395263094, + } + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_binary_get_classifier_global_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + y_probs = [[1 - p, p] for p in y_prob] + metrics = _get_classifier_global_metrics( + is_binomial=True, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1] + ) + expected_metrics = {"accuracy": 0.7, "example_count": 10, "log_loss": 0.6665822319387167} + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_gen_binary_precision_recall_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="pr" + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([1.0, 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.55555556, 0.5, 0.57142857, 0.66666667, 0.6, 0.5, 0.66666667, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + assert np.isclose(results.auc, 0.7088888888888889, rtol=1e-3) + + +def test_gen_binary_roc_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="roc" + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([0.0, 0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + assert np.isclose(results.auc, 0.66, rtol=1e-3) + + +def test_gen_multiclass_precision_recall_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="pr" + ) + expected_x_data_list = [[1.0, 0.0, 0.0], [1.0, 0.5, 0.0], [1.0, 0.5, 0.5, 0.5, 0.0, 0.0]] + expected_y_data_list = [ + [0.5, 0.0, 1.0], + [0.66666667, 0.5, 1.0], + [0.4, 0.25, 0.33333333, 0.5, 0.0, 1.0], + ] + line_labels = ["label=0,AP=0.500", "label=1,AP=0.722", "label=2,AP=0.414"] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == line_labels[index] + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + + expected_auc = [0.25, 0.6666666666666666, 0.2875] + assert np.allclose(results.auc, 
expected_auc, rtol=1e-3) + + +def test_gen_multiclass_roc_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="roc" + ) + print(results) + + expected_x_data_list = [ + [0.0, 0.25, 0.25, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0, 1.0], + ] + expected_y_data_list = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5, 1.0]] + line_labels = ["label=0,AUC=0.750", "label=1,AUC=0.750", "label=2,AUC=0.333"] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == line_labels[index] + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + + expected_auc = [0.75, 0.75, 0.3333] + assert np.allclose(results.auc, expected_auc, rtol=1e-3) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 6892e6d8227b7..0555ca82a9a47 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,4 +1,5 @@ import mlflow +from collections import namedtuple from mlflow.models.evaluation import ( evaluate, @@ -8,6 +9,9 @@ EvaluationArtifact, EvaluationMetrics, ) +from mlflow.models.evaluation.base import ( + _normalize_evaluators_and_evaluator_config_args as _normalize_config, +) import hashlib from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn @@ -21,6 +25,7 @@ from mlflow.utils.file_utils import TempDir from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry +from mlflow.models.evaluation.base import _logger as _base_logger, _gen_md5_for_arraylike_obj from sklearn.metrics import ( accuracy_score, @@ -30,19 +35,39 @@ ) from pyspark.sql import SparkSession +from pyspark.ml.linalg import Vectors +from pyspark.ml.regression import LinearRegression as SparkLinearRegression from mlflow.tracking.artifact_utils import get_artifact_uri import json +import uuid def get_iris(): iris = sklearn.datasets.load_iris() - return iris.data[:, :2], iris.target + return iris.data, iris.target def get_diabetes_dataset(): data = sklearn.datasets.load_diabetes() - return data.data[:, :2], data.target + return data.data, data.target + + +def get_diabetes_spark_dataset(): + data = sklearn.datasets.load_diabetes() + spark = SparkSession.builder.master("local[*]").getOrCreate() + rows = [ + (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target) + ] + return spark.createDataFrame(spark.sparkContext.parallelize(rows, 1), ["features", "label"]) + + +def get_breast_cancer_dataset(): + data = sklearn.datasets.load_breast_cancer() + return data.data, data.target + + +RunData = namedtuple("RunData", ["params", "metrics", "tags", "artifacts"]) def get_run_data(run_id): @@ -50,7 +75,7 @@ def get_run_data(run_id): data = client.get_run(run_id).data tags = {k: v for k, v in data.tags.items()} artifacts = [f.path for f in client.list_artifacts(run_id)] - return data.params, data.metrics, tags, artifacts + return 
RunData(params=data.params, metrics=data.metrics, tags=tags, artifacts=artifacts) def get_raw_tag(run_id, tag_name): @@ -71,51 +96,118 @@ def spark_session(): @pytest.fixture(scope="module") -def regressor_model_uri(): +def iris_dataset(): + X, y = get_iris() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") + + +@pytest.fixture(scope="module") +def diabetes_dataset(): + X, y = get_diabetes_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="diabetes_dataset") + + +@pytest.fixture(scope="module") +def diabetes_spark_dataset(): + spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) + return EvaluationDataset(data=spark_df, labels="label", name="diabetes_spark_dataset") + + +@pytest.fixture(scope="module") +def breast_cancer_dataset(): + X, y = get_breast_cancer_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="breast_cancer_dataset") + + +@pytest.fixture +def linear_regressor_model_uri(): X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(reg, "reg_model") - regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") + linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") - return regressor_model_uri + return linear_regressor_model_uri -@pytest.fixture(scope="module") -def classifier_model_uri(): +@pytest.fixture +def spark_linear_regressor_model_uri(): + spark_df = get_diabetes_spark_dataset() + reg = SparkLinearRegression() + spark_reg_model = reg.fit(spark_df) + + with mlflow.start_run() as run: + mlflow.spark.log_model(spark_reg_model, "spark_reg_model") + spark_linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model") + + return spark_linear_regressor_model_uri + + +@pytest.fixture +def multiclass_logistic_regressor_model_uri(): X, y = get_iris() - clf = sklearn.linear_model.LogisticRegression() + clf = sklearn.linear_model.LogisticRegression(max_iter=2) clf.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(clf, "clf_model") - classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") + multiclass_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, "clf_model") - return classifier_model_uri + return multiclass_logistic_regressor_model_uri -@pytest.fixture(scope="module") -def iris_dataset(): - X, y = get_iris() - eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") +@pytest.fixture +def binary_logistic_regressor_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.linear_model.LogisticRegression() + clf.fit(X, y) + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "bin_clf_model") + binary_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model") -@pytest.fixture(scope="module") + return binary_logistic_regressor_model_uri + + +@pytest.fixture +def svm_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.svm.LinearSVC() + clf.fit(X, y) + + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "svm_model") + svm_model_uri = get_artifact_uri(run.info.run_id, "svm_model") + + return svm_model_uri + + +@pytest.fixture def iris_pandas_df_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - data = pd.DataFrame({"f1": eval_X[:, 0], "f2": eval_X[:, 1], "y": eval_y}) + data = 
pd.DataFrame( + { + "f1": eval_X[:, 0], + "f2": eval_X[:, 1], + "f3": eval_X[:, 2], + "f4": eval_X[:, 3], + "y": eval_y, + } + ) labels = "y" return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model_uri, iris_dataset): - y_true = iris_dataset.labels - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) - y_pred = classifier_model.predict(iris_dataset.data) +def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): + y_true = iris_dataset.labels_data + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) + y_pred = classifier_model.predict(iris_dataset.features_data) expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { "accuracy_score": expected_accuracy_score, @@ -128,9 +220,9 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -182,10 +274,10 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): ) -def test_regressor_evaluate(regressor_model_uri, iris_dataset): - y_true = iris_dataset.labels - regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) - y_pred = regressor_model.predict(iris_dataset.data) +def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): + y_true = diabetes_dataset.labels_data + regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri) + y_pred = regressor_model.predict(diabetes_dataset.features_data) expected_mae = mean_absolute_error(y_true, y_pred) expected_mse = mean_squared_error(y_true, y_pred) expected_metrics = { @@ -193,16 +285,16 @@ def test_regressor_evaluate(regressor_model_uri, iris_dataset): "mean_squared_error": expected_mse, } expected_saved_metrics = { - "mean_absolute_error_on_iris_dataset": expected_mae, - "mean_squared_error_on_iris_dataset": expected_mse, + "mean_absolute_error_on_diabetes_dataset": expected_mae, + "mean_squared_error_on_diabetes_dataset": expected_mse, } - for model in [regressor_model, regressor_model_uri]: + for model in [regressor_model, linear_regressor_model_uri]: with mlflow.start_run() as run: eval_result = evaluate( - model, - "regressor", - iris_dataset, + model=model, + model_type="regressor", + dataset=diabetes_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -222,7 +314,7 @@ def test_dataset_name(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): md5_gen = hashlib.md5() - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data) + _gen_md5_for_arraylike_obj(md5_gen, data) return md5_gen.hexdigest() list0 = list(range(20)) @@ -236,73 +328,111 @@ def get_md5(data): assert get_md5(list3) == get_md5(list4) -def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): - assert iris_dataset.hash == "c7417e63a9ce038a32f37ecd7fb829f6" - assert iris_pandas_df_dataset.hash == "d06cfb6352dba29afe514d9be87021aa" +def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): + assert iris_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" + assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" + assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" + + +def test_datasset_with_pandas_dataframe(): + data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]}) + eval_dataset = 
EvaluationDataset(data=data, labels="label") + + assert list(eval_dataset.features_data.columns) == ["f1", "f2"] + assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) + assert np.array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4]) + assert np.array_equal(eval_dataset.labels_data, [0, 1]) -def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): - X1, y1 = iris_dataset._extract_features_and_labels() - assert np.array_equal(X1, iris_dataset.data) - assert np.array_equal(y1, iris_dataset.labels) +def test_datasset_with_array_data(): + features = [[1, 2], [3, 4]] + labels = [0, 1] - X2, y2 = iris_pandas_df_dataset._extract_features_and_labels() - assert list(X2.columns) == ["f1", "f2"] - assert np.array_equal(X2["f1"], X1[:, 0]) - assert np.array_equal(X2["f2"], X1[:, 1]) - assert np.array_equal(y2, y1) + for input_data in [features, np.array(features)]: + eval_dataset1 = EvaluationDataset(data=input_data, labels=labels) + assert np.array_equal(eval_dataset1.features_data, features) + assert np.array_equal(eval_dataset1.labels_data, labels) + assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] + + with pytest.raises(ValueError, match="all element must has the same length"): + EvaluationDataset(data=[[1, 2], [3, 4, 5]], labels=labels) + + +def test_autogen_feature_names(): + labels = [0] + eval_dataset2 = EvaluationDataset(data=[list(range(9))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] + + eval_dataset2 = EvaluationDataset(data=[list(range(10))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] + + eval_dataset2 = EvaluationDataset(data=[list(range(99))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] + + eval_dataset2 = EvaluationDataset(data=[list(range(100))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] + + with pytest.raises( + ValueError, match="features example rows must be the same length with labels array" + ): + EvaluationDataset(data=[[1, 2], [3, 4]], labels=[1, 2, 3]) def test_spark_df_dataset(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): dataset = EvaluationDataset(spark_df, "y") - assert list(dataset.data.columns) == ["f1", "f2", "y"] - assert list(dataset.data["f1"]) == [1.0] * 5 - assert list(dataset.data["f2"]) == [2.0] * 5 - assert list(dataset.data["y"]) == [3.0] * 5 + assert list(dataset.features_data.columns) == ["f1", "f2"] + assert list(dataset.features_data["f1"]) == [1.0] * 5 + assert list(dataset.features_data["f2"]) == [2.0] * 5 + assert list(dataset.labels_data) == [3.0] * 5 def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): + model_uuid = uuid.uuid4().hex with mlflow.start_run() as run: client = mlflow.tracking.MlflowClient() - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags["mlflow.datasets"]) == [iris_dataset._metadata] + + logged_meta1 = {**iris_dataset._metadata, "model": model_uuid} + logged_meta2 = {**iris_pandas_df_dataset._metadata, "model": model_uuid} + + assert json.loads(tags["mlflow.datasets"]) == [logged_meta1] raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets") assert " " not in 
raw_tag # assert the tag string remove all whitespace chars. # Test appending dataset tag - iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id) + iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] # Test log repetitive dataset - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] class FakeEvauator1(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() class FakeEvauator2(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() @@ -322,7 +452,7 @@ def _load_content_from_file(self, local_artifact_path): raise RuntimeError() -def test_evaluator_interface(classifier_model_uri, iris_dataset): +def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1} ): @@ -337,27 +467,33 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: with mlflow.start_run(): - evaluate( - classifier_model_uri, - "classifier", - iris_dataset, - run_id=None, - evaluators="test_evaluator1", - evaluator_config=evaluator1_config, + with pytest.raises( + ValueError, + match="The model could not be evaluated by any of the registered evaluators", + ): + evaluate( + model=multiclass_logistic_regressor_model_uri, + model_type="classifier", + dataset=iris_dataset, + run_id=None, + evaluators="test_evaluator1", + evaluator_config=evaluator1_config, + ) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config ) - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) mock_evaluate.assert_not_called() with mock.patch.object( FakeEvauator1, "can_evaluate", return_value=True ) as mock_can_evaluate, mock.patch.object( FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -365,13 +501,19 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): assert 
eval1_result.metrics == evaluator1_return_value.metrics assert eval1_result.artifacts == evaluator1_return_value.artifacts - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate.assert_called_once_with( - classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator1_config + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, ) -def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): +def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", @@ -399,12 +541,12 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): ) as mock_can_evaluate2, mock.patch.object( FakeEvauator2, "evaluate", return_value=evaluator2_return_value ) as mock_evaluate2: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators=evaluators, evaluator_config={ @@ -420,21 +562,26 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): **evaluator1_return_value.artifacts, **evaluator2_return_value.artifacts, } - mock_can_evaluate1.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate1.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate1.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator1_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, + ) + mock_can_evaluate2.assert_called_once_with( + model_type="classifier", + evaluator_config=evaluator2_config, ) - mock_can_evaluate2.assert_called_once_with("classifier", evaluator2_config) mock_evaluate2.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator2_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator2_config, ) @@ -462,3 +609,50 @@ def test_start_run_or_reuse_active_run(): with pytest.raises(ValueError, match="An active run exists"): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass + + +def test_normalize_evaluators_and_evaluator_config_args(): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + + with mock.patch.object( + _model_evaluation_registry, + "_registry", + {"default": DefaultEvaluator}, + ): + assert _normalize_config(None, None) == (["default"], {}) + assert _normalize_config(None, {"a": 3}) == (["default"], {"default": {"a": 3}}) + assert _normalize_config(None, {"default": {"a": 3}}) == ( + ["default"], + {"default": {"a": 3}}, + ) + + assert _normalize_config(None, None) == (["default", "dummy_evaluator"], {}) + with pytest.raises( + ValueError, match="`evaluator_config` argument must be a dictionary mapping each evaluator" + ): + assert _normalize_config(None, {"a": 3}) == (["default", "dummy_evaluator"], {}) + + assert _normalize_config(None, 
{"default": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"default": {"a": 3}}, + ) + + with mock.patch.object(_base_logger, "warning") as patched_warning_fn: + _normalize_config(None, None) + patched_warning_fn.assert_called_once() + assert "Multiple registered evaluators are found" in patched_warning_fn.call_args[0][0] + + assert _normalize_config("dummy_evaluator", {"a": 3}) == ( + ["dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) + + assert _normalize_config(["default", "dummy_evaluator"], {"dummy_evaluator": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) + + with pytest.raises( + ValueError, match="evaluator_config must be a dict contains mapping from evaluator name to" + ): + _normalize_config(["default", "dummy_evaluator"], {"abc": {"a": 3}}) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index bb3c8a8875849..c88bd21d09321 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -23,7 +23,8 @@ def _load_content_from_file(self, local_artifact_path): class DummyEvaluator(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + # pylint: disable=unused-argument + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] def _log_metrics(self, run_id, metrics, dataset_name): @@ -40,11 +41,13 @@ def _log_metrics(self, run_id, metrics, dataset_name): ], ) + # pylint: disable=unused-argument def evaluate( - self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs + self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs ) -> EvaluationResult: client = mlflow.tracking.MlflowClient() - X, y = dataset._extract_features_and_labels() + X = dataset.features_data + y = dataset.labels_data y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred)