From e2c5aff1622c869e67feeb5f23be2af2bba8aac3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 15 Nov 2021 21:46:22 +0800 Subject: [PATCH 001/120] init Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 227 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 mlflow/evaluation.py diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py new file mode 100644 index 0000000000000..9d12a49d2b073 --- /dev/null +++ b/mlflow/evaluation.py @@ -0,0 +1,227 @@ +from collections.abc import Mapping +from typing import List +import entrypoints +import warnings +import mlflow +from mlflow.exceptions import MlflowException + + +class EvaluationMetrics(dict): + pass + + +class EvaluationArtifact: + + @property + def content(self): + """ + The content of the artifact (representation varies) + """ + raise NotImplementedError() + + @property + def location(self) -> str: + """ + The location of the artifact + """ + raise NotImplementedError() + + +class EvaluationResult: + + @classmethod + def load(cls, path): + """Load the evaluation results from the specified local filesystem path""" + raise NotImplementedError() + + def save(self, path): + """Write the evaluation results to the specified local filesystem path""" + # We will likely avoid serializing artifacts themselves, just locations. + # Deserialization will resolve locations to artifact contents. + raise NotImplementedError() + + @property + def metrics(self) -> EvaluationMetrics: + """ + A dictionary mapping scalar metric names to scalar metric values + """ + raise NotImplementedError() + + @property + def artifacts(self) -> Mapping[str, EvaluationArtifact]: + """ + A dictionary mapping standardized artifact names (e.g. "roc_data") to + artifact content and location information + """ + raise NotImplementedError() + + +class EvaluationDataset: + """ + Represents an input dataset for model evaluation. This is intended for + use with the `mlflow.evaluate()`API. + """ + + def __init__(self, data, labels=None, name=None): + """ + :param data: One of the following: + - A numpy array or list of evaluation features, excluding labels. + - A Pandas DataFrame, or the path to a serialized DataFrame, + containing evaluation features and labels. + + :param labels: One of the following: + - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. + - The string name of a column from `data` that contains evaluation labels, if `data` + is a DataFrame. + + :param name: (Optional) The name of the dataset (must not contain "). + """ + self.name = name + self.data = data + self.labels = labels + + +class ModelEvaluator: + + def can_evaluate( + model_type, evaluator_config=None, **kwargs + ) -> bool: + """ + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: True if the evaluator can evaluate the specified model on the + specified dataset. False otherwise. + """ + raise NotImplementedError() + + def evaluate( + predict, dataset, run_id, evaluator_config=None, **kwargs + ) -> EvaluationResult: + """ + :param predict: A function used to compute model predictions. Predict + accepts features from the specified `dataset` and + feeds them to the model, producing output predictions. 
+ :param dataset: An instance of `EvaluationDataset` containing features + and labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: An `EvaluationResult` instance containing evaluation results. + """ + raise NotImplementedError() + + +class ModelEvaluatorRegistry: + """ + Scheme-based registry for model evaluator implementations + """ + + def __init__(self): + self._registry = {} + + def register(self, scheme, evaluator): + """Register model evaluator provided by other packages""" + self._registry[scheme] = evaluator + + def register_entrypoints(self): + # Register artifact repositories provided by other packages + for entrypoint in entrypoints.get_group_all("mlflow.model_evaluator"): + try: + self.register(entrypoint.name, entrypoint.load()) + except (AttributeError, ImportError) as exc: + warnings.warn( + 'Failure attempting to register model evaluator for scheme "{}": {}'.format( + entrypoint.name, str(exc) + ), + stacklevel=2, + ) + + def get_evaluator(self, evaluator_name): + """ + Get an evaluator instance from the registry based on the name of evaluator + """ + evaluator_cls = self._registry.get(evaluator_name) + if evaluator_cls is None: + raise MlflowException( + "Could not find a registered model evaluator for: {}. " + "Currently registered evaluator names are: {}".format( + evaluator_name, list(self._registry.keys()) + ) + ) + return evaluator_cls() + + +_model_evaluation_registry = ModelEvaluatorRegistry() + + +def evaluate( + model, model_type, dataset, run_id=None, evaluators=None, evaluator_config=None +) -> EvaluationResult | Mapping[str, EvaluationResult]: + """ + :param model: A model supported by the specified `evaluator`, or a URI + referring to such a model. The default evaluator supports the + following: + + - A pyfunc model instance (an instance of class `PyFuncModel`) + + :param model_type: A string describing the model type. The default evaluator + supports "regressor" and "classifier" as model types. + :param dataset: An instance of `EvaluationDataset` containing features + labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. If + unspecified, behavior depends on the specified `evaluator`. + When `run_id` is unspecified, the default evaluator logs + results to the current active run, creating a new active run if + one does not exist. + :param evaluators: The name of the evaluator to use for model evaluations, or + a list of evaluator names. If unspecified, all evaluators + capable of evaluating the specified model on the specified + dataset are used. + :param evaluator_config: A dictionary of additional configurations to supply + to the evaluator. If multiple evaluators are + specified, each configuration should be supplied as + a nested dictionary whose key is the evaluator name. + :return: An `EvaluationResult` instance containing evaluation results. 
+ """ + if evaluators is None: + evaluators = 'default_evaluator' + + if not isinstance(evaluators, list): + evaluators = [evaluators] + + if isinstance(model, str): + model = mlflow.pyfunc.load_model(model) + + predict = model.predict + + eval_results = {} + for evaluator_name in evaluators: + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + eval_results[evaluator_name] = None + continue + + if evaluator.can_evaluate(model_type, evaluator_config): + result = evaluator.evaluate(predict, dataset, run_id, evaluator_config) + eval_results[evaluator_name] = result + else: + eval_results[evaluator_name] = None + + if len(evaluators) > 1: + return eval_results + else: + return eval_results[evaluators[0]] + + + + + From 91a6e1c5ebccd19c5d79c6e23521bd388c38dbcf Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 15 Nov 2021 22:32:19 +0800 Subject: [PATCH 002/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 17 +++--- .../mlflow_test_plugin/dummy_evaluator.py | 58 +++++++++++++++++++ 2 files changed, 65 insertions(+), 10 deletions(-) create mode 100644 tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index 9d12a49d2b073..bf6afed5684d5 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -1,5 +1,4 @@ from collections.abc import Mapping -from typing import List import entrypoints import warnings import mlflow @@ -84,7 +83,7 @@ def __init__(self, data, labels=None, name=None): class ModelEvaluator: def can_evaluate( - model_type, evaluator_config=None, **kwargs + self, model_type, evaluator_config=None, **kwargs ) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", @@ -100,7 +99,7 @@ def can_evaluate( raise NotImplementedError() def evaluate( - predict, dataset, run_id, evaluator_config=None, **kwargs + self, predict, dataset, run_id, evaluator_config=None, **kwargs ) -> EvaluationResult: """ :param predict: A function used to compute model predictions. 
Predict @@ -160,6 +159,7 @@ def get_evaluator(self, evaluator_name): _model_evaluation_registry = ModelEvaluatorRegistry() +_model_evaluation_registry.register_entrypoints() def evaluate( @@ -196,6 +196,7 @@ def evaluate( if not isinstance(evaluators, list): evaluators = [evaluators] + evaluator_config = {evaluators[0]: evaluator_config} if isinstance(model, str): model = mlflow.pyfunc.load_model(model) @@ -204,14 +205,15 @@ def evaluate( eval_results = {} for evaluator_name in evaluators: + config = evaluator_config[evaluator_name] try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) except MlflowException: eval_results[evaluator_name] = None continue - if evaluator.can_evaluate(model_type, evaluator_config): - result = evaluator.evaluate(predict, dataset, run_id, evaluator_config) + if evaluator.can_evaluate(model_type, config): + result = evaluator.evaluate(predict, dataset, run_id, config) eval_results[evaluator_name] = result else: eval_results[evaluator_name] = None @@ -220,8 +222,3 @@ def evaluate( return eval_results else: return eval_results[evaluators[0]] - - - - - diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py new file mode 100644 index 0000000000000..557bf64f34a9b --- /dev/null +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -0,0 +1,58 @@ +import mlflow +from mlflow.evaluation import ModelEvaluator, EvaluationMetrics, \ + EvaluationArtifact, EvaluationResult, EvaluationDataset + + +class DummyEvaluationResult(EvaluationResult): + def __init__(self, predict, dataset, run_id, evaluator_config): + # Dummy implementation, only store arguments passed to `evaluate`, for testing + self.predict = predict + self.dataset = dataset + self.run_id = run_id + self.evaluator_config = evaluator_config + + +class DummyClassifierEvaluationResult(DummyEvaluationResult) + + +class DummyClassifierEvaluator(ModelEvaluator): + + def can_evaluate( + self, model_type, evaluator_config=None, **kwargs + ): + return model_type == 'classifier' and evaluator_config.get('can_evaluate') + + def _evaluate(self, predict, dataset, run_id, evaluator_config): + return DummyClassifierEvaluationResult() + + def evaluate( + self, predict, dataset, run_id, evaluator_config=None + ): + if run_id is not None: + return self._evaluate(self, predict, dataset, run_id, evaluator_config) + else: + with mlflow.start_run() as run: + return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) + + +class DummyRegressorEvaluationResult(DummyEvaluationResult) + + +class DummyRegressorEvaluator(ModelEvaluator): + + def can_evaluate( + self, model_type, evaluator_config=None, **kwargs + ): + return model_type == 'regressor' and evaluator_config.get('can_evaluate') + + def _evaluate(self, predict, dataset, run_id, evaluator_config): + return DummyRegressorEvaluationResult() + + def evaluate( + self, predict, dataset, run_id, evaluator_config=None + ): + if run_id is not None: + return self._evaluate(self, predict, dataset, run_id, evaluator_config) + else: + with mlflow.start_run() as run: + return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) From 071e86bbad744b8f7111488d39cd744f51101174 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 08:57:09 +0800 Subject: [PATCH 003/120] update Signed-off-by: Weichen Xu --- .../mlflow_test_plugin/dummy_evaluator.py | 6 ++++-- 
tests/resources/mlflow-test-plugin/setup.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 557bf64f34a9b..96cc27cab5432 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -12,7 +12,8 @@ def __init__(self, predict, dataset, run_id, evaluator_config): self.evaluator_config = evaluator_config -class DummyClassifierEvaluationResult(DummyEvaluationResult) +class DummyClassifierEvaluationResult(DummyEvaluationResult): + pass class DummyClassifierEvaluator(ModelEvaluator): @@ -35,7 +36,8 @@ def evaluate( return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) -class DummyRegressorEvaluationResult(DummyEvaluationResult) +class DummyRegressorEvaluationResult(DummyEvaluationResult): + pass class DummyRegressorEvaluator(ModelEvaluator): diff --git a/tests/resources/mlflow-test-plugin/setup.py b/tests/resources/mlflow-test-plugin/setup.py index 61af2d9bfc593..3fda5b7b33e8a 100644 --- a/tests/resources/mlflow-test-plugin/setup.py +++ b/tests/resources/mlflow-test-plugin/setup.py @@ -26,5 +26,9 @@ "mlflow.project_backend": "dummy-backend=mlflow_test_plugin.dummy_backend:PluginDummyProjectBackend", # noqa # Define a MLflow model deployment plugin for target 'faketarget' "mlflow.deployments": "faketarget=mlflow_test_plugin.fake_deployment_plugin", + "mlflow.model_evaluator": [ + "dummy_classifier_evaluator=mlflow_test_plugin.dummy_evaluator:DummyClassifierEvaluator", # noqa + "dummy_regressor_evaluator=mlflow_test_plugin.dummy_evaluator:DummyRegressorEvaluator", # noqa + ], }, ) From e7d8a76fbf89b6716ea888f73d6c948d41934249 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 16:53:22 +0800 Subject: [PATCH 004/120] update Signed-off-by: Weichen Xu --- .../mlflow_test_plugin/dummy_evaluator.py | 94 ++++++++++++++----- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 96cc27cab5432..fe88c9648e781 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,30 +1,76 @@ import mlflow from mlflow.evaluation import ModelEvaluator, EvaluationMetrics, \ EvaluationArtifact, EvaluationResult, EvaluationDataset +from sklearn import metrics as sk_metrics +import numpy as np +import pickle + + +class DummyEvaluationArtifact(EvaluationArtifact): + + def __init__(self, content, location): + self.content = content + self.location = location + + @property + def content(self): + return self.content + + @property + def location(self) -> str: + return self.location class DummyEvaluationResult(EvaluationResult): - def __init__(self, predict, dataset, run_id, evaluator_config): - # Dummy implementation, only store arguments passed to `evaluate`, for testing - self.predict = predict - self.dataset = dataset - self.run_id = run_id - self.evaluator_config = evaluator_config + def __init__(self, metric_values, artifact_content, artifact_location): + self.metric_values = metric_values + self.artifact_content = artifact_content + self.artifact_location = artifact_location + + @classmethod + def load(cls, path): + with open(path, 'r') 
as f: + obj = pickle.load(f) + return obj -class DummyClassifierEvaluationResult(DummyEvaluationResult): - pass + def save(self, path): + with open(path, 'w') as f: + # TODO: skip dump artifact, instead, download artifact content when loading + pickle.dump(self, f) + @property + def metrics(self): + return self.metric_values() -class DummyClassifierEvaluator(ModelEvaluator): + @property + def artifacts(self): + return DummyEvaluationArtifact( + content=self.artifact_content, + location=self.artifact_location + ) + + +class DummyEvaluator(ModelEvaluator): def can_evaluate( self, model_type, evaluator_config=None, **kwargs ): - return model_type == 'classifier' and evaluator_config.get('can_evaluate') + return evaluator_config.get('can_evaluate') def _evaluate(self, predict, dataset, run_id, evaluator_config): - return DummyClassifierEvaluationResult() + X = dataset.data + assert isinstance(X, np.ndarray), 'Only support array type feature input' + assert dataset.name is not None, 'Dataset name required' + y = dataset.labels + y_pred = predict(X) + + metrics_to_calc = evaluator_config.get('metrics_to_calc') + metric_values = {} + for metric_name in metrics_to_calc: + metric_values[metric_name] = getattr(sk_metrics, metric_name)(y, y_pred) + + return DummyEvaluationResult(metric_values) def evaluate( self, predict, dataset, run_id, evaluator_config=None @@ -36,25 +82,21 @@ def evaluate( return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) -class DummyRegressorEvaluationResult(DummyEvaluationResult): - pass - - -class DummyRegressorEvaluator(ModelEvaluator): +class DummyRegressorEvaluator(DummyEvaluator): def can_evaluate( self, model_type, evaluator_config=None, **kwargs ): - return model_type == 'regressor' and evaluator_config.get('can_evaluate') + return model_type == 'regressor' and super(DummyRegressorEvaluator, self).can_evaluate( + model_type, evaluator_config=None, **kwargs + ) - def _evaluate(self, predict, dataset, run_id, evaluator_config): - return DummyRegressorEvaluationResult() - def evaluate( - self, predict, dataset, run_id, evaluator_config=None +class DummyClassifierEvaluator(DummyEvaluator): + + def can_evaluate( + self, model_type, evaluator_config=None, **kwargs ): - if run_id is not None: - return self._evaluate(self, predict, dataset, run_id, evaluator_config) - else: - with mlflow.start_run() as run: - return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) + return model_type == 'classifier' and super(DummyClassifierEvaluator, self).can_evaluate( + model_type, evaluator_config=None, **kwargs + ) From 7499ac6e02f39403f72bc1794766b59bf2be626e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 17:06:40 +0800 Subject: [PATCH 005/120] update Signed-off-by: Weichen Xu --- .../mlflow_test_plugin/dummy_evaluator.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index fe88c9648e781..5912326531a60 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,6 +1,7 @@ import mlflow from mlflow.evaluation import ModelEvaluator, EvaluationMetrics, \ EvaluationArtifact, EvaluationResult, EvaluationDataset +from mlflow.tracking import artifact_utils from sklearn import metrics as sk_metrics import numpy as np import pickle @@ 
-66,11 +67,24 @@ def _evaluate(self, predict, dataset, run_id, evaluator_config): y_pred = predict(X) metrics_to_calc = evaluator_config.get('metrics_to_calc') + + client = mlflow.tracking.MlflowClient() metric_values = {} for metric_name in metrics_to_calc: - metric_values[metric_name] = getattr(sk_metrics, metric_name)(y, y_pred) + metric_value = getattr(sk_metrics, metric_name)(y, y_pred) + metric_values[metric_name] = metric_value + metric_key = f'{metric_name}_on_{dataset.name}' + client.log_metric(run_id=run_id, key=metric_key, value=metric_value) + + client.log_dict(run_id, metric_values, 'metrics_artifact') - return DummyEvaluationResult(metric_values) + # TODO: log `mlflow.datasets` tag containing a list of metadata for all datasets + + return DummyEvaluationResult( + metric_values=metric_values, + artifact_content=metric_values, + artifact_location=artifact_utils.get_artifact_uri('metrics_artifact') + ) def evaluate( self, predict, dataset, run_id, evaluator_config=None From f1819a4df7566ec022dc95a5d9fdd99edc29a05e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 22:03:08 +0800 Subject: [PATCH 006/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 6 +- .../mlflow_test_plugin/dummy_evaluator.py | 30 ++------- tests/test_evaluation.py | 66 +++++++++++++++++++ 3 files changed, 76 insertions(+), 26 deletions(-) create mode 100644 tests/test_evaluation.py diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index bf6afed5684d5..1be839eea2d57 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -1,4 +1,4 @@ -from collections.abc import Mapping +from typing import Dict, Union import entrypoints import warnings import mlflow @@ -47,7 +47,7 @@ def metrics(self) -> EvaluationMetrics: raise NotImplementedError() @property - def artifacts(self) -> Mapping[str, EvaluationArtifact]: + def artifacts(self) -> Dict[str, EvaluationArtifact]: """ A dictionary mapping standardized artifact names (e.g. "roc_data") to artifact content and location information @@ -164,7 +164,7 @@ def get_evaluator(self, evaluator_name): def evaluate( model, model_type, dataset, run_id=None, evaluators=None, evaluator_config=None -) -> EvaluationResult | Mapping[str, EvaluationResult]: +) -> Union[EvaluationResult, Dict[str, EvaluationResult]]: """ :param model: A model supported by the specified `evaluator`, or a URI referring to such a model. 
The default evaluator supports the diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 5912326531a60..0dfb3402c56b1 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -57,7 +57,8 @@ class DummyEvaluator(ModelEvaluator): def can_evaluate( self, model_type, evaluator_config=None, **kwargs ): - return evaluator_config.get('can_evaluate') + return evaluator_config.get('can_evaluate') and \ + model_type in ['classifier', 'regressor'] def _evaluate(self, predict, dataset, run_id, evaluator_config): X = dataset.data @@ -76,14 +77,14 @@ def _evaluate(self, predict, dataset, run_id, evaluator_config): metric_key = f'{metric_name}_on_{dataset.name}' client.log_metric(run_id=run_id, key=metric_key, value=metric_value) - client.log_dict(run_id, metric_values, 'metrics_artifact') + client.log_dict(run_id, metric_values, 'metrics_artifact.json') # TODO: log `mlflow.datasets` tag containing a list of metadata for all datasets return DummyEvaluationResult( metric_values=metric_values, artifact_content=metric_values, - artifact_location=artifact_utils.get_artifact_uri('metrics_artifact') + artifact_location=artifact_utils.get_artifact_uri(run_id, 'metrics_artifact.json') ) def evaluate( @@ -91,26 +92,9 @@ def evaluate( ): if run_id is not None: return self._evaluate(self, predict, dataset, run_id, evaluator_config) + elif mlflow.active_run() is not None: + return self._evaluate(self, predict, dataset, mlflow.active_run().info.run_id, + evaluator_config) else: with mlflow.start_run() as run: return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) - - -class DummyRegressorEvaluator(DummyEvaluator): - - def can_evaluate( - self, model_type, evaluator_config=None, **kwargs - ): - return model_type == 'regressor' and super(DummyRegressorEvaluator, self).can_evaluate( - model_type, evaluator_config=None, **kwargs - ) - - -class DummyClassifierEvaluator(DummyEvaluator): - - def can_evaluate( - self, model_type, evaluator_config=None, **kwargs - ): - return model_type == 'classifier' and super(DummyClassifierEvaluator, self).can_evaluate( - model_type, evaluator_config=None, **kwargs - ) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py new file mode 100644 index 0000000000000..3acf449082fa7 --- /dev/null +++ b/tests/test_evaluation.py @@ -0,0 +1,66 @@ +import mlflow + +from mlflow.evaluation import evaluate, EvaluationDataset +import sklearn +import sklearn.datasets +import sklearn.linear_model +import pytest + +from tests.sklearn.test_sklearn_autolog import get_iris, get_run_data, load_json_artifact +from sklearn.metrics import mean_absolute_error, mean_squared_error + + +@pytest.fixture(scope="module") +def regressor_model(): + X, y = get_iris() + reg = sklearn.linear_model.LinearRegression() + reg.fit(X, y) + return reg + + +@pytest.fixture(scope="module") +def classifier_model(): + X, y = get_iris() + clf = sklearn.linear_model.LogisticRegression() + clf.fit(X, y) + return clf + + +@pytest.fixture(scope="module") +def evaluation_dataset(): + X, y = get_iris() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name='eval_data_1') + + +def test_reg_evaluate(regressor_model, evaluation_dataset): + y_true = evaluation_dataset.labels + y_pred = regressor_model.predict(evaluation_dataset.data) + expected_mae 
= mean_absolute_error(y_true, y_pred) + expected_mse = mean_squared_error(y_true, y_pred) + expected_metrics = { + 'mean_absolute_error': expected_mae, + 'mean_squared_error': expected_mse, + } + + expected_artifact = expected_metrics + + with mlflow.start_run() as run: + eval_result = evaluate( + regressor_model, 'regressor', evaluation_dataset, + run_id=None, evaluators='dummy_regressor_evaluator', + evaluator_config={ + 'can_evaluate': True, + 'metrics_to_calc': ['mean_absolute_error', 'mean_squared_error'] + } + ) + saved_artifact_uri = mlflow.get_artifact_uri('metrics_artifact.json') + saved_artifact = load_json_artifact('metrics_artifact.json') + assert saved_artifact == expected_artifact + + _, saved_metrics, _, _ = get_run_data(run.info.run_id) + assert saved_metrics == expected_metrics + + assert eval_result.metrics == expected_metrics + assert eval_result.artifacts.content == expected_artifact + assert eval_result.artifacts.location == saved_artifact_uri From aa397e132ee7ee800da4774532d533065963192b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 22:15:38 +0800 Subject: [PATCH 007/120] update Signed-off-by: Weichen Xu --- .../mlflow_test_plugin/dummy_evaluator.py | 16 ++++++++-------- tests/resources/mlflow-test-plugin/setup.py | 5 +---- tests/test_evaluation.py | 8 ++++++-- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 0dfb3402c56b1..46062c33a24df 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -10,16 +10,16 @@ class DummyEvaluationArtifact(EvaluationArtifact): def __init__(self, content, location): - self.content = content - self.location = location + self._content = content + self._location = location @property def content(self): - return self.content + return self._content @property def location(self) -> str: - return self.location + return self._location class DummyEvaluationResult(EvaluationResult): @@ -42,7 +42,7 @@ def save(self, path): @property def metrics(self): - return self.metric_values() + return self.metric_values @property def artifacts(self): @@ -91,10 +91,10 @@ def evaluate( self, predict, dataset, run_id, evaluator_config=None ): if run_id is not None: - return self._evaluate(self, predict, dataset, run_id, evaluator_config) + return self._evaluate(predict, dataset, run_id, evaluator_config) elif mlflow.active_run() is not None: - return self._evaluate(self, predict, dataset, mlflow.active_run().info.run_id, + return self._evaluate(predict, dataset, mlflow.active_run().info.run_id, evaluator_config) else: with mlflow.start_run() as run: - return self._evaluate(self, predict, dataset, run.info.run_id, evaluator_config) + return self._evaluate(predict, dataset, run.info.run_id, evaluator_config) diff --git a/tests/resources/mlflow-test-plugin/setup.py b/tests/resources/mlflow-test-plugin/setup.py index 3fda5b7b33e8a..7668f25406f9c 100644 --- a/tests/resources/mlflow-test-plugin/setup.py +++ b/tests/resources/mlflow-test-plugin/setup.py @@ -26,9 +26,6 @@ "mlflow.project_backend": "dummy-backend=mlflow_test_plugin.dummy_backend:PluginDummyProjectBackend", # noqa # Define a MLflow model deployment plugin for target 'faketarget' "mlflow.deployments": "faketarget=mlflow_test_plugin.fake_deployment_plugin", - "mlflow.model_evaluator": [ - 
"dummy_classifier_evaluator=mlflow_test_plugin.dummy_evaluator:DummyClassifierEvaluator", # noqa - "dummy_regressor_evaluator=mlflow_test_plugin.dummy_evaluator:DummyRegressorEvaluator", # noqa - ], + "mlflow.model_evaluator": "dummy_evaluator=mlflow_test_plugin.dummy_evaluator:DummyEvaluator", }, ) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 3acf449082fa7..4c7442c1f1f67 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -42,13 +42,17 @@ def test_reg_evaluate(regressor_model, evaluation_dataset): 'mean_absolute_error': expected_mae, 'mean_squared_error': expected_mse, } + expected_saved_metrics = { + 'mean_absolute_error_on_eval_data_1': expected_mae, + 'mean_squared_error_on_eval_data_1': expected_mse, + } expected_artifact = expected_metrics with mlflow.start_run() as run: eval_result = evaluate( regressor_model, 'regressor', evaluation_dataset, - run_id=None, evaluators='dummy_regressor_evaluator', + run_id=None, evaluators='dummy_evaluator', evaluator_config={ 'can_evaluate': True, 'metrics_to_calc': ['mean_absolute_error', 'mean_squared_error'] @@ -59,7 +63,7 @@ def test_reg_evaluate(regressor_model, evaluation_dataset): assert saved_artifact == expected_artifact _, saved_metrics, _, _ = get_run_data(run.info.run_id) - assert saved_metrics == expected_metrics + assert saved_metrics == expected_saved_metrics assert eval_result.metrics == expected_metrics assert eval_result.artifacts.content == expected_artifact From 981164a8f52e195d5d745808d1fd6c73f8b355bc Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 16 Nov 2021 22:41:06 +0800 Subject: [PATCH 008/120] update Signed-off-by: Weichen Xu --- tests/test_evaluation.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 4c7442c1f1f67..301ddbad336a7 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -6,10 +6,31 @@ import sklearn.linear_model import pytest -from tests.sklearn.test_sklearn_autolog import get_iris, get_run_data, load_json_artifact from sklearn.metrics import mean_absolute_error, mean_squared_error +def get_iris(): + iris = sklearn.datasets.load_iris() + return iris.data[:, :2], iris.target + + +def get_run_data(run_id): + client = mlflow.tracking.MlflowClient() + data = client.get_run(run_id).data + # Ignore tags mlflow logs by default (e.g. 
"mlflow.user") + tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")} + artifacts = [f.path for f in client.list_artifacts(run_id)] + return data.params, data.metrics, tags, artifacts + + +def load_json_artifact(artifact_path): + import json + + fpath = mlflow.get_artifact_uri(artifact_path).replace("file://", "") + with open(fpath, "r") as f: + return json.load(f) + + @pytest.fixture(scope="module") def regressor_model(): X, y = get_iris() From 593bec81df09be1b101df24e081f9ff76bb780d3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 17 Nov 2021 22:47:13 +0800 Subject: [PATCH 009/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 172 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 21 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index 1be839eea2d57..10c4349022be8 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -2,57 +2,94 @@ import entrypoints import warnings import mlflow +import hashlib +import time +import numpy as np +import pandas as pd +import pickle from mlflow.exceptions import MlflowException +from mlflow.utils.file_utils import TempDir +from mlflow.entities import Metric +from mlflow.tracking.artifact_utils import get_artifact_uri, _download_artifact_from_uri class EvaluationMetrics(dict): pass -class EvaluationArtifact: +class EvaluationArtifacts: + + def __init__(self, location, content=None): + self._content = content + self._location = location + + def load_content_from_file(self, local_artifact_file): + raise NotImplementedError() @property def content(self): """ The content of the artifact (representation varies) """ - raise NotImplementedError() + if self._content is None: + with TempDir() as temp_dir: + local_artifact_file = temp_dir.path('local_artifact') + _download_artifact_from_uri(self._location, local_artifact_file) + self._content = self.load_content_from_file(local_artifact_file) + + return self._content @property def location(self) -> str: """ The location of the artifact """ - raise NotImplementedError() + return self._location + + def __getstate__(self, state): + state = state.__dict__.copy() + # skip pickling artifact content + del state['_content'] + return state + + def __setstate__(self, state): + self.__dict__.update(state) class EvaluationResult: + def __init__(self, metrics, artifacts): + self._metrics = metrics + self._artifacts = artifacts + @classmethod def load(cls, path): """Load the evaluation results from the specified local filesystem path""" - raise NotImplementedError() + with open(path, 'r') as f: + obj = pickle.load(f) + return obj def save(self, path): """Write the evaluation results to the specified local filesystem path""" # We will likely avoid serializing artifacts themselves, just locations. # Deserialization will resolve locations to artifact contents. - raise NotImplementedError() + with open(path, 'w') as f: + pickle.dump(self, f) @property def metrics(self) -> EvaluationMetrics: """ A dictionary mapping scalar metric names to scalar metric values """ - raise NotImplementedError() + return self._metrics @property - def artifacts(self) -> Dict[str, EvaluationArtifact]: + def artifacts(self) -> Dict[str, EvaluationArtifacts]: """ A dictionary mapping standardized artifact names (e.g. "roc_data") to artifact content and location information """ - raise NotImplementedError() + return self._artifacts class EvaluationDataset: @@ -61,12 +98,15 @@ class EvaluationDataset: use with the `mlflow.evaluate()`API. 
""" - def __init__(self, data, labels=None, name=None): + NUM_SAMPLE_ROWS_FOR_HASH = 5 + + def __init__(self, data, labels=None, name=None, path=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. - A Pandas DataFrame, or the path to a serialized DataFrame, - containing evaluation features and labels. + containing evaluation features and labels. All columns will be regarded as feature + columns except the "labels" column. :param labels: One of the following: - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. @@ -74,10 +114,74 @@ def __init__(self, data, labels=None, name=None): is a DataFrame. :param name: (Optional) The name of the dataset (must not contain "). + + :param path: (Optional) the path to a serialized DataFrame + (e.g. a delta table, parquet file) """ self.name = name self.data = data self.labels = labels + self.path = path + + @staticmethod + def _gen_md5_for_arraylike_obj(md5_gen, data): + md5_gen.update(pickle.dumps(len(data))) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(pickle.dumps(data)) + else: + md5_gen.update(pickle.dumps(data[:EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) + md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:])) + + @property + def hash(self): + """ + Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, + dataset size and feeding them through a cheap, low-collision hash function + """ + md5_gen = hashlib.md5() + if isinstance(self.data, np.ndarray): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) + elif isinstance(self.data, pd.DataFrame): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + md5_gen.update(self.labels.encode('UTF-8')) + return md5_gen.digest() + + @property + def metadata(self): + return { + 'name': self.name, + 'hash': self.hash, + 'path': self.path, + } + + +class GetOrCreateRunId: + """ + Get or create a run, return a run_id + if user specified a run_id, use it. + otherwise if there's an active run, use it + otherwise create a managed run. + """ + def __init__(self, run_id): + self.managed_run_context = None + if run_id is not None: + self.run_id = run_id + elif mlflow.active_run() is not None: + self.run_id = mlflow.active_run().info.run_id + else: + self.run_id = None + + def __enter__(self): + if self.run_id is not None: + return self.run_id + else: + self.managed_run_context = mlflow.start_run() + return self.managed_run_context.__enter__().info.run_id + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.managed_run_context is not None: + return self.managed_run_context.__exit__(exc_type, exc_val, exc_tb) class ModelEvaluator: @@ -98,8 +202,21 @@ def can_evaluate( """ raise NotImplementedError() + def compute_metrics(self, predict, dataset): + """ + return an instance of EvaluationMetrics + """ + raise NotImplementedError() + + def compute_and_log_artifacts(self, predict, dataset, run_id, mlflow_client): + """ + compute and log artifact, and return a dict of + artifact_name -> instance_of_EvaluationArtifacts + """ + raise NotImplementedError() + def evaluate( - self, predict, dataset, run_id, evaluator_config=None, **kwargs + self, predict, dataset, run_id=None, evaluator_config=None, **kwargs ) -> EvaluationResult: """ :param predict: A function used to compute model predictions. 
Predict @@ -115,7 +232,21 @@ def evaluate( in the future. :return: An `EvaluationResult` instance containing evaluation results. """ - raise NotImplementedError() + client = mlflow.tracking.MlflowClient() + with GetOrCreateRunId(run_id) as run_id: + metrics_dict = self.compute_metrics(predict, dataset) + timestamp = int(time.time() * 1000) + dataset_id = dataset.name if dataset.name is not None else dataset.hash + # TODO: log tags of dataset metadata + client.log_batch( + run_id, + metrics=[ + Metric(key=f'{key}_on_{dataset_id}', value=value, timestamp=timestamp, step=0) + for key, value in metrics_dict + ], + ) + artifact_dict = self.compute_and_log_artifact(predict, dataset, run_id, client) + return EvaluationResult(metrics_dict, artifact_dict) class ModelEvaluatorRegistry: @@ -203,22 +334,21 @@ def evaluate( predict = model.predict - eval_results = {} + eval_results = [] for evaluator_name in evaluators: config = evaluator_config[evaluator_name] try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) except MlflowException: - eval_results[evaluator_name] = None continue if evaluator.can_evaluate(model_type, config): result = evaluator.evaluate(predict, dataset, run_id, config) - eval_results[evaluator_name] = result - else: - eval_results[evaluator_name] = None + eval_results.append(result) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) - if len(evaluators) > 1: - return eval_results - else: - return eval_results[evaluators[0]] + return merged_eval_result From 2240d0aa6617d1796c86e9f7701a12a3292a98ef Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 18 Nov 2021 18:13:35 +0800 Subject: [PATCH 010/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 106 ++++++++++++++++++++++++++++----------- mlflow/utils/__init__.py | 9 ++++ 2 files changed, 85 insertions(+), 30 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index 10c4349022be8..8213e4cdd80a0 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -7,23 +7,30 @@ import numpy as np import pandas as pd import pickle +import json +import os from mlflow.exceptions import MlflowException from mlflow.utils.file_utils import TempDir -from mlflow.entities import Metric -from mlflow.tracking.artifact_utils import get_artifact_uri, _download_artifact_from_uri +from mlflow.entities import Metric, RunTag +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name, load_class class EvaluationMetrics(dict): pass -class EvaluationArtifacts: +class EvaluationArtifact: def __init__(self, location, content=None): self._content = content self._location = location - def load_content_from_file(self, local_artifact_file): + @classmethod + def load_content_from_file(self, local_artifact_path): + raise NotImplementedError() + + def save_content_to_file(self, content, output_artifact_path): raise NotImplementedError() @property @@ -52,9 +59,6 @@ def __getstate__(self, state): del state['_content'] return state - def __setstate__(self, state): - self.__dict__.update(state) - class EvaluationResult: @@ -65,16 +69,40 @@ def __init__(self, metrics, artifacts): @classmethod def load(cls, path): """Load the evaluation results from the specified local filesystem path""" - with open(path, 'r') as f: - obj = pickle.load(f) - return obj + with open(os.path.join(path, 
'metrics.json'), 'r') as fp: + metrics = EvaluationMetrics(json.load(fp)) + + with open(os.path.join(path, 'artifacts_metadata.json'), 'r') as fp: + artifacts_metadata = json.load(fp) + + artifacts = {} + + for artifact_name, meta in artifacts_metadata: + location = meta['location'] + ArtifactCls = load_class(meta['class_name']) + content = ArtifactCls.load_content_from_file(os.path.join(path, artifact_name)) + artifacts[artifact_name] = ArtifactCls(location=location, content=content) + + return EvaluationResult(metrics=metrics, artifacts=artifacts) def save(self, path): """Write the evaluation results to the specified local filesystem path""" - # We will likely avoid serializing artifacts themselves, just locations. - # Deserialization will resolve locations to artifact contents. - with open(path, 'w') as f: - pickle.dump(self, f) + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, 'metrics.json'), 'w') as fp: + json.dump(self.metrics, fp) + + artifacts_metadata = { + artifact_name: { + 'location': artifact.location, + 'class_name': _get_fully_qualified_class_name(artifact) + } + for artifact_name, artifact in self.artifacts.items() + } + with open(os.path.join(path, 'artifacts_metadata.json'), 'w') as fp: + json.dump(artifacts_metadata, fp) + + for artifact_name, artifact in self.artifacts.items(): + artifact.save_content_to_file(artifact.content, os.path.join(path, artifact_name)) @property def metrics(self) -> EvaluationMetrics: @@ -84,7 +112,7 @@ def metrics(self) -> EvaluationMetrics: return self._metrics @property - def artifacts(self) -> Dict[str, EvaluationArtifacts]: + def artifacts(self) -> Dict[str, EvaluationArtifact]: """ A dictionary mapping standardized artifact names (e.g. "roc_data") to artifact content and location information @@ -118,10 +146,11 @@ def __init__(self, data, labels=None, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (e.g. 
a delta table, parquet file) """ - self.name = name + self.user_specified_name = name self.data = data self.labels = labels self.path = path + self._hash = None @staticmethod def _gen_md5_for_arraylike_obj(md5_gen, data): @@ -132,28 +161,37 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): md5_gen.update(pickle.dumps(data[:EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:])) + @property + def name(self): + return self.user_specified_name if self.user_specified_name is not None else self.hash + @property def hash(self): """ Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, dataset size and feeding them through a cheap, low-collision hash function """ - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode('UTF-8')) - return md5_gen.digest() + if self._hash is not None: + return self._hash + else: + md5_gen = hashlib.md5() + if isinstance(self.data, np.ndarray): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) + elif isinstance(self.data, pd.DataFrame): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + md5_gen.update(self.labels.encode('UTF-8')) + return md5_gen.hexdigest() @property def metadata(self): - return { - 'name': self.name, + metadata = { 'hash': self.hash, 'path': self.path, } + if self.user_specified_name is not None: + metadata['name'] = self.user_specified_name + return metadata class GetOrCreateRunId: @@ -211,7 +249,7 @@ def compute_metrics(self, predict, dataset): def compute_and_log_artifacts(self, predict, dataset, run_id, mlflow_client): """ compute and log artifact, and return a dict of - artifact_name -> instance_of_EvaluationArtifacts + artifact_name -> instance_of_EvaluationArtifact """ raise NotImplementedError() @@ -236,14 +274,22 @@ def evaluate( with GetOrCreateRunId(run_id) as run_id: metrics_dict = self.compute_metrics(predict, dataset) timestamp = int(time.time() * 1000) - dataset_id = dataset.name if dataset.name is not None else dataset.hash - # TODO: log tags of dataset metadata + existing_dataset_metadata_str = client.get_run(run_id).data.tags.get('mlflow.datasets') + if existing_dataset_metadata_str is not None: + dataset_metadata_list = json.loads(existing_dataset_metadata_str) + else: + dataset_metadata_list = [] + dataset_metadata_list.append(dataset.metadata) + + dataset_metadata_str = json.dumps(dataset_metadata_list) + client.log_batch( run_id, metrics=[ - Metric(key=f'{key}_on_{dataset_id}', value=value, timestamp=timestamp, step=0) + Metric(key=f'{key}_on_{dataset.name}', value=value, timestamp=timestamp, step=0) for key, value in metrics_dict ], + tags=[RunTag('mlflow.datasets', dataset_metadata_str)] ) artifact_dict = self.compute_and_log_artifact(predict, dataset, run_id, client) return EvaluationResult(metrics_dict, artifact_dict) diff --git a/mlflow/utils/__init__.py b/mlflow/utils/__init__.py index 270637954e784..e6884a4801091 100644 --- a/mlflow/utils/__init__.py +++ b/mlflow/utils/__init__.py @@ -1,6 +1,7 @@ import logging from itertools import islice from sys import version_info +import importlib _logger = logging.getLogger(__name__) 
@@ -172,3 +173,11 @@ def _inspect_original_var_name(var, fallback_name): except Exception: return fallback_name + + +def load_class(kls): + parts = kls.split('.') + module = ".".join(parts[:-1]) + m = importlib.import_module(module) + name = parts[-1] + return getattr(m, name) From 0e54b978b2c611377a52fa187761ae84843e32e2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 18 Nov 2021 22:40:37 +0800 Subject: [PATCH 011/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 35 +++--- .../mlflow_test_plugin/dummy_evaluator.py | 111 ++++++------------ 2 files changed, 57 insertions(+), 89 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index 8213e4cdd80a0..448b0d2a0a664 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -27,7 +27,7 @@ def __init__(self, location, content=None): self._location = location @classmethod - def load_content_from_file(self, local_artifact_path): + def load_content_from_file(cls, local_artifact_path): raise NotImplementedError() def save_content_to_file(self, content, output_artifact_path): @@ -240,21 +240,19 @@ def can_evaluate( """ raise NotImplementedError() - def compute_metrics(self, predict, dataset): + def compute_metrics_and_compute_and_log_artifacts( + self, model_type, predict, dataset, evaluator_config, run_id + ): """ - return an instance of EvaluationMetrics - """ - raise NotImplementedError() - - def compute_and_log_artifacts(self, predict, dataset, run_id, mlflow_client): - """ - compute and log artifact, and return a dict of - artifact_name -> instance_of_EvaluationArtifact + return an tuple of: + - an instance of EvaluationMetrics + - a dict of artifact_name -> instance_of_EvaluationArtifact + and log artifacts into run specified by run_id """ raise NotImplementedError() def evaluate( - self, predict, dataset, run_id=None, evaluator_config=None, **kwargs + self, model_type, predict, dataset, run_id=None, evaluator_config=None, **kwargs ) -> EvaluationResult: """ :param predict: A function used to compute model predictions. Predict @@ -271,18 +269,27 @@ def evaluate( :return: An `EvaluationResult` instance containing evaluation results. 
""" client = mlflow.tracking.MlflowClient() + self.mlflow_client = client + with GetOrCreateRunId(run_id) as run_id: - metrics_dict = self.compute_metrics(predict, dataset) timestamp = int(time.time() * 1000) existing_dataset_metadata_str = client.get_run(run_id).data.tags.get('mlflow.datasets') if existing_dataset_metadata_str is not None: dataset_metadata_list = json.loads(existing_dataset_metadata_str) else: dataset_metadata_list = [] + + for metadata in dataset_metadata_list: + if metadata['hash'] == dataset.hash: + raise ValueError(f'The dataset {dataset.name} evaluation results has been ' + 'logged.') dataset_metadata_list.append(dataset.metadata) dataset_metadata_str = json.dumps(dataset_metadata_list) + metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( + model_type, predict, dataset, run_id, client + ) client.log_batch( run_id, metrics=[ @@ -291,8 +298,8 @@ def evaluate( ], tags=[RunTag('mlflow.datasets', dataset_metadata_str)] ) - artifact_dict = self.compute_and_log_artifact(predict, dataset, run_id, client) - return EvaluationResult(metrics_dict, artifact_dict) + + return EvaluationResult(metrics_dict, artifacts_dict) class ModelEvaluatorRegistry: diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 46062c33a24df..7a047510d8f27 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,55 +1,22 @@ import mlflow from mlflow.evaluation import ModelEvaluator, EvaluationMetrics, \ EvaluationArtifact, EvaluationResult, EvaluationDataset -from mlflow.tracking import artifact_utils +from mlflow.tracking.artifact_utils import get_artifact_uri from sklearn import metrics as sk_metrics import numpy as np -import pickle +import json -class DummyEvaluationArtifact(EvaluationArtifact): +class JsonEvaluationArtifact(EvaluationArtifact): - def __init__(self, content, location): - self._content = content - self._location = location - - @property - def content(self): - return self._content - - @property - def location(self) -> str: - return self._location - - -class DummyEvaluationResult(EvaluationResult): - - def __init__(self, metric_values, artifact_content, artifact_location): - self.metric_values = metric_values - self.artifact_content = artifact_content - self.artifact_location = artifact_location + def save_content_to_file(self, content, output_artifact_path): + with open(output_artifact_path, 'w') as fp: + json.dump(content, fp) @classmethod - def load(cls, path): - with open(path, 'r') as f: - obj = pickle.load(f) - return obj - - def save(self, path): - with open(path, 'w') as f: - # TODO: skip dump artifact, instead, download artifact content when loading - pickle.dump(self, f) - - @property - def metrics(self): - return self.metric_values - - @property - def artifacts(self): - return DummyEvaluationArtifact( - content=self.artifact_content, - location=self.artifact_location - ) + def load_content_from_file(cls, local_artifact_path): + with open(local_artifact_path, 'r') as fp: + return json.load(fp) class DummyEvaluator(ModelEvaluator): @@ -60,41 +27,35 @@ def can_evaluate( return evaluator_config.get('can_evaluate') and \ model_type in ['classifier', 'regressor'] - def _evaluate(self, predict, dataset, run_id, evaluator_config): + def compute_metrics_and_compute_and_log_artifacts( + self, model_type, predict, dataset, 
evaluator_config, run_id + ): X = dataset.data assert isinstance(X, np.ndarray), 'Only support array type feature input' assert dataset.name is not None, 'Dataset name required' y = dataset.labels y_pred = predict(X) - - metrics_to_calc = evaluator_config.get('metrics_to_calc') - - client = mlflow.tracking.MlflowClient() - metric_values = {} - for metric_name in metrics_to_calc: - metric_value = getattr(sk_metrics, metric_name)(y, y_pred) - metric_values[metric_name] = metric_value - metric_key = f'{metric_name}_on_{dataset.name}' - client.log_metric(run_id=run_id, key=metric_key, value=metric_value) - - client.log_dict(run_id, metric_values, 'metrics_artifact.json') - - # TODO: log `mlflow.datasets` tag containing a list of metadata for all datasets - - return DummyEvaluationResult( - metric_values=metric_values, - artifact_content=metric_values, - artifact_location=artifact_utils.get_artifact_uri(run_id, 'metrics_artifact.json') - ) - - def evaluate( - self, predict, dataset, run_id, evaluator_config=None - ): - if run_id is not None: - return self._evaluate(predict, dataset, run_id, evaluator_config) - elif mlflow.active_run() is not None: - return self._evaluate(predict, dataset, mlflow.active_run().info.run_id, - evaluator_config) - else: - with mlflow.start_run() as run: - return self._evaluate(predict, dataset, run.info.run_id, evaluator_config) + if model_type == 'classifier': + accuracy_score = sk_metrics.accuracy_score(y, y_pred) + brier_score_loss = sk_metrics.brier_score_loss(y, y_pred) + + metrics = EvaluationMetrics( + accuracy_score=accuracy_score, + brier_score_loss=brier_score_loss + ) + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + confusion_matrix_artifact_name = f'confusion_matrix_on_{dataset.name}.json' + confusion_matrix_artifact = JsonEvaluationArtifact( + location=get_artifact_uri(run_id, confusion_matrix_artifact_name), + content=confusion_matrix + ) + self.mlflow_client.log_dict(run_id, confusion_matrix) + artifacts = {confusion_matrix_artifact_name: confusion_matrix_artifact} + return metrics, artifacts + elif model_type == 'regressor': + mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) + mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) + return EvaluationMetrics( + mean_absolute_error=mean_absolute_error, + mean_squared_error=mean_squared_error + ), {} From 6ee436723b6122668048246671b6f284438d3387 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 19 Nov 2021 10:22:51 +0800 Subject: [PATCH 012/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 12 ++++++++---- .../mlflow_test_plugin/dummy_evaluator.py | 2 ++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index 448b0d2a0a664..d1b46fb7410f5 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -279,11 +279,15 @@ def evaluate( else: dataset_metadata_list = [] + metadata_exists = False for metadata in dataset_metadata_list: - if metadata['hash'] == dataset.hash: - raise ValueError(f'The dataset {dataset.name} evaluation results has been ' - 'logged.') - dataset_metadata_list.append(dataset.metadata) + if metadata['hash'] == dataset.hash and \ + metadata['name'] == dataset.user_specified_name: + metadata_exists = True + break + + if not metadata_exists: + dataset_metadata_list.append(dataset.metadata) dataset_metadata_str = json.dumps(dataset_metadata_list) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py 
b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 7a047510d8f27..3c39a4ce5b136 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -59,3 +59,5 @@ def compute_metrics_and_compute_and_log_artifacts( mean_absolute_error=mean_absolute_error, mean_squared_error=mean_squared_error ), {} + else: + raise ValueError(f'Unsupported model type {model_type}') From 15712e0ec572a1977f8d1da154acfba360d24ad4 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 19 Nov 2021 18:41:45 +0800 Subject: [PATCH 013/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 67 +- mlflow/utils/__init__.py | 2 +- tests/protos/test_message_pb2.py | 1350 +++++++++++------ .../mlflow_test_plugin/dummy_evaluator.py | 75 +- tests/test_evaluation.py | 97 +- 5 files changed, 1010 insertions(+), 581 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index d1b46fb7410f5..b863e9efdb193 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -21,7 +21,6 @@ class EvaluationMetrics(dict): class EvaluationArtifact: - def __init__(self, location, content=None): self._content = content self._location = location @@ -30,7 +29,8 @@ def __init__(self, location, content=None): def load_content_from_file(cls, local_artifact_path): raise NotImplementedError() - def save_content_to_file(self, content, output_artifact_path): + @classmethod + def save_content_to_file(cls, content, output_artifact_path): raise NotImplementedError() @property @@ -40,7 +40,7 @@ def content(self): """ if self._content is None: with TempDir() as temp_dir: - local_artifact_file = temp_dir.path('local_artifact') + local_artifact_file = temp_dir.path("local_artifact") _download_artifact_from_uri(self._location, local_artifact_file) self._content = self.load_content_from_file(local_artifact_file) @@ -56,12 +56,11 @@ def location(self) -> str: def __getstate__(self, state): state = state.__dict__.copy() # skip pickling artifact content - del state['_content'] + del state["_content"] return state class EvaluationResult: - def __init__(self, metrics, artifacts): self._metrics = metrics self._artifacts = artifacts @@ -69,17 +68,17 @@ def __init__(self, metrics, artifacts): @classmethod def load(cls, path): """Load the evaluation results from the specified local filesystem path""" - with open(os.path.join(path, 'metrics.json'), 'r') as fp: + with open(os.path.join(path, "metrics.json"), "r") as fp: metrics = EvaluationMetrics(json.load(fp)) - with open(os.path.join(path, 'artifacts_metadata.json'), 'r') as fp: + with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: artifacts_metadata = json.load(fp) artifacts = {} for artifact_name, meta in artifacts_metadata: - location = meta['location'] - ArtifactCls = load_class(meta['class_name']) + location = meta["location"] + ArtifactCls = load_class(meta["class_name"]) content = ArtifactCls.load_content_from_file(os.path.join(path, artifact_name)) artifacts[artifact_name] = ArtifactCls(location=location, content=content) @@ -88,17 +87,17 @@ def load(cls, path): def save(self, path): """Write the evaluation results to the specified local filesystem path""" os.makedirs(path, exist_ok=True) - with open(os.path.join(path, 'metrics.json'), 'w') as fp: + with open(os.path.join(path, "metrics.json"), "w") as fp: json.dump(self.metrics, fp) artifacts_metadata = { artifact_name: { - 'location': artifact.location, - 'class_name': 
_get_fully_qualified_class_name(artifact) + "location": artifact.location, + "class_name": _get_fully_qualified_class_name(artifact), } for artifact_name, artifact in self.artifacts.items() } - with open(os.path.join(path, 'artifacts_metadata.json'), 'w') as fp: + with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: json.dump(artifacts_metadata, fp) for artifact_name, artifact in self.artifacts.items(): @@ -158,8 +157,8 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(pickle.dumps(data)) else: - md5_gen.update(pickle.dumps(data[:EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) - md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:])) + md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) + md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) @property def name(self): @@ -180,17 +179,17 @@ def hash(self): EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) elif isinstance(self.data, pd.DataFrame): EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode('UTF-8')) + md5_gen.update(self.labels.encode("UTF-8")) return md5_gen.hexdigest() @property def metadata(self): metadata = { - 'hash': self.hash, - 'path': self.path, + "hash": self.hash, + "path": self.path, } if self.user_specified_name is not None: - metadata['name'] = self.user_specified_name + metadata["name"] = self.user_specified_name return metadata @@ -201,6 +200,7 @@ class GetOrCreateRunId: otherwise if there's an active run, use it otherwise create a managed run. """ + def __init__(self, run_id): self.managed_run_context = None if run_id is not None: @@ -223,10 +223,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class ModelEvaluator: - - def can_evaluate( - self, model_type, evaluator_config=None, **kwargs - ) -> bool: + def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). 
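The hashing introduced above is worth calling out: once a dataset holds at least twice NUM_SAMPLE_ROWS_FOR_HASH rows, only its head and tail samples are pickled into the MD5 digest, so fingerprinting stays cheap while still catching most dataset changes. A minimal standalone sketch of that sampling scheme, paraphrasing the helper shown in this hunk rather than calling into mlflow:

    import hashlib
    import pickle

    import numpy as np

    NUM_SAMPLE_ROWS_FOR_HASH = 5  # mirrors EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH


    def gen_md5_for_arraylike(md5_gen, data):
        # Small datasets are hashed in full; large ones contribute only a
        # head/tail sample, keeping the hash cheap for big evaluation sets.
        if len(data) < NUM_SAMPLE_ROWS_FOR_HASH * 2:
            md5_gen.update(pickle.dumps(data))
        else:
            md5_gen.update(pickle.dumps(data[:NUM_SAMPLE_ROWS_FOR_HASH]))
            md5_gen.update(pickle.dumps(data[-NUM_SAMPLE_ROWS_FOR_HASH:]))


    md5_gen = hashlib.md5()
    gen_md5_for_arraylike(md5_gen, np.arange(1000))
    print(md5_gen.hexdigest())  # fingerprint of the kind stored in the dataset metadata above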
@@ -241,7 +238,7 @@ def can_evaluate( raise NotImplementedError() def compute_metrics_and_compute_and_log_artifacts( - self, model_type, predict, dataset, evaluator_config, run_id + self, model_type, predict, dataset, evaluator_config, run_id ): """ return an tuple of: @@ -273,7 +270,7 @@ def evaluate( with GetOrCreateRunId(run_id) as run_id: timestamp = int(time.time() * 1000) - existing_dataset_metadata_str = client.get_run(run_id).data.tags.get('mlflow.datasets') + existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") if existing_dataset_metadata_str is not None: dataset_metadata_list = json.loads(existing_dataset_metadata_str) else: @@ -281,8 +278,10 @@ def evaluate( metadata_exists = False for metadata in dataset_metadata_list: - if metadata['hash'] == dataset.hash and \ - metadata['name'] == dataset.user_specified_name: + if ( + metadata["hash"] == dataset.hash + and metadata["name"] == dataset.user_specified_name + ): metadata_exists = True break @@ -292,15 +291,15 @@ def evaluate( dataset_metadata_str = json.dumps(dataset_metadata_list) metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model_type, predict, dataset, run_id, client + model_type, predict, dataset, evaluator_config, run_id ) client.log_batch( run_id, metrics=[ - Metric(key=f'{key}_on_{dataset.name}', value=value, timestamp=timestamp, step=0) - for key, value in metrics_dict + Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics_dict.items() ], - tags=[RunTag('mlflow.datasets', dataset_metadata_str)] + tags=[RunTag("mlflow.datasets", dataset_metadata_str)], ) return EvaluationResult(metrics_dict, artifacts_dict) @@ -351,7 +350,7 @@ def get_evaluator(self, evaluator_name): def evaluate( - model, model_type, dataset, run_id=None, evaluators=None, evaluator_config=None + model, model_type, dataset, run_id=None, evaluators=None, evaluator_config=None ) -> Union[EvaluationResult, Dict[str, EvaluationResult]]: """ :param model: A model supported by the specified `evaluator`, or a URI @@ -380,7 +379,7 @@ def evaluate( :return: An `EvaluationResult` instance containing evaluation results. 
""" if evaluators is None: - evaluators = 'default_evaluator' + evaluators = "default_evaluator" if not isinstance(evaluators, list): evaluators = [evaluators] @@ -400,7 +399,7 @@ def evaluate( continue if evaluator.can_evaluate(model_type, config): - result = evaluator.evaluate(predict, dataset, run_id, config) + result = evaluator.evaluate(model_type, predict, dataset, run_id, config) eval_results.append(result) merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) diff --git a/mlflow/utils/__init__.py b/mlflow/utils/__init__.py index e6884a4801091..2bb183e281fd4 100644 --- a/mlflow/utils/__init__.py +++ b/mlflow/utils/__init__.py @@ -176,7 +176,7 @@ def _inspect_original_var_name(var, fallback_name): def load_class(kls): - parts = kls.split('.') + parts = kls.split(".") module = ".".join(parts[:-1]) m = importlib.import_module(module) name = parts[-1] diff --git a/tests/protos/test_message_pb2.py b/tests/protos/test_message_pb2.py index b2e4840487095..f8cf37f752c24 100644 --- a/tests/protos/test_message_pb2.py +++ b/tests/protos/test_message_pb2.py @@ -2,529 +2,913 @@ # source: test_message.proto import sys -_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) + +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() - - DESCRIPTOR = _descriptor.FileDescriptor( - name='test_message.proto', - package='mlflow', - syntax='proto2', - serialized_options=None, - serialized_pb=_b('\n\x12test_message.proto\x12\x06mlflow\"\x9d\t\n\x0bTestMessage\x12\x13\n\x0b\x66ield_int32\x18\x01 \x01(\x05\x12\x13\n\x0b\x66ield_int64\x18\x02 \x01(\x03\x12\x14\n\x0c\x66ield_uint32\x18\x03 \x01(\r\x12\x14\n\x0c\x66ield_uint64\x18\x04 \x01(\x04\x12\x14\n\x0c\x66ield_sint32\x18\x05 \x01(\x11\x12\x14\n\x0c\x66ield_sint64\x18\x06 \x01(\x12\x12\x15\n\rfield_fixed32\x18\x07 \x01(\x07\x12\x15\n\rfield_fixed64\x18\x08 \x01(\x06\x12\x16\n\x0e\x66ield_sfixed32\x18\t \x01(\x0f\x12\x16\n\x0e\x66ield_sfixed64\x18\n \x01(\x10\x12\x12\n\nfield_bool\x18\x0b \x01(\x08\x12\x14\n\x0c\x66ield_string\x18\x0c \x01(\t\x12 \n\x13\x66ield_with_default1\x18\r \x01(\x03:\x03\x31\x30\x30\x12 \n\x13\x66ield_with_default2\x18\x0e \x01(\x03:\x03\x32\x30\x30\x12\x1c\n\x14\x66ield_repeated_int64\x18\x0f \x03(\x03\x12\x30\n\nfield_enum\x18\x10 \x01(\x0e\x32\x1c.mlflow.TestMessage.TestEnum\x12\x41\n\x13\x66ield_inner_message\x18\x11 \x03(\x0b\x32$.mlflow.TestMessage.TestInnerMessage\x12\x10\n\x06oneof1\x18\x12 \x01(\x03H\x00\x12\x10\n\x06oneof2\x18\x13 \x01(\x03H\x00\x12\x36\n\nfield_map1\x18\x14 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap1Entry\x12\x36\n\nfield_map2\x18\x15 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap2Entry\x12\x36\n\nfield_map3\x18\x16 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap3Entry\x12\x36\n\nfield_map4\x18\x17 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap4Entry\x1am\n\x10TestInnerMessage\x12\x19\n\x11\x66ield_inner_int64\x18\x01 \x01(\x03\x12\"\n\x1a\x66ield_inner_repeated_int64\x18\x02 \x03(\x03\x12\x1a\n\x12\x66ield_inner_string\x18\x03 \x01(\t\x1a\x30\n\x0e\x46ieldMap1Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap2Entry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\x03:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap3Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1aV\n\x0e\x46ieldMap4Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x33\n\x05value\x18\x02 \x01(\x0b\x32$.mlflow.TestMessage.TestInnerMessage:\x02\x38\x01\"6\n\x08TestEnum\x12\x08\n\x04NONE\x10\x00\x12\x0f\n\x0b\x45NUM_VALUE1\x10\x01\x12\x0f\n\x0b\x45NUM_VALUE2\x10\x02*\x06\x08\xe8\x07\x10\xd0\x0f\x42\x0c\n\ntest_oneof\"F\n\x10\x45xtensionMessage22\n\x14\x66ield_extended_int64\x12\x13.mlflow.TestMessage\x18\xe9\x07 \x01(\x03') + name="test_message.proto", + package="mlflow", + syntax="proto2", + serialized_options=None, + serialized_pb=_b( + '\n\x12test_message.proto\x12\x06mlflow"\x9d\t\n\x0bTestMessage\x12\x13\n\x0b\x66ield_int32\x18\x01 \x01(\x05\x12\x13\n\x0b\x66ield_int64\x18\x02 \x01(\x03\x12\x14\n\x0c\x66ield_uint32\x18\x03 \x01(\r\x12\x14\n\x0c\x66ield_uint64\x18\x04 \x01(\x04\x12\x14\n\x0c\x66ield_sint32\x18\x05 \x01(\x11\x12\x14\n\x0c\x66ield_sint64\x18\x06 \x01(\x12\x12\x15\n\rfield_fixed32\x18\x07 \x01(\x07\x12\x15\n\rfield_fixed64\x18\x08 \x01(\x06\x12\x16\n\x0e\x66ield_sfixed32\x18\t \x01(\x0f\x12\x16\n\x0e\x66ield_sfixed64\x18\n \x01(\x10\x12\x12\n\nfield_bool\x18\x0b \x01(\x08\x12\x14\n\x0c\x66ield_string\x18\x0c \x01(\t\x12 \n\x13\x66ield_with_default1\x18\r \x01(\x03:\x03\x31\x30\x30\x12 \n\x13\x66ield_with_default2\x18\x0e \x01(\x03:\x03\x32\x30\x30\x12\x1c\n\x14\x66ield_repeated_int64\x18\x0f \x03(\x03\x12\x30\n\nfield_enum\x18\x10 \x01(\x0e\x32\x1c.mlflow.TestMessage.TestEnum\x12\x41\n\x13\x66ield_inner_message\x18\x11 \x03(\x0b\x32$.mlflow.TestMessage.TestInnerMessage\x12\x10\n\x06oneof1\x18\x12 \x01(\x03H\x00\x12\x10\n\x06oneof2\x18\x13 \x01(\x03H\x00\x12\x36\n\nfield_map1\x18\x14 \x03(\x0b\x32".mlflow.TestMessage.FieldMap1Entry\x12\x36\n\nfield_map2\x18\x15 \x03(\x0b\x32".mlflow.TestMessage.FieldMap2Entry\x12\x36\n\nfield_map3\x18\x16 \x03(\x0b\x32".mlflow.TestMessage.FieldMap3Entry\x12\x36\n\nfield_map4\x18\x17 \x03(\x0b\x32".mlflow.TestMessage.FieldMap4Entry\x1am\n\x10TestInnerMessage\x12\x19\n\x11\x66ield_inner_int64\x18\x01 \x01(\x03\x12"\n\x1a\x66ield_inner_repeated_int64\x18\x02 \x03(\x03\x12\x1a\n\x12\x66ield_inner_string\x18\x03 \x01(\t\x1a\x30\n\x0e\x46ieldMap1Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap2Entry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap3Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1aV\n\x0e\x46ieldMap4Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x33\n\x05value\x18\x02 \x01(\x0b\x32$.mlflow.TestMessage.TestInnerMessage:\x02\x38\x01"6\n\x08TestEnum\x12\x08\n\x04NONE\x10\x00\x12\x0f\n\x0b\x45NUM_VALUE1\x10\x01\x12\x0f\n\x0b\x45NUM_VALUE2\x10\x02*\x06\x08\xe8\x07\x10\xd0\x0f\x42\x0c\n\ntest_oneof"F\n\x10\x45xtensionMessage22\n\x14\x66ield_extended_int64\x12\x13.mlflow.TestMessage\x18\xe9\x07 \x01(\x03' + ), ) - _TESTMESSAGE_TESTENUM = _descriptor.EnumDescriptor( - name='TestEnum', - full_name='mlflow.TestMessage.TestEnum', - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name='NONE', index=0, number=0, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='ENUM_VALUE1', index=1, number=1, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='ENUM_VALUE2', index=2, number=2, - serialized_options=None, - type=None), - ], - containing_type=None, - 
serialized_options=None, - serialized_start=1136, - serialized_end=1190, + name="TestEnum", + full_name="mlflow.TestMessage.TestEnum", + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name="NONE", index=0, number=0, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="ENUM_VALUE1", index=1, number=1, serialized_options=None, type=None + ), + _descriptor.EnumValueDescriptor( + name="ENUM_VALUE2", index=2, number=2, serialized_options=None, type=None + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=1136, + serialized_end=1190, ) _sym_db.RegisterEnumDescriptor(_TESTMESSAGE_TESTENUM) _TESTMESSAGE_TESTINNERMESSAGE = _descriptor.Descriptor( - name='TestInnerMessage', - full_name='mlflow.TestMessage.TestInnerMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='field_inner_int64', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_int64', index=0, - number=1, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_inner_repeated_int64', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_repeated_int64', index=1, - number=2, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_inner_string', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_string', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=787, - serialized_end=896, + name="TestInnerMessage", + full_name="mlflow.TestMessage.TestInnerMessage", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="field_inner_int64", + full_name="mlflow.TestMessage.TestInnerMessage.field_inner_int64", + index=0, + number=1, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_inner_repeated_int64", + full_name="mlflow.TestMessage.TestInnerMessage.field_inner_repeated_int64", + index=1, + number=2, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_inner_string", + full_name="mlflow.TestMessage.TestInnerMessage.field_inner_string", + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + 
extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=787, + serialized_end=896, ) _TESTMESSAGE_FIELDMAP1ENTRY = _descriptor.Descriptor( - name='FieldMap1Entry', - full_name='mlflow.TestMessage.FieldMap1Entry', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='mlflow.TestMessage.FieldMap1Entry.key', index=0, - number=1, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='mlflow.TestMessage.FieldMap1Entry.value', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=_b('8\001'), - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=898, - serialized_end=946, + name="FieldMap1Entry", + full_name="mlflow.TestMessage.FieldMap1Entry", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="mlflow.TestMessage.FieldMap1Entry.key", + index=0, + number=1, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="mlflow.TestMessage.FieldMap1Entry.value", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=_b("8\001"), + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=898, + serialized_end=946, ) _TESTMESSAGE_FIELDMAP2ENTRY = _descriptor.Descriptor( - name='FieldMap2Entry', - full_name='mlflow.TestMessage.FieldMap2Entry', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='mlflow.TestMessage.FieldMap2Entry.key', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='mlflow.TestMessage.FieldMap2Entry.value', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=_b('8\001'), - is_extendable=False, - 
syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=948, - serialized_end=996, + name="FieldMap2Entry", + full_name="mlflow.TestMessage.FieldMap2Entry", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="mlflow.TestMessage.FieldMap2Entry.key", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="mlflow.TestMessage.FieldMap2Entry.value", + index=1, + number=2, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=_b("8\001"), + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=948, + serialized_end=996, ) _TESTMESSAGE_FIELDMAP3ENTRY = _descriptor.Descriptor( - name='FieldMap3Entry', - full_name='mlflow.TestMessage.FieldMap3Entry', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='mlflow.TestMessage.FieldMap3Entry.key', index=0, - number=1, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='mlflow.TestMessage.FieldMap3Entry.value', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=_b('8\001'), - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=998, - serialized_end=1046, + name="FieldMap3Entry", + full_name="mlflow.TestMessage.FieldMap3Entry", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="mlflow.TestMessage.FieldMap3Entry.key", + index=0, + number=1, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="mlflow.TestMessage.FieldMap3Entry.value", + index=1, + number=2, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=_b("8\001"), + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=998, + serialized_end=1046, ) _TESTMESSAGE_FIELDMAP4ENTRY = _descriptor.Descriptor( - name='FieldMap4Entry', - full_name='mlflow.TestMessage.FieldMap4Entry', - 
filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='key', full_name='mlflow.TestMessage.FieldMap4Entry.key', index=0, - number=1, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='value', full_name='mlflow.TestMessage.FieldMap4Entry.value', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=_b('8\001'), - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1048, - serialized_end=1134, + name="FieldMap4Entry", + full_name="mlflow.TestMessage.FieldMap4Entry", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="key", + full_name="mlflow.TestMessage.FieldMap4Entry.key", + index=0, + number=1, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="value", + full_name="mlflow.TestMessage.FieldMap4Entry.value", + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=_b("8\001"), + is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1048, + serialized_end=1134, ) _TESTMESSAGE = _descriptor.Descriptor( - name='TestMessage', - full_name='mlflow.TestMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name='field_int32', full_name='mlflow.TestMessage.field_int32', index=0, - number=1, type=5, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_int64', full_name='mlflow.TestMessage.field_int64', index=1, - number=2, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_uint32', full_name='mlflow.TestMessage.field_uint32', index=2, - number=3, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_uint64', full_name='mlflow.TestMessage.field_uint64', index=3, - number=4, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, 
extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_sint32', full_name='mlflow.TestMessage.field_sint32', index=4, - number=5, type=17, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_sint64', full_name='mlflow.TestMessage.field_sint64', index=5, - number=6, type=18, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_fixed32', full_name='mlflow.TestMessage.field_fixed32', index=6, - number=7, type=7, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_fixed64', full_name='mlflow.TestMessage.field_fixed64', index=7, - number=8, type=6, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_sfixed32', full_name='mlflow.TestMessage.field_sfixed32', index=8, - number=9, type=15, cpp_type=1, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_sfixed64', full_name='mlflow.TestMessage.field_sfixed64', index=9, - number=10, type=16, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_bool', full_name='mlflow.TestMessage.field_bool', index=10, - number=11, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_string', full_name='mlflow.TestMessage.field_string', index=11, - number=12, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=_b("").decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_with_default1', full_name='mlflow.TestMessage.field_with_default1', index=12, - number=13, type=3, cpp_type=2, label=1, - has_default_value=True, default_value=100, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_with_default2', full_name='mlflow.TestMessage.field_with_default2', index=13, - number=14, type=3, cpp_type=2, label=1, - has_default_value=True, default_value=200, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - 
_descriptor.FieldDescriptor( - name='field_repeated_int64', full_name='mlflow.TestMessage.field_repeated_int64', index=14, - number=15, type=3, cpp_type=2, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_enum', full_name='mlflow.TestMessage.field_enum', index=15, - number=16, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_inner_message', full_name='mlflow.TestMessage.field_inner_message', index=16, - number=17, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oneof1', full_name='mlflow.TestMessage.oneof1', index=17, - number=18, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='oneof2', full_name='mlflow.TestMessage.oneof2', index=18, - number=19, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_map1', full_name='mlflow.TestMessage.field_map1', index=19, - number=20, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_map2', full_name='mlflow.TestMessage.field_map2', index=20, - number=21, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_map3', full_name='mlflow.TestMessage.field_map3', index=21, - number=22, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - _descriptor.FieldDescriptor( - name='field_map4', full_name='mlflow.TestMessage.field_map4', index=22, - number=23, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - extensions=[ - ], - nested_types=[_TESTMESSAGE_TESTINNERMESSAGE, _TESTMESSAGE_FIELDMAP1ENTRY, _TESTMESSAGE_FIELDMAP2ENTRY, _TESTMESSAGE_FIELDMAP3ENTRY, _TESTMESSAGE_FIELDMAP4ENTRY, ], - enum_types=[ - _TESTMESSAGE_TESTENUM, - ], - serialized_options=None, - is_extendable=True, - syntax='proto2', - extension_ranges=[(1000, 2000), ], - oneofs=[ - _descriptor.OneofDescriptor( - name='test_oneof', full_name='mlflow.TestMessage.test_oneof', - index=0, 
containing_type=None, fields=[]), - ], - serialized_start=31, - serialized_end=1212, + name="TestMessage", + full_name="mlflow.TestMessage", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name="field_int32", + full_name="mlflow.TestMessage.field_int32", + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_int64", + full_name="mlflow.TestMessage.field_int64", + index=1, + number=2, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_uint32", + full_name="mlflow.TestMessage.field_uint32", + index=2, + number=3, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_uint64", + full_name="mlflow.TestMessage.field_uint64", + index=3, + number=4, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_sint32", + full_name="mlflow.TestMessage.field_sint32", + index=4, + number=5, + type=17, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_sint64", + full_name="mlflow.TestMessage.field_sint64", + index=5, + number=6, + type=18, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_fixed32", + full_name="mlflow.TestMessage.field_fixed32", + index=6, + number=7, + type=7, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_fixed64", + full_name="mlflow.TestMessage.field_fixed64", + index=7, + number=8, + type=6, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_sfixed32", + full_name="mlflow.TestMessage.field_sfixed32", + index=8, + number=9, + type=15, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + 
name="field_sfixed64", + full_name="mlflow.TestMessage.field_sfixed64", + index=9, + number=10, + type=16, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_bool", + full_name="mlflow.TestMessage.field_bool", + index=10, + number=11, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_string", + full_name="mlflow.TestMessage.field_string", + index=11, + number=12, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_with_default1", + full_name="mlflow.TestMessage.field_with_default1", + index=12, + number=13, + type=3, + cpp_type=2, + label=1, + has_default_value=True, + default_value=100, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_with_default2", + full_name="mlflow.TestMessage.field_with_default2", + index=13, + number=14, + type=3, + cpp_type=2, + label=1, + has_default_value=True, + default_value=200, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_repeated_int64", + full_name="mlflow.TestMessage.field_repeated_int64", + index=14, + number=15, + type=3, + cpp_type=2, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_enum", + full_name="mlflow.TestMessage.field_enum", + index=15, + number=16, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_inner_message", + full_name="mlflow.TestMessage.field_inner_message", + index=16, + number=17, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oneof1", + full_name="mlflow.TestMessage.oneof1", + index=17, + number=18, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="oneof2", + full_name="mlflow.TestMessage.oneof2", + index=18, + number=19, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + 
message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_map1", + full_name="mlflow.TestMessage.field_map1", + index=19, + number=20, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_map2", + full_name="mlflow.TestMessage.field_map2", + index=20, + number=21, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_map3", + full_name="mlflow.TestMessage.field_map3", + index=21, + number=22, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + _descriptor.FieldDescriptor( + name="field_map4", + full_name="mlflow.TestMessage.field_map4", + index=22, + number=23, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + extensions=[], + nested_types=[ + _TESTMESSAGE_TESTINNERMESSAGE, + _TESTMESSAGE_FIELDMAP1ENTRY, + _TESTMESSAGE_FIELDMAP2ENTRY, + _TESTMESSAGE_FIELDMAP3ENTRY, + _TESTMESSAGE_FIELDMAP4ENTRY, + ], + enum_types=[_TESTMESSAGE_TESTENUM,], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[(1000, 2000),], + oneofs=[ + _descriptor.OneofDescriptor( + name="test_oneof", + full_name="mlflow.TestMessage.test_oneof", + index=0, + containing_type=None, + fields=[], + ), + ], + serialized_start=31, + serialized_end=1212, ) _EXTENSIONMESSAGE = _descriptor.Descriptor( - name='ExtensionMessage', - full_name='mlflow.ExtensionMessage', - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - ], - extensions=[ - _descriptor.FieldDescriptor( - name='field_extended_int64', full_name='mlflow.ExtensionMessage.field_extended_int64', index=0, - number=1001, type=3, cpp_type=2, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=True, extension_scope=None, - serialized_options=None, file=DESCRIPTOR), - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1214, - serialized_end=1284, + name="ExtensionMessage", + full_name="mlflow.ExtensionMessage", + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[], + extensions=[ + _descriptor.FieldDescriptor( + name="field_extended_int64", + full_name="mlflow.ExtensionMessage.field_extended_int64", + index=0, + number=1001, + type=3, + cpp_type=2, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=True, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + ), + ], + nested_types=[], + enum_types=[], + serialized_options=None, 
+ is_extendable=False, + syntax="proto2", + extension_ranges=[], + oneofs=[], + serialized_start=1214, + serialized_end=1284, ) _TESTMESSAGE_TESTINNERMESSAGE.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP1ENTRY.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP2ENTRY.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP3ENTRY.containing_type = _TESTMESSAGE -_TESTMESSAGE_FIELDMAP4ENTRY.fields_by_name['value'].message_type = _TESTMESSAGE_TESTINNERMESSAGE +_TESTMESSAGE_FIELDMAP4ENTRY.fields_by_name["value"].message_type = _TESTMESSAGE_TESTINNERMESSAGE _TESTMESSAGE_FIELDMAP4ENTRY.containing_type = _TESTMESSAGE -_TESTMESSAGE.fields_by_name['field_enum'].enum_type = _TESTMESSAGE_TESTENUM -_TESTMESSAGE.fields_by_name['field_inner_message'].message_type = _TESTMESSAGE_TESTINNERMESSAGE -_TESTMESSAGE.fields_by_name['field_map1'].message_type = _TESTMESSAGE_FIELDMAP1ENTRY -_TESTMESSAGE.fields_by_name['field_map2'].message_type = _TESTMESSAGE_FIELDMAP2ENTRY -_TESTMESSAGE.fields_by_name['field_map3'].message_type = _TESTMESSAGE_FIELDMAP3ENTRY -_TESTMESSAGE.fields_by_name['field_map4'].message_type = _TESTMESSAGE_FIELDMAP4ENTRY +_TESTMESSAGE.fields_by_name["field_enum"].enum_type = _TESTMESSAGE_TESTENUM +_TESTMESSAGE.fields_by_name["field_inner_message"].message_type = _TESTMESSAGE_TESTINNERMESSAGE +_TESTMESSAGE.fields_by_name["field_map1"].message_type = _TESTMESSAGE_FIELDMAP1ENTRY +_TESTMESSAGE.fields_by_name["field_map2"].message_type = _TESTMESSAGE_FIELDMAP2ENTRY +_TESTMESSAGE.fields_by_name["field_map3"].message_type = _TESTMESSAGE_FIELDMAP3ENTRY +_TESTMESSAGE.fields_by_name["field_map4"].message_type = _TESTMESSAGE_FIELDMAP4ENTRY _TESTMESSAGE_TESTENUM.containing_type = _TESTMESSAGE -_TESTMESSAGE.oneofs_by_name['test_oneof'].fields.append( - _TESTMESSAGE.fields_by_name['oneof1']) -_TESTMESSAGE.fields_by_name['oneof1'].containing_oneof = _TESTMESSAGE.oneofs_by_name['test_oneof'] -_TESTMESSAGE.oneofs_by_name['test_oneof'].fields.append( - _TESTMESSAGE.fields_by_name['oneof2']) -_TESTMESSAGE.fields_by_name['oneof2'].containing_oneof = _TESTMESSAGE.oneofs_by_name['test_oneof'] -DESCRIPTOR.message_types_by_name['TestMessage'] = _TESTMESSAGE -DESCRIPTOR.message_types_by_name['ExtensionMessage'] = _EXTENSIONMESSAGE +_TESTMESSAGE.oneofs_by_name["test_oneof"].fields.append(_TESTMESSAGE.fields_by_name["oneof1"]) +_TESTMESSAGE.fields_by_name["oneof1"].containing_oneof = _TESTMESSAGE.oneofs_by_name["test_oneof"] +_TESTMESSAGE.oneofs_by_name["test_oneof"].fields.append(_TESTMESSAGE.fields_by_name["oneof2"]) +_TESTMESSAGE.fields_by_name["oneof2"].containing_oneof = _TESTMESSAGE.oneofs_by_name["test_oneof"] +DESCRIPTOR.message_types_by_name["TestMessage"] = _TESTMESSAGE +DESCRIPTOR.message_types_by_name["ExtensionMessage"] = _EXTENSIONMESSAGE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -TestMessage = _reflection.GeneratedProtocolMessageType('TestMessage', (_message.Message,), dict( - - TestInnerMessage = _reflection.GeneratedProtocolMessageType('TestInnerMessage', (_message.Message,), dict( - DESCRIPTOR = _TESTMESSAGE_TESTINNERMESSAGE, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.TestInnerMessage) - )) - , - - FieldMap1Entry = _reflection.GeneratedProtocolMessageType('FieldMap1Entry', (_message.Message,), dict( - DESCRIPTOR = _TESTMESSAGE_FIELDMAP1ENTRY, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap1Entry) - )) - , - - FieldMap2Entry = _reflection.GeneratedProtocolMessageType('FieldMap2Entry', 
(_message.Message,), dict( - DESCRIPTOR = _TESTMESSAGE_FIELDMAP2ENTRY, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap2Entry) - )) - , - - FieldMap3Entry = _reflection.GeneratedProtocolMessageType('FieldMap3Entry', (_message.Message,), dict( - DESCRIPTOR = _TESTMESSAGE_FIELDMAP3ENTRY, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap3Entry) - )) - , - - FieldMap4Entry = _reflection.GeneratedProtocolMessageType('FieldMap4Entry', (_message.Message,), dict( - DESCRIPTOR = _TESTMESSAGE_FIELDMAP4ENTRY, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap4Entry) - )) - , - DESCRIPTOR = _TESTMESSAGE, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.TestMessage) - )) +TestMessage = _reflection.GeneratedProtocolMessageType( + "TestMessage", + (_message.Message,), + dict( + TestInnerMessage=_reflection.GeneratedProtocolMessageType( + "TestInnerMessage", + (_message.Message,), + dict( + DESCRIPTOR=_TESTMESSAGE_TESTINNERMESSAGE, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.TestInnerMessage) + ), + ), + FieldMap1Entry=_reflection.GeneratedProtocolMessageType( + "FieldMap1Entry", + (_message.Message,), + dict( + DESCRIPTOR=_TESTMESSAGE_FIELDMAP1ENTRY, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap1Entry) + ), + ), + FieldMap2Entry=_reflection.GeneratedProtocolMessageType( + "FieldMap2Entry", + (_message.Message,), + dict( + DESCRIPTOR=_TESTMESSAGE_FIELDMAP2ENTRY, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap2Entry) + ), + ), + FieldMap3Entry=_reflection.GeneratedProtocolMessageType( + "FieldMap3Entry", + (_message.Message,), + dict( + DESCRIPTOR=_TESTMESSAGE_FIELDMAP3ENTRY, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap3Entry) + ), + ), + FieldMap4Entry=_reflection.GeneratedProtocolMessageType( + "FieldMap4Entry", + (_message.Message,), + dict( + DESCRIPTOR=_TESTMESSAGE_FIELDMAP4ENTRY, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap4Entry) + ), + ), + DESCRIPTOR=_TESTMESSAGE, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.TestMessage) + ), +) _sym_db.RegisterMessage(TestMessage) _sym_db.RegisterMessage(TestMessage.TestInnerMessage) _sym_db.RegisterMessage(TestMessage.FieldMap1Entry) @@ -532,14 +916,18 @@ _sym_db.RegisterMessage(TestMessage.FieldMap3Entry) _sym_db.RegisterMessage(TestMessage.FieldMap4Entry) -ExtensionMessage = _reflection.GeneratedProtocolMessageType('ExtensionMessage', (_message.Message,), dict( - DESCRIPTOR = _EXTENSIONMESSAGE, - __module__ = 'test_message_pb2' - # @@protoc_insertion_point(class_scope:mlflow.ExtensionMessage) - )) +ExtensionMessage = _reflection.GeneratedProtocolMessageType( + "ExtensionMessage", + (_message.Message,), + dict( + DESCRIPTOR=_EXTENSIONMESSAGE, + __module__="test_message_pb2" + # @@protoc_insertion_point(class_scope:mlflow.ExtensionMessage) + ), +) _sym_db.RegisterMessage(ExtensionMessage) -TestMessage.RegisterExtension(_EXTENSIONMESSAGE.extensions_by_name['field_extended_int64']) +TestMessage.RegisterExtension(_EXTENSIONMESSAGE.extensions_by_name["field_extended_int64"]) _TESTMESSAGE_FIELDMAP1ENTRY._options = None _TESTMESSAGE_FIELDMAP2ENTRY._options = None diff --git 
a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 3c39a4ce5b136..f3c0b6a90cf4b 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,63 +1,68 @@ import mlflow -from mlflow.evaluation import ModelEvaluator, EvaluationMetrics, \ - EvaluationArtifact, EvaluationResult, EvaluationDataset +from mlflow.evaluation import ( + ModelEvaluator, + EvaluationMetrics, + EvaluationArtifact, + EvaluationResult, + EvaluationDataset, +) from mlflow.tracking.artifact_utils import get_artifact_uri from sklearn import metrics as sk_metrics import numpy as np -import json +import pandas as pd +import io -class JsonEvaluationArtifact(EvaluationArtifact): - - def save_content_to_file(self, content, output_artifact_path): - with open(output_artifact_path, 'w') as fp: - json.dump(content, fp) +class Array2DEvaluationArtifact(EvaluationArtifact): + @classmethod + def save_content_to_file(cls, content, output_artifact_path): + pd.DataFrame(content).to_csv(output_artifact_path, index=False) @classmethod def load_content_from_file(cls, local_artifact_path): - with open(local_artifact_path, 'r') as fp: - return json.load(fp) + pdf = pd.read_csv(local_artifact_path) + return pdf.to_numpy() class DummyEvaluator(ModelEvaluator): - - def can_evaluate( - self, model_type, evaluator_config=None, **kwargs - ): - return evaluator_config.get('can_evaluate') and \ - model_type in ['classifier', 'regressor'] + def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + return evaluator_config.get("can_evaluate") and model_type in ["classifier", "regressor"] def compute_metrics_and_compute_and_log_artifacts( - self, model_type, predict, dataset, evaluator_config, run_id + self, model_type, predict, dataset, evaluator_config, run_id ): X = dataset.data - assert isinstance(X, np.ndarray), 'Only support array type feature input' - assert dataset.name is not None, 'Dataset name required' + assert isinstance(X, np.ndarray), "Only support array type feature input" + assert dataset.name is not None, "Dataset name required" y = dataset.labels y_pred = predict(X) - if model_type == 'classifier': + if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) - brier_score_loss = sk_metrics.brier_score_loss(y, y_pred) - metrics = EvaluationMetrics( - accuracy_score=accuracy_score, - brier_score_loss=brier_score_loss - ) + metrics = EvaluationMetrics(accuracy_score=accuracy_score,) confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) - confusion_matrix_artifact_name = f'confusion_matrix_on_{dataset.name}.json' - confusion_matrix_artifact = JsonEvaluationArtifact( + confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv" + confusion_matrix_artifact = Array2DEvaluationArtifact( location=get_artifact_uri(run_id, confusion_matrix_artifact_name), - content=confusion_matrix + content=confusion_matrix, + ) + confusion_matrix_csv_buff = io.StringIO() + Array2DEvaluationArtifact.save_content_to_file( + confusion_matrix, confusion_matrix_csv_buff + ) + self.mlflow_client.log_text( + run_id, confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name ) - self.mlflow_client.log_dict(run_id, confusion_matrix) artifacts = {confusion_matrix_artifact_name: confusion_matrix_artifact} return metrics, artifacts - elif model_type == 'regressor': + elif model_type == 
"regressor": mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) - return EvaluationMetrics( - mean_absolute_error=mean_absolute_error, - mean_squared_error=mean_squared_error - ), {} + return ( + EvaluationMetrics( + mean_absolute_error=mean_absolute_error, mean_squared_error=mean_squared_error + ), + {}, + ) else: - raise ValueError(f'Unsupported model type {model_type}') + raise ValueError(f"Unsupported model type {model_type}") diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 301ddbad336a7..65b44d149d324 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -5,8 +5,17 @@ import sklearn.datasets import sklearn.linear_model import pytest +import numpy as np -from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import ( + accuracy_score, + confusion_matrix, + mean_absolute_error, + mean_squared_error, +) + +from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact +from mlflow.tracking.artifact_utils import get_artifact_uri def get_iris(): @@ -23,12 +32,8 @@ def get_run_data(run_id): return data.params, data.metrics, tags, artifacts -def load_json_artifact(artifact_path): - import json - - fpath = mlflow.get_artifact_uri(artifact_path).replace("file://", "") - with open(fpath, "r") as f: - return json.load(f) +def get_local_artifact_path(run_id, artifact_path): + return get_artifact_uri(run_id, artifact_path).replace("file://", "") @pytest.fixture(scope="module") @@ -48,44 +53,76 @@ def classifier_model(): @pytest.fixture(scope="module") -def evaluation_dataset(): +def iris_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name='eval_data_1') + return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") -def test_reg_evaluate(regressor_model, evaluation_dataset): - y_true = evaluation_dataset.labels - y_pred = regressor_model.predict(evaluation_dataset.data) - expected_mae = mean_absolute_error(y_true, y_pred) - expected_mse = mean_squared_error(y_true, y_pred) +def test_classifier_evaluate(classifier_model, iris_dataset): + y_true = iris_dataset.labels + y_pred = classifier_model.predict(iris_dataset.data) + expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { - 'mean_absolute_error': expected_mae, - 'mean_squared_error': expected_mse, + "accuracy_score": expected_accuracy_score, } expected_saved_metrics = { - 'mean_absolute_error_on_eval_data_1': expected_mae, - 'mean_squared_error_on_eval_data_1': expected_mse, + "accuracy_score_on_iris_dataset": expected_accuracy_score, } - expected_artifact = expected_metrics + expected_artifact = confusion_matrix(y_true, y_pred) with mlflow.start_run() as run: eval_result = evaluate( - regressor_model, 'regressor', evaluation_dataset, - run_id=None, evaluators='dummy_evaluator', - evaluator_config={ - 'can_evaluate': True, - 'metrics_to_calc': ['mean_absolute_error', 'mean_squared_error'] - } + classifier_model, + "classifier", + iris_dataset, + run_id=None, + evaluators="dummy_evaluator", + evaluator_config={"can_evaluate": True,}, ) - saved_artifact_uri = mlflow.get_artifact_uri('metrics_artifact.json') - saved_artifact = load_json_artifact('metrics_artifact.json') - assert saved_artifact == expected_artifact + artifact_name = "confusion_matrix_on_iris_dataset.csv" + saved_artifact_path = get_local_artifact_path(run.info.run_id, artifact_name) + + _, saved_metrics, _, 
saved_artifacts = get_run_data(run.info.run_id) + assert saved_metrics == expected_saved_metrics + assert saved_artifacts == [artifact_name] + + assert eval_result.metrics == expected_metrics + returned_confusion_matrix_artifact = eval_result.artifacts[artifact_name] + assert np.array_equal(returned_confusion_matrix_artifact.content, expected_artifact) + assert np.array_equal( + Array2DEvaluationArtifact.load_content_from_file(saved_artifact_path), expected_artifact + ) + assert returned_confusion_matrix_artifact.location == get_artifact_uri( + run.info.run_id, artifact_name + ) + + +def test_regressor_evaluate(regressor_model, iris_dataset): + y_true = iris_dataset.labels + y_pred = regressor_model.predict(iris_dataset.data) + expected_mae = mean_absolute_error(y_true, y_pred) + expected_mse = mean_squared_error(y_true, y_pred) + expected_metrics = { + "mean_absolute_error": expected_mae, + "mean_squared_error": expected_mse, + } + expected_saved_metrics = { + "mean_absolute_error_on_iris_dataset": expected_mae, + "mean_squared_error_on_iris_dataset": expected_mse, + } + with mlflow.start_run() as run: + eval_result = evaluate( + regressor_model, + "regressor", + iris_dataset, + run_id=None, + evaluators="dummy_evaluator", + evaluator_config={"can_evaluate": True,}, + ) _, saved_metrics, _, _ = get_run_data(run.info.run_id) assert saved_metrics == expected_saved_metrics assert eval_result.metrics == expected_metrics - assert eval_result.artifacts.content == expected_artifact - assert eval_result.artifacts.location == saved_artifact_uri From 63b1d2bdd70ba90ccb40d8826e53365fa4c2b152 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 21 Nov 2021 11:11:48 +0800 Subject: [PATCH 014/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 35 +++++++++++-------- .../mlflow_test_plugin/dummy_evaluator.py | 4 +-- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index b863e9efdb193..a7b5d3795f7a5 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -14,6 +14,7 @@ from mlflow.entities import Metric, RunTag from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name, load_class +from mlflow.pyfunc import PyFuncModel class EvaluationMetrics(dict): @@ -238,7 +239,7 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: raise NotImplementedError() def compute_metrics_and_compute_and_log_artifacts( - self, model_type, predict, dataset, evaluator_config, run_id + self, model, model_type, dataset, evaluator_config, run_id ): """ return an tuple of: @@ -249,12 +250,18 @@ def compute_metrics_and_compute_and_log_artifacts( raise NotImplementedError() def evaluate( - self, model_type, predict, dataset, run_id=None, evaluator_config=None, **kwargs + self, + model: PyFuncModel, + model_type, + dataset, + run_id=None, + evaluator_config=None, + **kwargs ) -> EvaluationResult: """ - :param predict: A function used to compute model predictions. Predict - accepts features from the specified `dataset` and - feeds them to the model, producing output predictions. + :param model: A pyfunc model instance. + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). :param dataset: An instance of `EvaluationDataset` containing features and labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. 
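Under the revised interface a plugin evaluator receives the PyFuncModel itself rather than a bare predict function, and (following the pattern used by the dummy test plugin) implements can_evaluate plus compute_metrics_and_compute_and_log_artifacts, leaving run handling and metric logging to the base class. A minimal sketch of such a plugin follows; the class name and RMSE metric are illustrative rather than part of this patch, and the import path reflects the module location at this point in the series (it moves under mlflow.models.evaluation later):

    import numpy as np
    from sklearn import metrics as sk_metrics
    from mlflow.evaluation import ModelEvaluator, EvaluationMetrics

    class RmseEvaluator(ModelEvaluator):
        # Hypothetical third-party evaluator; only handles regressors.
        def can_evaluate(self, model_type, evaluator_config=None, **kwargs):
            return model_type == "regressor"

        def compute_metrics_and_compute_and_log_artifacts(
            self, model, model_type, dataset, evaluator_config, run_id
        ):
            # Assumes array-style dataset inputs (dataset.data / dataset.labels).
            y_pred = model.predict(dataset.data)
            rmse = np.sqrt(sk_metrics.mean_squared_error(dataset.labels, y_pred))
            # No artifacts here; the base-class evaluate() logs the returned
            # metrics to the run using the "<metric>_on_<dataset name>" scheme.
            return EvaluationMetrics(rmse=float(rmse)), {}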
@@ -291,7 +298,7 @@ def evaluate( dataset_metadata_str = json.dumps(dataset_metadata_list) metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model_type, predict, dataset, evaluator_config, run_id + model, model_type, dataset, evaluator_config, run_id ) client.log_batch( run_id, @@ -350,14 +357,14 @@ def get_evaluator(self, evaluator_name): def evaluate( - model, model_type, dataset, run_id=None, evaluators=None, evaluator_config=None + model: Union[str, PyFuncModel], + model_type, dataset, + run_id=None, + evaluators=None, + evaluator_config=None ) -> Union[EvaluationResult, Dict[str, EvaluationResult]]: """ - :param model: A model supported by the specified `evaluator`, or a URI - referring to such a model. The default evaluator supports the - following: - - - A pyfunc model instance (an instance of class `PyFuncModel`) + :param model: A pyfunc model instance, or a URI referring to such a model. :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. @@ -388,8 +395,6 @@ def evaluate( if isinstance(model, str): model = mlflow.pyfunc.load_model(model) - predict = model.predict - eval_results = [] for evaluator_name in evaluators: config = evaluator_config[evaluator_name] @@ -399,7 +404,7 @@ def evaluate( continue if evaluator.can_evaluate(model_type, config): - result = evaluator.evaluate(model_type, predict, dataset, run_id, config) + result = evaluator.evaluate(model, model_type, dataset, run_id, config) eval_results.append(result) merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index f3c0b6a90cf4b..2d4b762b69d2e 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -29,13 +29,13 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return evaluator_config.get("can_evaluate") and model_type in ["classifier", "regressor"] def compute_metrics_and_compute_and_log_artifacts( - self, model_type, predict, dataset, evaluator_config, run_id + self, model, model_type, dataset, evaluator_config, run_id ): X = dataset.data assert isinstance(X, np.ndarray), "Only support array type feature input" assert dataset.name is not None, "Dataset name required" y = dataset.labels - y_pred = predict(X) + y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) From afc52e800e2659305f24f13e19f2469ce1bcac2c Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 21 Nov 2021 17:43:42 +0800 Subject: [PATCH 015/120] update Signed-off-by: Weichen Xu --- mlflow/evaluation.py | 11 ++++++++++- .../mlflow_test_plugin/dummy_evaluator.py | 5 +---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/mlflow/evaluation.py b/mlflow/evaluation.py index a7b5d3795f7a5..9b5b28e35c33b 100644 --- a/mlflow/evaluation.py +++ b/mlflow/evaluation.py @@ -128,7 +128,7 @@ class EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 - def __init__(self, data, labels=None, name=None, path=None): + def __init__(self, data, labels, name=None, path=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. 
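Making labels a required argument pins down the two construction forms the docstring describes: array-style data with parallel label values, or a DataFrame whose labels argument names the label column. A brief illustration of both forms; the feature values, column names, and dataset names are made up for the example:

    import numpy as np
    import pandas as pd
    from mlflow.evaluation import EvaluationDataset

    # Array-style: features and labels are parallel arrays (or lists).
    array_ds = EvaluationDataset(
        data=np.array([[1.0, 2.0], [3.0, 4.0]]),
        labels=np.array([0, 1]),
        name="toy_array_ds",
    )

    # DataFrame-style: labels is the name of the label column inside data.
    df_ds = EvaluationDataset(
        data=pd.DataFrame({"f1": [1.0, 3.0], "f2": [2.0, 4.0], "target": [0, 1]}),
        labels="target",
        name="toy_df_ds",
    )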
@@ -152,6 +152,15 @@ def __init__(self, data, labels=None, name=None, path=None): self.path = path self._hash = None + def extract_features_and_labels(self): + if isinstance(self.data, np.ndarray): + return self.data, self.labels + elif isinstance(self.data, pd.DataFrame): + feature_cols = [x for x in self.data.columns if x != self.labels] + return self.data[feature_cols], self.data[self.labels] + else: + raise ValueError(f'Unsupported data type: {type(self.data)}') + @staticmethod def _gen_md5_for_arraylike_obj(md5_gen, data): md5_gen.update(pickle.dumps(len(data))) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 2d4b762b69d2e..5598b394b88f7 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -31,10 +31,7 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs): def compute_metrics_and_compute_and_log_artifacts( self, model, model_type, dataset, evaluator_config, run_id ): - X = dataset.data - assert isinstance(X, np.ndarray), "Only support array type feature input" - assert dataset.name is not None, "Dataset name required" - y = dataset.labels + X, y = dataset.extract_features_and_labels() y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) From feddc1e4107935fbee18f40f9befe07993fedf5f Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 21 Nov 2021 20:05:12 +0800 Subject: [PATCH 016/120] rename module Signed-off-by: Weichen Xu --- mlflow/{evaluation.py => evaluation/__init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename mlflow/{evaluation.py => evaluation/__init__.py} (100%) diff --git a/mlflow/evaluation.py b/mlflow/evaluation/__init__.py similarity index 100% rename from mlflow/evaluation.py rename to mlflow/evaluation/__init__.py From 27814cf9cff73bf1c19e60c3a4c4c8c90ff7fb09 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 22 Nov 2021 14:41:59 +0800 Subject: [PATCH 017/120] address comments Signed-off-by: Weichen Xu --- mlflow/evaluation/__init__.py | 92 ++++++++----------- .../mlflow_test_plugin/dummy_evaluator.py | 2 +- 2 files changed, 40 insertions(+), 54 deletions(-) diff --git a/mlflow/evaluation/__init__.py b/mlflow/evaluation/__init__.py index 9b5b28e35c33b..29c7550b979cb 100644 --- a/mlflow/evaluation/__init__.py +++ b/mlflow/evaluation/__init__.py @@ -54,12 +54,6 @@ def location(self) -> str: """ return self._location - def __getstate__(self, state): - state = state.__dict__.copy() - # skip pickling artifact content - del state["_content"] - return state - class EvaluationResult: def __init__(self, metrics, artifacts): @@ -143,16 +137,37 @@ def __init__(self, data, labels, name=None, path=None): :param name: (Optional) The name of the dataset (must not contain "). - :param path: (Optional) the path to a serialized DataFrame + :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. 
a delta table, parquet file) """ - self.user_specified_name = name + if name is not None and '"' in name: + raise ValueError(f'Dataset name cannot include " but get name {name}') + if path is not None and '"' in path: + raise ValueError(f'Dataset path cannot include " but get name {path}') + + if isinstance(data, (np.ndarray, list)): + if not isinstance(labels, (np.ndarray, list)): + raise ValueError( + 'If data is a numpy array or list of evaluation features, ' + 'labels must be a numpy array or list of evaluation labels' + ) + elif isinstance(data, pd.DataFrame): + if not isinstance(labels, str): + raise ValueError( + 'If data is a Pandas DataFrame, labels must be the string name of a column ' + 'from `data` that contains evaluation labels' + ) + else: + raise ValueError('The data argument must be a numpy array, a list or a ' + 'Pandas DataFrame.') + + self._user_specified_name = name self.data = data self.labels = labels self.path = path self._hash = None - def extract_features_and_labels(self): + def _extract_features_and_labels(self): if isinstance(self.data, np.ndarray): return self.data, self.labels elif isinstance(self.data, pd.DataFrame): @@ -172,7 +187,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): @property def name(self): - return self.user_specified_name if self.user_specified_name is not None else self.hash + return self._user_specified_name if self._user_specified_name is not None else self.hash @property def hash(self): @@ -180,9 +195,7 @@ def hash(self): Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, dataset size and feeding them through a cheap, low-collision hash function """ - if self._hash is not None: - return self._hash - else: + if self._hash is None: md5_gen = hashlib.md5() if isinstance(self.data, np.ndarray): EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) @@ -190,48 +203,20 @@ def hash(self): elif isinstance(self.data, pd.DataFrame): EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) md5_gen.update(self.labels.encode("UTF-8")) - return md5_gen.hexdigest() + self._hash = md5_gen.hexdigest() + return self._hash @property - def metadata(self): + def _metadata(self): metadata = { + "name": self.name, "hash": self.hash, - "path": self.path, } - if self.user_specified_name is not None: - metadata["name"] = self.user_specified_name + if self.path is not None: + metadata["path"] = self.path return metadata -class GetOrCreateRunId: - """ - Get or create a run, return a run_id - if user specified a run_id, use it. - otherwise if there's an active run, use it - otherwise create a managed run. 
- """ - - def __init__(self, run_id): - self.managed_run_context = None - if run_id is not None: - self.run_id = run_id - elif mlflow.active_run() is not None: - self.run_id = mlflow.active_run().info.run_id - else: - self.run_id = None - - def __enter__(self): - if self.run_id is not None: - return self.run_id - else: - self.managed_run_context = mlflow.start_run() - return self.managed_run_context.__enter__().info.run_id - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.managed_run_context is not None: - return self.managed_run_context.__exit__(exc_type, exc_val, exc_tb) - - class ModelEvaluator: def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ @@ -284,7 +269,7 @@ def evaluate( client = mlflow.tracking.MlflowClient() self.mlflow_client = client - with GetOrCreateRunId(run_id) as run_id: + with mlflow.start_run(run_id=run_id): timestamp = int(time.time() * 1000) existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") if existing_dataset_metadata_str is not None: @@ -296,13 +281,13 @@ def evaluate( for metadata in dataset_metadata_list: if ( metadata["hash"] == dataset.hash - and metadata["name"] == dataset.user_specified_name + and metadata["name"] == dataset._user_specified_name ): metadata_exists = True break if not metadata_exists: - dataset_metadata_list.append(dataset.metadata) + dataset_metadata_list.append(dataset._metadata) dataset_metadata_str = json.dumps(dataset_metadata_list) @@ -371,7 +356,7 @@ def evaluate( run_id=None, evaluators=None, evaluator_config=None -) -> Union[EvaluationResult, Dict[str, EvaluationResult]]: +) -> EvaluationResult: """ :param model: A pyfunc model instance, or a URI referring to such a model. @@ -387,7 +372,8 @@ def evaluate( :param evaluators: The name of the evaluator to use for model evaluations, or a list of evaluator names. If unspecified, all evaluators capable of evaluating the specified model on the specified - dataset are used. + dataset are used. The default evaluator can be referred to + by the name 'default'. :param evaluator_config: A dictionary of additional configurations to supply to the evaluator. If multiple evaluators are specified, each configuration should be supplied as @@ -395,7 +381,7 @@ def evaluate( :return: An `EvaluationResult` instance containing evaluation results. 
""" if evaluators is None: - evaluators = "default_evaluator" + evaluators = "default" if not isinstance(evaluators, list): evaluators = [evaluators] diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 5598b394b88f7..8835da6425adc 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -31,7 +31,7 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs): def compute_metrics_and_compute_and_log_artifacts( self, model, model_type, dataset, evaluator_config, run_id ): - X, y = dataset.extract_features_and_labels() + X, y = dataset._extract_features_and_labels() y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) From fc9cf1a37fdfb6c0ead3b0331dc466405e27956a Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 22 Nov 2021 14:48:13 +0800 Subject: [PATCH 018/120] address comments Signed-off-by: Weichen Xu --- mlflow/evaluation/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mlflow/evaluation/__init__.py b/mlflow/evaluation/__init__.py index 29c7550b979cb..f6c30310d1310 100644 --- a/mlflow/evaluation/__init__.py +++ b/mlflow/evaluation/__init__.py @@ -269,9 +269,9 @@ def evaluate( client = mlflow.tracking.MlflowClient() self.mlflow_client = client - with mlflow.start_run(run_id=run_id): + def do_evaluate(_run_id): timestamp = int(time.time() * 1000) - existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") + existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") if existing_dataset_metadata_str is not None: dataset_metadata_list = json.loads(existing_dataset_metadata_str) else: @@ -292,10 +292,10 @@ def evaluate( dataset_metadata_str = json.dumps(dataset_metadata_list) metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model, model_type, dataset, evaluator_config, run_id + model, model_type, dataset, evaluator_config, _run_id ) client.log_batch( - run_id, + _run_id, metrics=[ Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) for key, value in metrics_dict.items() @@ -305,6 +305,12 @@ def evaluate( return EvaluationResult(metrics_dict, artifacts_dict) + if mlflow.active_run() is not None: + return do_evaluate(mlflow.active_run().info.run_id) + else: + with mlflow.start_run(run_id=run_id): + return do_evaluate(run_id) + class ModelEvaluatorRegistry: """ From d64230997dd4ecbda1d5d4c7c1da21e68f44b6f7 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 22 Nov 2021 14:51:23 +0800 Subject: [PATCH 019/120] revert black change Signed-off-by: Weichen Xu --- tests/protos/test_message_pb2.py | 1350 +++++++++++------------------- 1 file changed, 481 insertions(+), 869 deletions(-) diff --git a/tests/protos/test_message_pb2.py b/tests/protos/test_message_pb2.py index f8cf37f752c24..b2e4840487095 100644 --- a/tests/protos/test_message_pb2.py +++ b/tests/protos/test_message_pb2.py @@ -2,913 +2,529 @@ # source: test_message.proto import sys - -_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode("latin1")) +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as 
_reflection from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() + + DESCRIPTOR = _descriptor.FileDescriptor( - name="test_message.proto", - package="mlflow", - syntax="proto2", - serialized_options=None, - serialized_pb=_b( - '\n\x12test_message.proto\x12\x06mlflow"\x9d\t\n\x0bTestMessage\x12\x13\n\x0b\x66ield_int32\x18\x01 \x01(\x05\x12\x13\n\x0b\x66ield_int64\x18\x02 \x01(\x03\x12\x14\n\x0c\x66ield_uint32\x18\x03 \x01(\r\x12\x14\n\x0c\x66ield_uint64\x18\x04 \x01(\x04\x12\x14\n\x0c\x66ield_sint32\x18\x05 \x01(\x11\x12\x14\n\x0c\x66ield_sint64\x18\x06 \x01(\x12\x12\x15\n\rfield_fixed32\x18\x07 \x01(\x07\x12\x15\n\rfield_fixed64\x18\x08 \x01(\x06\x12\x16\n\x0e\x66ield_sfixed32\x18\t \x01(\x0f\x12\x16\n\x0e\x66ield_sfixed64\x18\n \x01(\x10\x12\x12\n\nfield_bool\x18\x0b \x01(\x08\x12\x14\n\x0c\x66ield_string\x18\x0c \x01(\t\x12 \n\x13\x66ield_with_default1\x18\r \x01(\x03:\x03\x31\x30\x30\x12 \n\x13\x66ield_with_default2\x18\x0e \x01(\x03:\x03\x32\x30\x30\x12\x1c\n\x14\x66ield_repeated_int64\x18\x0f \x03(\x03\x12\x30\n\nfield_enum\x18\x10 \x01(\x0e\x32\x1c.mlflow.TestMessage.TestEnum\x12\x41\n\x13\x66ield_inner_message\x18\x11 \x03(\x0b\x32$.mlflow.TestMessage.TestInnerMessage\x12\x10\n\x06oneof1\x18\x12 \x01(\x03H\x00\x12\x10\n\x06oneof2\x18\x13 \x01(\x03H\x00\x12\x36\n\nfield_map1\x18\x14 \x03(\x0b\x32".mlflow.TestMessage.FieldMap1Entry\x12\x36\n\nfield_map2\x18\x15 \x03(\x0b\x32".mlflow.TestMessage.FieldMap2Entry\x12\x36\n\nfield_map3\x18\x16 \x03(\x0b\x32".mlflow.TestMessage.FieldMap3Entry\x12\x36\n\nfield_map4\x18\x17 \x03(\x0b\x32".mlflow.TestMessage.FieldMap4Entry\x1am\n\x10TestInnerMessage\x12\x19\n\x11\x66ield_inner_int64\x18\x01 \x01(\x03\x12"\n\x1a\x66ield_inner_repeated_int64\x18\x02 \x03(\x03\x12\x1a\n\x12\x66ield_inner_string\x18\x03 \x01(\t\x1a\x30\n\x0e\x46ieldMap1Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap2Entry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap3Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1aV\n\x0e\x46ieldMap4Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x33\n\x05value\x18\x02 \x01(\x0b\x32$.mlflow.TestMessage.TestInnerMessage:\x02\x38\x01"6\n\x08TestEnum\x12\x08\n\x04NONE\x10\x00\x12\x0f\n\x0b\x45NUM_VALUE1\x10\x01\x12\x0f\n\x0b\x45NUM_VALUE2\x10\x02*\x06\x08\xe8\x07\x10\xd0\x0f\x42\x0c\n\ntest_oneof"F\n\x10\x45xtensionMessage22\n\x14\x66ield_extended_int64\x12\x13.mlflow.TestMessage\x18\xe9\x07 \x01(\x03' - ), + name='test_message.proto', + package='mlflow', + syntax='proto2', + serialized_options=None, + serialized_pb=_b('\n\x12test_message.proto\x12\x06mlflow\"\x9d\t\n\x0bTestMessage\x12\x13\n\x0b\x66ield_int32\x18\x01 \x01(\x05\x12\x13\n\x0b\x66ield_int64\x18\x02 \x01(\x03\x12\x14\n\x0c\x66ield_uint32\x18\x03 \x01(\r\x12\x14\n\x0c\x66ield_uint64\x18\x04 \x01(\x04\x12\x14\n\x0c\x66ield_sint32\x18\x05 \x01(\x11\x12\x14\n\x0c\x66ield_sint64\x18\x06 \x01(\x12\x12\x15\n\rfield_fixed32\x18\x07 \x01(\x07\x12\x15\n\rfield_fixed64\x18\x08 \x01(\x06\x12\x16\n\x0e\x66ield_sfixed32\x18\t \x01(\x0f\x12\x16\n\x0e\x66ield_sfixed64\x18\n \x01(\x10\x12\x12\n\nfield_bool\x18\x0b \x01(\x08\x12\x14\n\x0c\x66ield_string\x18\x0c \x01(\t\x12 \n\x13\x66ield_with_default1\x18\r \x01(\x03:\x03\x31\x30\x30\x12 \n\x13\x66ield_with_default2\x18\x0e \x01(\x03:\x03\x32\x30\x30\x12\x1c\n\x14\x66ield_repeated_int64\x18\x0f 
\x03(\x03\x12\x30\n\nfield_enum\x18\x10 \x01(\x0e\x32\x1c.mlflow.TestMessage.TestEnum\x12\x41\n\x13\x66ield_inner_message\x18\x11 \x03(\x0b\x32$.mlflow.TestMessage.TestInnerMessage\x12\x10\n\x06oneof1\x18\x12 \x01(\x03H\x00\x12\x10\n\x06oneof2\x18\x13 \x01(\x03H\x00\x12\x36\n\nfield_map1\x18\x14 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap1Entry\x12\x36\n\nfield_map2\x18\x15 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap2Entry\x12\x36\n\nfield_map3\x18\x16 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap3Entry\x12\x36\n\nfield_map4\x18\x17 \x03(\x0b\x32\".mlflow.TestMessage.FieldMap4Entry\x1am\n\x10TestInnerMessage\x12\x19\n\x11\x66ield_inner_int64\x18\x01 \x01(\x03\x12\"\n\x1a\x66ield_inner_repeated_int64\x18\x02 \x03(\x03\x12\x1a\n\x12\x66ield_inner_string\x18\x03 \x01(\t\x1a\x30\n\x0e\x46ieldMap1Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap2Entry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1a\x30\n\x0e\x46ieldMap3Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\r\n\x05value\x18\x02 \x01(\x03:\x02\x38\x01\x1aV\n\x0e\x46ieldMap4Entry\x12\x0b\n\x03key\x18\x01 \x01(\x03\x12\x33\n\x05value\x18\x02 \x01(\x0b\x32$.mlflow.TestMessage.TestInnerMessage:\x02\x38\x01\"6\n\x08TestEnum\x12\x08\n\x04NONE\x10\x00\x12\x0f\n\x0b\x45NUM_VALUE1\x10\x01\x12\x0f\n\x0b\x45NUM_VALUE2\x10\x02*\x06\x08\xe8\x07\x10\xd0\x0f\x42\x0c\n\ntest_oneof\"F\n\x10\x45xtensionMessage22\n\x14\x66ield_extended_int64\x12\x13.mlflow.TestMessage\x18\xe9\x07 \x01(\x03') ) + _TESTMESSAGE_TESTENUM = _descriptor.EnumDescriptor( - name="TestEnum", - full_name="mlflow.TestMessage.TestEnum", - filename=None, - file=DESCRIPTOR, - values=[ - _descriptor.EnumValueDescriptor( - name="NONE", index=0, number=0, serialized_options=None, type=None - ), - _descriptor.EnumValueDescriptor( - name="ENUM_VALUE1", index=1, number=1, serialized_options=None, type=None - ), - _descriptor.EnumValueDescriptor( - name="ENUM_VALUE2", index=2, number=2, serialized_options=None, type=None - ), - ], - containing_type=None, - serialized_options=None, - serialized_start=1136, - serialized_end=1190, + name='TestEnum', + full_name='mlflow.TestMessage.TestEnum', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='NONE', index=0, number=0, + serialized_options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='ENUM_VALUE1', index=1, number=1, + serialized_options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='ENUM_VALUE2', index=2, number=2, + serialized_options=None, + type=None), + ], + containing_type=None, + serialized_options=None, + serialized_start=1136, + serialized_end=1190, ) _sym_db.RegisterEnumDescriptor(_TESTMESSAGE_TESTENUM) _TESTMESSAGE_TESTINNERMESSAGE = _descriptor.Descriptor( - name="TestInnerMessage", - full_name="mlflow.TestMessage.TestInnerMessage", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="field_inner_int64", - full_name="mlflow.TestMessage.TestInnerMessage.field_inner_int64", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_inner_repeated_int64", - full_name="mlflow.TestMessage.TestInnerMessage.field_inner_repeated_int64", - index=1, - number=2, - type=3, - 
cpp_type=2, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_inner_string", - full_name="mlflow.TestMessage.TestInnerMessage.field_inner_string", - index=2, - number=3, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=787, - serialized_end=896, + name='TestInnerMessage', + full_name='mlflow.TestMessage.TestInnerMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='field_inner_int64', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_int64', index=0, + number=1, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_inner_repeated_int64', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_repeated_int64', index=1, + number=2, type=3, cpp_type=2, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_inner_string', full_name='mlflow.TestMessage.TestInnerMessage.field_inner_string', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=787, + serialized_end=896, ) _TESTMESSAGE_FIELDMAP1ENTRY = _descriptor.Descriptor( - name="FieldMap1Entry", - full_name="mlflow.TestMessage.FieldMap1Entry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="key", - full_name="mlflow.TestMessage.FieldMap1Entry.key", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="mlflow.TestMessage.FieldMap1Entry.value", - index=1, - number=2, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=_b("8\001"), - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=898, - serialized_end=946, + 
name='FieldMap1Entry', + full_name='mlflow.TestMessage.FieldMap1Entry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='mlflow.TestMessage.FieldMap1Entry.key', index=0, + number=1, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='mlflow.TestMessage.FieldMap1Entry.value', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=_b('8\001'), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=898, + serialized_end=946, ) _TESTMESSAGE_FIELDMAP2ENTRY = _descriptor.Descriptor( - name="FieldMap2Entry", - full_name="mlflow.TestMessage.FieldMap2Entry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="key", - full_name="mlflow.TestMessage.FieldMap2Entry.key", - index=0, - number=1, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="mlflow.TestMessage.FieldMap2Entry.value", - index=1, - number=2, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=_b("8\001"), - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=948, - serialized_end=996, + name='FieldMap2Entry', + full_name='mlflow.TestMessage.FieldMap2Entry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='mlflow.TestMessage.FieldMap2Entry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='mlflow.TestMessage.FieldMap2Entry.value', index=1, + number=2, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=_b('8\001'), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=948, + serialized_end=996, ) _TESTMESSAGE_FIELDMAP3ENTRY = _descriptor.Descriptor( - name="FieldMap3Entry", - full_name="mlflow.TestMessage.FieldMap3Entry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="key", - 
full_name="mlflow.TestMessage.FieldMap3Entry.key", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="mlflow.TestMessage.FieldMap3Entry.value", - index=1, - number=2, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=_b("8\001"), - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=998, - serialized_end=1046, + name='FieldMap3Entry', + full_name='mlflow.TestMessage.FieldMap3Entry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='mlflow.TestMessage.FieldMap3Entry.key', index=0, + number=1, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='mlflow.TestMessage.FieldMap3Entry.value', index=1, + number=2, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=_b('8\001'), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=998, + serialized_end=1046, ) _TESTMESSAGE_FIELDMAP4ENTRY = _descriptor.Descriptor( - name="FieldMap4Entry", - full_name="mlflow.TestMessage.FieldMap4Entry", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="key", - full_name="mlflow.TestMessage.FieldMap4Entry.key", - index=0, - number=1, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="value", - full_name="mlflow.TestMessage.FieldMap4Entry.value", - index=1, - number=2, - type=11, - cpp_type=10, - label=1, - has_default_value=False, - default_value=None, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[], - enum_types=[], - serialized_options=_b("8\001"), - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=1048, - serialized_end=1134, + name='FieldMap4Entry', + full_name='mlflow.TestMessage.FieldMap4Entry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='mlflow.TestMessage.FieldMap4Entry.key', index=0, + number=1, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + 
serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='mlflow.TestMessage.FieldMap4Entry.value', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=_b('8\001'), + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1048, + serialized_end=1134, ) _TESTMESSAGE = _descriptor.Descriptor( - name="TestMessage", - full_name="mlflow.TestMessage", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[ - _descriptor.FieldDescriptor( - name="field_int32", - full_name="mlflow.TestMessage.field_int32", - index=0, - number=1, - type=5, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_int64", - full_name="mlflow.TestMessage.field_int64", - index=1, - number=2, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_uint32", - full_name="mlflow.TestMessage.field_uint32", - index=2, - number=3, - type=13, - cpp_type=3, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_uint64", - full_name="mlflow.TestMessage.field_uint64", - index=3, - number=4, - type=4, - cpp_type=4, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_sint32", - full_name="mlflow.TestMessage.field_sint32", - index=4, - number=5, - type=17, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_sint64", - full_name="mlflow.TestMessage.field_sint64", - index=5, - number=6, - type=18, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_fixed32", - full_name="mlflow.TestMessage.field_fixed32", - index=6, - number=7, - type=7, - cpp_type=3, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_fixed64", - full_name="mlflow.TestMessage.field_fixed64", - index=7, - number=8, - type=6, - cpp_type=4, - label=1, - has_default_value=False, - default_value=0, - 
message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_sfixed32", - full_name="mlflow.TestMessage.field_sfixed32", - index=8, - number=9, - type=15, - cpp_type=1, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_sfixed64", - full_name="mlflow.TestMessage.field_sfixed64", - index=9, - number=10, - type=16, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_bool", - full_name="mlflow.TestMessage.field_bool", - index=10, - number=11, - type=8, - cpp_type=7, - label=1, - has_default_value=False, - default_value=False, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_string", - full_name="mlflow.TestMessage.field_string", - index=11, - number=12, - type=9, - cpp_type=9, - label=1, - has_default_value=False, - default_value=_b("").decode("utf-8"), - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_with_default1", - full_name="mlflow.TestMessage.field_with_default1", - index=12, - number=13, - type=3, - cpp_type=2, - label=1, - has_default_value=True, - default_value=100, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_with_default2", - full_name="mlflow.TestMessage.field_with_default2", - index=13, - number=14, - type=3, - cpp_type=2, - label=1, - has_default_value=True, - default_value=200, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_repeated_int64", - full_name="mlflow.TestMessage.field_repeated_int64", - index=14, - number=15, - type=3, - cpp_type=2, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_enum", - full_name="mlflow.TestMessage.field_enum", - index=15, - number=16, - type=14, - cpp_type=8, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_inner_message", - full_name="mlflow.TestMessage.field_inner_message", - index=16, - number=17, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - 
_descriptor.FieldDescriptor( - name="oneof1", - full_name="mlflow.TestMessage.oneof1", - index=17, - number=18, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="oneof2", - full_name="mlflow.TestMessage.oneof2", - index=18, - number=19, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_map1", - full_name="mlflow.TestMessage.field_map1", - index=19, - number=20, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_map2", - full_name="mlflow.TestMessage.field_map2", - index=20, - number=21, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_map3", - full_name="mlflow.TestMessage.field_map3", - index=21, - number=22, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - _descriptor.FieldDescriptor( - name="field_map4", - full_name="mlflow.TestMessage.field_map4", - index=22, - number=23, - type=11, - cpp_type=10, - label=3, - has_default_value=False, - default_value=[], - message_type=None, - enum_type=None, - containing_type=None, - is_extension=False, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - extensions=[], - nested_types=[ - _TESTMESSAGE_TESTINNERMESSAGE, - _TESTMESSAGE_FIELDMAP1ENTRY, - _TESTMESSAGE_FIELDMAP2ENTRY, - _TESTMESSAGE_FIELDMAP3ENTRY, - _TESTMESSAGE_FIELDMAP4ENTRY, - ], - enum_types=[_TESTMESSAGE_TESTENUM,], - serialized_options=None, - is_extendable=True, - syntax="proto2", - extension_ranges=[(1000, 2000),], - oneofs=[ - _descriptor.OneofDescriptor( - name="test_oneof", - full_name="mlflow.TestMessage.test_oneof", - index=0, - containing_type=None, - fields=[], - ), - ], - serialized_start=31, - serialized_end=1212, + name='TestMessage', + full_name='mlflow.TestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='field_int32', full_name='mlflow.TestMessage.field_int32', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_int64', full_name='mlflow.TestMessage.field_int64', index=1, + number=2, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + 
_descriptor.FieldDescriptor( + name='field_uint32', full_name='mlflow.TestMessage.field_uint32', index=2, + number=3, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_uint64', full_name='mlflow.TestMessage.field_uint64', index=3, + number=4, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_sint32', full_name='mlflow.TestMessage.field_sint32', index=4, + number=5, type=17, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_sint64', full_name='mlflow.TestMessage.field_sint64', index=5, + number=6, type=18, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_fixed32', full_name='mlflow.TestMessage.field_fixed32', index=6, + number=7, type=7, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_fixed64', full_name='mlflow.TestMessage.field_fixed64', index=7, + number=8, type=6, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_sfixed32', full_name='mlflow.TestMessage.field_sfixed32', index=8, + number=9, type=15, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_sfixed64', full_name='mlflow.TestMessage.field_sfixed64', index=9, + number=10, type=16, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_bool', full_name='mlflow.TestMessage.field_bool', index=10, + number=11, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_string', full_name='mlflow.TestMessage.field_string', index=11, + number=12, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_with_default1', 
full_name='mlflow.TestMessage.field_with_default1', index=12, + number=13, type=3, cpp_type=2, label=1, + has_default_value=True, default_value=100, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_with_default2', full_name='mlflow.TestMessage.field_with_default2', index=13, + number=14, type=3, cpp_type=2, label=1, + has_default_value=True, default_value=200, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_repeated_int64', full_name='mlflow.TestMessage.field_repeated_int64', index=14, + number=15, type=3, cpp_type=2, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_enum', full_name='mlflow.TestMessage.field_enum', index=15, + number=16, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_inner_message', full_name='mlflow.TestMessage.field_inner_message', index=16, + number=17, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='oneof1', full_name='mlflow.TestMessage.oneof1', index=17, + number=18, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='oneof2', full_name='mlflow.TestMessage.oneof2', index=18, + number=19, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_map1', full_name='mlflow.TestMessage.field_map1', index=19, + number=20, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_map2', full_name='mlflow.TestMessage.field_map2', index=20, + number=21, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_map3', full_name='mlflow.TestMessage.field_map3', index=21, + number=22, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='field_map4', full_name='mlflow.TestMessage.field_map4', index=22, + number=23, type=11, cpp_type=10, label=3, + 
has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_TESTMESSAGE_TESTINNERMESSAGE, _TESTMESSAGE_FIELDMAP1ENTRY, _TESTMESSAGE_FIELDMAP2ENTRY, _TESTMESSAGE_FIELDMAP3ENTRY, _TESTMESSAGE_FIELDMAP4ENTRY, ], + enum_types=[ + _TESTMESSAGE_TESTENUM, + ], + serialized_options=None, + is_extendable=True, + syntax='proto2', + extension_ranges=[(1000, 2000), ], + oneofs=[ + _descriptor.OneofDescriptor( + name='test_oneof', full_name='mlflow.TestMessage.test_oneof', + index=0, containing_type=None, fields=[]), + ], + serialized_start=31, + serialized_end=1212, ) _EXTENSIONMESSAGE = _descriptor.Descriptor( - name="ExtensionMessage", - full_name="mlflow.ExtensionMessage", - filename=None, - file=DESCRIPTOR, - containing_type=None, - fields=[], - extensions=[ - _descriptor.FieldDescriptor( - name="field_extended_int64", - full_name="mlflow.ExtensionMessage.field_extended_int64", - index=0, - number=1001, - type=3, - cpp_type=2, - label=1, - has_default_value=False, - default_value=0, - message_type=None, - enum_type=None, - containing_type=None, - is_extension=True, - extension_scope=None, - serialized_options=None, - file=DESCRIPTOR, - ), - ], - nested_types=[], - enum_types=[], - serialized_options=None, - is_extendable=False, - syntax="proto2", - extension_ranges=[], - oneofs=[], - serialized_start=1214, - serialized_end=1284, + name='ExtensionMessage', + full_name='mlflow.ExtensionMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + ], + extensions=[ + _descriptor.FieldDescriptor( + name='field_extended_int64', full_name='mlflow.ExtensionMessage.field_extended_int64', index=0, + number=1001, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=True, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1214, + serialized_end=1284, ) _TESTMESSAGE_TESTINNERMESSAGE.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP1ENTRY.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP2ENTRY.containing_type = _TESTMESSAGE _TESTMESSAGE_FIELDMAP3ENTRY.containing_type = _TESTMESSAGE -_TESTMESSAGE_FIELDMAP4ENTRY.fields_by_name["value"].message_type = _TESTMESSAGE_TESTINNERMESSAGE +_TESTMESSAGE_FIELDMAP4ENTRY.fields_by_name['value'].message_type = _TESTMESSAGE_TESTINNERMESSAGE _TESTMESSAGE_FIELDMAP4ENTRY.containing_type = _TESTMESSAGE -_TESTMESSAGE.fields_by_name["field_enum"].enum_type = _TESTMESSAGE_TESTENUM -_TESTMESSAGE.fields_by_name["field_inner_message"].message_type = _TESTMESSAGE_TESTINNERMESSAGE -_TESTMESSAGE.fields_by_name["field_map1"].message_type = _TESTMESSAGE_FIELDMAP1ENTRY -_TESTMESSAGE.fields_by_name["field_map2"].message_type = _TESTMESSAGE_FIELDMAP2ENTRY -_TESTMESSAGE.fields_by_name["field_map3"].message_type = _TESTMESSAGE_FIELDMAP3ENTRY -_TESTMESSAGE.fields_by_name["field_map4"].message_type = _TESTMESSAGE_FIELDMAP4ENTRY +_TESTMESSAGE.fields_by_name['field_enum'].enum_type = _TESTMESSAGE_TESTENUM +_TESTMESSAGE.fields_by_name['field_inner_message'].message_type = _TESTMESSAGE_TESTINNERMESSAGE +_TESTMESSAGE.fields_by_name['field_map1'].message_type = _TESTMESSAGE_FIELDMAP1ENTRY 
+_TESTMESSAGE.fields_by_name['field_map2'].message_type = _TESTMESSAGE_FIELDMAP2ENTRY +_TESTMESSAGE.fields_by_name['field_map3'].message_type = _TESTMESSAGE_FIELDMAP3ENTRY +_TESTMESSAGE.fields_by_name['field_map4'].message_type = _TESTMESSAGE_FIELDMAP4ENTRY _TESTMESSAGE_TESTENUM.containing_type = _TESTMESSAGE -_TESTMESSAGE.oneofs_by_name["test_oneof"].fields.append(_TESTMESSAGE.fields_by_name["oneof1"]) -_TESTMESSAGE.fields_by_name["oneof1"].containing_oneof = _TESTMESSAGE.oneofs_by_name["test_oneof"] -_TESTMESSAGE.oneofs_by_name["test_oneof"].fields.append(_TESTMESSAGE.fields_by_name["oneof2"]) -_TESTMESSAGE.fields_by_name["oneof2"].containing_oneof = _TESTMESSAGE.oneofs_by_name["test_oneof"] -DESCRIPTOR.message_types_by_name["TestMessage"] = _TESTMESSAGE -DESCRIPTOR.message_types_by_name["ExtensionMessage"] = _EXTENSIONMESSAGE +_TESTMESSAGE.oneofs_by_name['test_oneof'].fields.append( + _TESTMESSAGE.fields_by_name['oneof1']) +_TESTMESSAGE.fields_by_name['oneof1'].containing_oneof = _TESTMESSAGE.oneofs_by_name['test_oneof'] +_TESTMESSAGE.oneofs_by_name['test_oneof'].fields.append( + _TESTMESSAGE.fields_by_name['oneof2']) +_TESTMESSAGE.fields_by_name['oneof2'].containing_oneof = _TESTMESSAGE.oneofs_by_name['test_oneof'] +DESCRIPTOR.message_types_by_name['TestMessage'] = _TESTMESSAGE +DESCRIPTOR.message_types_by_name['ExtensionMessage'] = _EXTENSIONMESSAGE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -TestMessage = _reflection.GeneratedProtocolMessageType( - "TestMessage", - (_message.Message,), - dict( - TestInnerMessage=_reflection.GeneratedProtocolMessageType( - "TestInnerMessage", - (_message.Message,), - dict( - DESCRIPTOR=_TESTMESSAGE_TESTINNERMESSAGE, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.TestInnerMessage) - ), - ), - FieldMap1Entry=_reflection.GeneratedProtocolMessageType( - "FieldMap1Entry", - (_message.Message,), - dict( - DESCRIPTOR=_TESTMESSAGE_FIELDMAP1ENTRY, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap1Entry) - ), - ), - FieldMap2Entry=_reflection.GeneratedProtocolMessageType( - "FieldMap2Entry", - (_message.Message,), - dict( - DESCRIPTOR=_TESTMESSAGE_FIELDMAP2ENTRY, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap2Entry) - ), - ), - FieldMap3Entry=_reflection.GeneratedProtocolMessageType( - "FieldMap3Entry", - (_message.Message,), - dict( - DESCRIPTOR=_TESTMESSAGE_FIELDMAP3ENTRY, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap3Entry) - ), - ), - FieldMap4Entry=_reflection.GeneratedProtocolMessageType( - "FieldMap4Entry", - (_message.Message,), - dict( - DESCRIPTOR=_TESTMESSAGE_FIELDMAP4ENTRY, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap4Entry) - ), - ), - DESCRIPTOR=_TESTMESSAGE, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.TestMessage) - ), -) +TestMessage = _reflection.GeneratedProtocolMessageType('TestMessage', (_message.Message,), dict( + + TestInnerMessage = _reflection.GeneratedProtocolMessageType('TestInnerMessage', (_message.Message,), dict( + DESCRIPTOR = _TESTMESSAGE_TESTINNERMESSAGE, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.TestInnerMessage) + )) + , + + FieldMap1Entry = _reflection.GeneratedProtocolMessageType('FieldMap1Entry', (_message.Message,), dict( + DESCRIPTOR = _TESTMESSAGE_FIELDMAP1ENTRY, + __module__ = 
'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap1Entry) + )) + , + + FieldMap2Entry = _reflection.GeneratedProtocolMessageType('FieldMap2Entry', (_message.Message,), dict( + DESCRIPTOR = _TESTMESSAGE_FIELDMAP2ENTRY, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap2Entry) + )) + , + + FieldMap3Entry = _reflection.GeneratedProtocolMessageType('FieldMap3Entry', (_message.Message,), dict( + DESCRIPTOR = _TESTMESSAGE_FIELDMAP3ENTRY, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap3Entry) + )) + , + + FieldMap4Entry = _reflection.GeneratedProtocolMessageType('FieldMap4Entry', (_message.Message,), dict( + DESCRIPTOR = _TESTMESSAGE_FIELDMAP4ENTRY, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage.FieldMap4Entry) + )) + , + DESCRIPTOR = _TESTMESSAGE, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.TestMessage) + )) _sym_db.RegisterMessage(TestMessage) _sym_db.RegisterMessage(TestMessage.TestInnerMessage) _sym_db.RegisterMessage(TestMessage.FieldMap1Entry) @@ -916,18 +532,14 @@ _sym_db.RegisterMessage(TestMessage.FieldMap3Entry) _sym_db.RegisterMessage(TestMessage.FieldMap4Entry) -ExtensionMessage = _reflection.GeneratedProtocolMessageType( - "ExtensionMessage", - (_message.Message,), - dict( - DESCRIPTOR=_EXTENSIONMESSAGE, - __module__="test_message_pb2" - # @@protoc_insertion_point(class_scope:mlflow.ExtensionMessage) - ), -) +ExtensionMessage = _reflection.GeneratedProtocolMessageType('ExtensionMessage', (_message.Message,), dict( + DESCRIPTOR = _EXTENSIONMESSAGE, + __module__ = 'test_message_pb2' + # @@protoc_insertion_point(class_scope:mlflow.ExtensionMessage) + )) _sym_db.RegisterMessage(ExtensionMessage) -TestMessage.RegisterExtension(_EXTENSIONMESSAGE.extensions_by_name["field_extended_int64"]) +TestMessage.RegisterExtension(_EXTENSIONMESSAGE.extensions_by_name['field_extended_int64']) _TESTMESSAGE_FIELDMAP1ENTRY._options = None _TESTMESSAGE_FIELDMAP2ENTRY._options = None From 2a672977bc34cc6ae7f00e1fd93cb533eb65b76e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 22 Nov 2021 14:56:39 +0800 Subject: [PATCH 020/120] change module path Signed-off-by: Weichen Xu --- mlflow/{ => models}/evaluation/__init__.py | 43 +------------------ .../models/evaluation/evaluator_registry.py | 43 +++++++++++++++++++ .../mlflow_test_plugin/dummy_evaluator.py | 2 +- tests/test_evaluation.py | 2 +- 4 files changed, 46 insertions(+), 44 deletions(-) rename mlflow/{ => models}/evaluation/__init__.py (90%) create mode 100644 mlflow/models/evaluation/evaluator_registry.py diff --git a/mlflow/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py similarity index 90% rename from mlflow/evaluation/__init__.py rename to mlflow/models/evaluation/__init__.py index f6c30310d1310..042a3463c8b06 100644 --- a/mlflow/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,6 +1,4 @@ from typing import Dict, Union -import entrypoints -import warnings import mlflow import hashlib import time @@ -15,6 +13,7 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name, load_class from mlflow.pyfunc import PyFuncModel +from mlflow.models.evaluation.evaluator_registry import ModelEvaluatorRegistry class EvaluationMetrics(dict): @@ -312,46 +311,6 @@ def do_evaluate(_run_id): return do_evaluate(run_id) -class 
ModelEvaluatorRegistry: - """ - Scheme-based registry for model evaluator implementations - """ - - def __init__(self): - self._registry = {} - - def register(self, scheme, evaluator): - """Register model evaluator provided by other packages""" - self._registry[scheme] = evaluator - - def register_entrypoints(self): - # Register artifact repositories provided by other packages - for entrypoint in entrypoints.get_group_all("mlflow.model_evaluator"): - try: - self.register(entrypoint.name, entrypoint.load()) - except (AttributeError, ImportError) as exc: - warnings.warn( - 'Failure attempting to register model evaluator for scheme "{}": {}'.format( - entrypoint.name, str(exc) - ), - stacklevel=2, - ) - - def get_evaluator(self, evaluator_name): - """ - Get an evaluator instance from the registry based on the name of evaluator - """ - evaluator_cls = self._registry.get(evaluator_name) - if evaluator_cls is None: - raise MlflowException( - "Could not find a registered model evaluator for: {}. " - "Currently registered evaluator names are: {}".format( - evaluator_name, list(self._registry.keys()) - ) - ) - return evaluator_cls() - - _model_evaluation_registry = ModelEvaluatorRegistry() _model_evaluation_registry.register_entrypoints() diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py new file mode 100644 index 0000000000000..b026898aecde9 --- /dev/null +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -0,0 +1,43 @@ +import entrypoints +import warnings +from mlflow.exceptions import MlflowException + + +class ModelEvaluatorRegistry: + """ + Scheme-based registry for model evaluator implementations + """ + + def __init__(self): + self._registry = {} + + def register(self, scheme, evaluator): + """Register model evaluator provided by other packages""" + self._registry[scheme] = evaluator + + def register_entrypoints(self): + # Register artifact repositories provided by other packages + for entrypoint in entrypoints.get_group_all("mlflow.model_evaluator"): + try: + self.register(entrypoint.name, entrypoint.load()) + except (AttributeError, ImportError) as exc: + warnings.warn( + 'Failure attempting to register model evaluator for scheme "{}": {}'.format( + entrypoint.name, str(exc) + ), + stacklevel=2, + ) + + def get_evaluator(self, evaluator_name): + """ + Get an evaluator instance from the registry based on the name of evaluator + """ + evaluator_cls = self._registry.get(evaluator_name) + if evaluator_cls is None: + raise MlflowException( + "Could not find a registered model evaluator for: {}. 
" + "Currently registered evaluator names are: {}".format( + evaluator_name, list(self._registry.keys()) + ) + ) + return evaluator_cls() diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 8835da6425adc..b1fcdab974078 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -1,5 +1,5 @@ import mlflow -from mlflow.evaluation import ( +from mlflow.models.evaluation import ( ModelEvaluator, EvaluationMetrics, EvaluationArtifact, diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 65b44d149d324..d79240f04d171 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,6 +1,6 @@ import mlflow -from mlflow.evaluation import evaluate, EvaluationDataset +from mlflow.models.evaluation import evaluate, EvaluationDataset import sklearn import sklearn.datasets import sklearn.linear_model From 66f760ded8511f66bc69f6e7e4867ee090bbf855 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 22 Nov 2021 23:33:51 +0800 Subject: [PATCH 021/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 384 +-------------------------- mlflow/models/evaluation/base.py | 375 ++++++++++++++++++++++++++ 2 files changed, 384 insertions(+), 375 deletions(-) create mode 100644 mlflow/models/evaluation/base.py diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 042a3463c8b06..476e0f28f6e63 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,375 +1,9 @@ -from typing import Dict, Union -import mlflow -import hashlib -import time -import numpy as np -import pandas as pd -import pickle -import json -import os -from mlflow.exceptions import MlflowException -from mlflow.utils.file_utils import TempDir -from mlflow.entities import Metric, RunTag -from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils import _get_fully_qualified_class_name, load_class -from mlflow.pyfunc import PyFuncModel -from mlflow.models.evaluation.evaluator_registry import ModelEvaluatorRegistry - - -class EvaluationMetrics(dict): - pass - - -class EvaluationArtifact: - def __init__(self, location, content=None): - self._content = content - self._location = location - - @classmethod - def load_content_from_file(cls, local_artifact_path): - raise NotImplementedError() - - @classmethod - def save_content_to_file(cls, content, output_artifact_path): - raise NotImplementedError() - - @property - def content(self): - """ - The content of the artifact (representation varies) - """ - if self._content is None: - with TempDir() as temp_dir: - local_artifact_file = temp_dir.path("local_artifact") - _download_artifact_from_uri(self._location, local_artifact_file) - self._content = self.load_content_from_file(local_artifact_file) - - return self._content - - @property - def location(self) -> str: - """ - The location of the artifact - """ - return self._location - - -class EvaluationResult: - def __init__(self, metrics, artifacts): - self._metrics = metrics - self._artifacts = artifacts - - @classmethod - def load(cls, path): - """Load the evaluation results from the specified local filesystem path""" - with open(os.path.join(path, "metrics.json"), "r") as fp: - metrics = EvaluationMetrics(json.load(fp)) - - with open(os.path.join(path, "artifacts_metadata.json"), "r") 
as fp: - artifacts_metadata = json.load(fp) - - artifacts = {} - - for artifact_name, meta in artifacts_metadata: - location = meta["location"] - ArtifactCls = load_class(meta["class_name"]) - content = ArtifactCls.load_content_from_file(os.path.join(path, artifact_name)) - artifacts[artifact_name] = ArtifactCls(location=location, content=content) - - return EvaluationResult(metrics=metrics, artifacts=artifacts) - - def save(self, path): - """Write the evaluation results to the specified local filesystem path""" - os.makedirs(path, exist_ok=True) - with open(os.path.join(path, "metrics.json"), "w") as fp: - json.dump(self.metrics, fp) - - artifacts_metadata = { - artifact_name: { - "location": artifact.location, - "class_name": _get_fully_qualified_class_name(artifact), - } - for artifact_name, artifact in self.artifacts.items() - } - with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: - json.dump(artifacts_metadata, fp) - - for artifact_name, artifact in self.artifacts.items(): - artifact.save_content_to_file(artifact.content, os.path.join(path, artifact_name)) - - @property - def metrics(self) -> EvaluationMetrics: - """ - A dictionary mapping scalar metric names to scalar metric values - """ - return self._metrics - - @property - def artifacts(self) -> Dict[str, EvaluationArtifact]: - """ - A dictionary mapping standardized artifact names (e.g. "roc_data") to - artifact content and location information - """ - return self._artifacts - - -class EvaluationDataset: - """ - Represents an input dataset for model evaluation. This is intended for - use with the `mlflow.evaluate()`API. - """ - - NUM_SAMPLE_ROWS_FOR_HASH = 5 - - def __init__(self, data, labels, name=None, path=None): - """ - :param data: One of the following: - - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or the path to a serialized DataFrame, - containing evaluation features and labels. All columns will be regarded as feature - columns except the "labels" column. - - :param labels: One of the following: - - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. - - The string name of a column from `data` that contains evaluation labels, if `data` - is a DataFrame. - - :param name: (Optional) The name of the dataset (must not contain "). - - :param path: (Optional) the path to a serialized DataFrame (must not contain "). - (e.g. 
a delta table, parquet file) - """ - if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include " but get name {name}') - if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include " but get name {path}') - - if isinstance(data, (np.ndarray, list)): - if not isinstance(labels, (np.ndarray, list)): - raise ValueError( - 'If data is a numpy array or list of evaluation features, ' - 'labels must be a numpy array or list of evaluation labels' - ) - elif isinstance(data, pd.DataFrame): - if not isinstance(labels, str): - raise ValueError( - 'If data is a Pandas DataFrame, labels must be the string name of a column ' - 'from `data` that contains evaluation labels' - ) - else: - raise ValueError('The data argument must be a numpy array, a list or a ' - 'Pandas DataFrame.') - - self._user_specified_name = name - self.data = data - self.labels = labels - self.path = path - self._hash = None - - def _extract_features_and_labels(self): - if isinstance(self.data, np.ndarray): - return self.data, self.labels - elif isinstance(self.data, pd.DataFrame): - feature_cols = [x for x in self.data.columns if x != self.labels] - return self.data[feature_cols], self.data[self.labels] - else: - raise ValueError(f'Unsupported data type: {type(self.data)}') - - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): - md5_gen.update(pickle.dumps(len(data))) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(pickle.dumps(data)) - else: - md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) - md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) - - @property - def name(self): - return self._user_specified_name if self._user_specified_name is not None else self.hash - - @property - def hash(self): - """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function - """ - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode("UTF-8")) - self._hash = md5_gen.hexdigest() - return self._hash - - @property - def _metadata(self): - metadata = { - "name": self.name, - "hash": self.hash, - } - if self.path is not None: - metadata["path"] = self.path - return metadata - - -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: - """ - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: True if the evaluator can evaluate the specified model on the - specified dataset. False otherwise. 
- """ - raise NotImplementedError() - - def compute_metrics_and_compute_and_log_artifacts( - self, model, model_type, dataset, evaluator_config, run_id - ): - """ - return an tuple of: - - an instance of EvaluationMetrics - - a dict of artifact_name -> instance_of_EvaluationArtifact - and log artifacts into run specified by run_id - """ - raise NotImplementedError() - - def evaluate( - self, - model: PyFuncModel, - model_type, - dataset, - run_id=None, - evaluator_config=None, - **kwargs - ) -> EvaluationResult: - """ - :param model: A pyfunc model instance. - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param dataset: An instance of `EvaluationDataset` containing features - and labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: An `EvaluationResult` instance containing evaluation results. - """ - client = mlflow.tracking.MlflowClient() - self.mlflow_client = client - - def do_evaluate(_run_id): - timestamp = int(time.time() * 1000) - existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") - if existing_dataset_metadata_str is not None: - dataset_metadata_list = json.loads(existing_dataset_metadata_str) - else: - dataset_metadata_list = [] - - metadata_exists = False - for metadata in dataset_metadata_list: - if ( - metadata["hash"] == dataset.hash - and metadata["name"] == dataset._user_specified_name - ): - metadata_exists = True - break - - if not metadata_exists: - dataset_metadata_list.append(dataset._metadata) - - dataset_metadata_str = json.dumps(dataset_metadata_list) - - metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model, model_type, dataset, evaluator_config, _run_id - ) - client.log_batch( - _run_id, - metrics=[ - Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) - for key, value in metrics_dict.items() - ], - tags=[RunTag("mlflow.datasets", dataset_metadata_str)], - ) - - return EvaluationResult(metrics_dict, artifacts_dict) - - if mlflow.active_run() is not None: - return do_evaluate(mlflow.active_run().info.run_id) - else: - with mlflow.start_run(run_id=run_id): - return do_evaluate(run_id) - - -_model_evaluation_registry = ModelEvaluatorRegistry() -_model_evaluation_registry.register_entrypoints() - - -def evaluate( - model: Union[str, PyFuncModel], - model_type, dataset, - run_id=None, - evaluators=None, - evaluator_config=None -) -> EvaluationResult: - """ - :param model: A pyfunc model instance, or a URI referring to such a model. - - :param model_type: A string describing the model type. The default evaluator - supports "regressor" and "classifier" as model types. - :param dataset: An instance of `EvaluationDataset` containing features - labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. If - unspecified, behavior depends on the specified `evaluator`. - When `run_id` is unspecified, the default evaluator logs - results to the current active run, creating a new active run if - one does not exist. - :param evaluators: The name of the evaluator to use for model evaluations, or - a list of evaluator names. 
If unspecified, all evaluators - capable of evaluating the specified model on the specified - dataset are used. The default evaluator can be referred to - by the name 'default'. - :param evaluator_config: A dictionary of additional configurations to supply - to the evaluator. If multiple evaluators are - specified, each configuration should be supplied as - a nested dictionary whose key is the evaluator name. - :return: An `EvaluationResult` instance containing evaluation results. - """ - if evaluators is None: - evaluators = "default" - - if not isinstance(evaluators, list): - evaluators = [evaluators] - evaluator_config = {evaluators[0]: evaluator_config} - - if isinstance(model, str): - model = mlflow.pyfunc.load_model(model) - - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config[evaluator_name] - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - continue - - if evaluator.can_evaluate(model_type, config): - result = evaluator.evaluate(model, model_type, dataset, run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result +from .base import ( + _model_evaluation_registry, + evaluate, + EvaluationDataset, + EvaluationArtifact, + EvaluationMetrics, + ModelEvaluator +) +from .default_evaluator import DefaultEvaluator diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py new file mode 100644 index 0000000000000..b451e42f81933 --- /dev/null +++ b/mlflow/models/evaluation/base.py @@ -0,0 +1,375 @@ +from typing import Dict, Union +import mlflow +import hashlib +import time +import numpy as np +import pandas as pd +import pickle +import json +import os +from mlflow.exceptions import MlflowException +from mlflow.utils.file_utils import TempDir +from mlflow.entities import Metric, RunTag +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name, load_class +from mlflow.pyfunc import PyFuncModel +from mlflow.models.evaluation.evaluator_registry import ModelEvaluatorRegistry + + +class EvaluationMetrics(dict): + pass + + +class EvaluationArtifact: + def __init__(self, location, content=None): + self._content = content + self._location = location + + @classmethod + def load_content_from_file(cls, local_artifact_path): + raise NotImplementedError() + + @classmethod + def save_content_to_file(cls, content, output_artifact_path): + raise NotImplementedError() + + @property + def content(self): + """ + The content of the artifact (representation varies) + """ + if self._content is None: + with TempDir() as temp_dir: + local_artifact_file = temp_dir.path("local_artifact") + _download_artifact_from_uri(self._location, local_artifact_file) + self._content = self.load_content_from_file(local_artifact_file) + + return self._content + + @property + def location(self) -> str: + """ + The location of the artifact + """ + return self._location + + +class EvaluationResult: + def __init__(self, metrics, artifacts): + self._metrics = metrics + self._artifacts = artifacts + + @classmethod + def load(cls, path): + """Load the evaluation results from the specified local filesystem path""" + with open(os.path.join(path, "metrics.json"), "r") as fp: + metrics = EvaluationMetrics(json.load(fp)) + + with 
open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: + artifacts_metadata = json.load(fp) + + artifacts = {} + + for artifact_name, meta in artifacts_metadata: + location = meta["location"] + ArtifactCls = load_class(meta["class_name"]) + content = ArtifactCls.load_content_from_file(os.path.join(path, artifact_name)) + artifacts[artifact_name] = ArtifactCls(location=location, content=content) + + return EvaluationResult(metrics=metrics, artifacts=artifacts) + + def save(self, path): + """Write the evaluation results to the specified local filesystem path""" + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, "metrics.json"), "w") as fp: + json.dump(self.metrics, fp) + + artifacts_metadata = { + artifact_name: { + "location": artifact.location, + "class_name": _get_fully_qualified_class_name(artifact), + } + for artifact_name, artifact in self.artifacts.items() + } + with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: + json.dump(artifacts_metadata, fp) + + for artifact_name, artifact in self.artifacts.items(): + artifact.save_content_to_file(artifact.content, os.path.join(path, artifact_name)) + + @property + def metrics(self) -> EvaluationMetrics: + """ + A dictionary mapping scalar metric names to scalar metric values + """ + return self._metrics + + @property + def artifacts(self) -> Dict[str, EvaluationArtifact]: + """ + A dictionary mapping standardized artifact names (e.g. "roc_data") to + artifact content and location information + """ + return self._artifacts + + +class EvaluationDataset: + """ + Represents an input dataset for model evaluation. This is intended for + use with the `mlflow.evaluate()`API. + """ + + NUM_SAMPLE_ROWS_FOR_HASH = 5 + + def __init__(self, data, labels, name=None, path=None): + """ + :param data: One of the following: + - A numpy array or list of evaluation features, excluding labels. + - A Pandas DataFrame, or the path to a serialized DataFrame, + containing evaluation features and labels. All columns will be regarded as feature + columns except the "labels" column. + + :param labels: One of the following: + - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. + - The string name of a column from `data` that contains evaluation labels, if `data` + is a DataFrame. + + :param name: (Optional) The name of the dataset (must not contain "). + + :param path: (Optional) the path to a serialized DataFrame (must not contain "). + (e.g. 
a delta table, parquet file) + """ + if name is not None and '"' in name: + raise ValueError(f'Dataset name cannot include " but get name {name}') + if path is not None and '"' in path: + raise ValueError(f'Dataset path cannot include " but get name {path}') + + if isinstance(data, (np.ndarray, list)): + if not isinstance(labels, (np.ndarray, list)): + raise ValueError( + 'If data is a numpy array or list of evaluation features, ' + 'labels must be a numpy array or list of evaluation labels' + ) + elif isinstance(data, pd.DataFrame): + if not isinstance(labels, str): + raise ValueError( + 'If data is a Pandas DataFrame, labels must be the string name of a column ' + 'from `data` that contains evaluation labels' + ) + else: + raise ValueError('The data argument must be a numpy array, a list or a ' + 'Pandas DataFrame.') + + self._user_specified_name = name + self.data = data + self.labels = labels + self.path = path + self._hash = None + + def _extract_features_and_labels(self): + if isinstance(self.data, np.ndarray): + return self.data, self.labels + elif isinstance(self.data, pd.DataFrame): + feature_cols = [x for x in self.data.columns if x != self.labels] + return self.data[feature_cols], self.data[self.labels] + else: + raise ValueError(f'Unsupported data type: {type(self.data)}') + + @staticmethod + def _gen_md5_for_arraylike_obj(md5_gen, data): + md5_gen.update(pickle.dumps(len(data))) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(pickle.dumps(data)) + else: + md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) + md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) + + @property + def name(self): + return self._user_specified_name if self._user_specified_name is not None else self.hash + + @property + def hash(self): + """ + Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, + dataset size and feeding them through a cheap, low-collision hash function + """ + if self._hash is None: + md5_gen = hashlib.md5() + if isinstance(self.data, np.ndarray): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) + elif isinstance(self.data, pd.DataFrame): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + md5_gen.update(self.labels.encode("UTF-8")) + self._hash = md5_gen.hexdigest() + return self._hash + + @property + def _metadata(self): + metadata = { + "name": self.name, + "hash": self.hash, + } + if self.path is not None: + metadata["path"] = self.path + return metadata + + +class ModelEvaluator: + def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: + """ + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: True if the evaluator can evaluate the specified model on the + specified dataset. False otherwise. 
+ """ + raise NotImplementedError() + + def compute_metrics_and_compute_and_log_artifacts( + self, model, model_type, dataset, evaluator_config, run_id + ): + """ + return an tuple of: + - an instance of EvaluationMetrics + - a dict of artifact_name -> instance_of_EvaluationArtifact + and log artifacts into run specified by run_id + """ + raise NotImplementedError() + + def evaluate( + self, + model: PyFuncModel, + model_type, + dataset, + run_id=None, + evaluator_config=None, + **kwargs + ) -> EvaluationResult: + """ + :param model: A pyfunc model instance. + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param dataset: An instance of `EvaluationDataset` containing features + and labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: An `EvaluationResult` instance containing evaluation results. + """ + client = mlflow.tracking.MlflowClient() + self.mlflow_client = client + + def do_evaluate(_run_id): + timestamp = int(time.time() * 1000) + existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") + if existing_dataset_metadata_str is not None: + dataset_metadata_list = json.loads(existing_dataset_metadata_str) + else: + dataset_metadata_list = [] + + metadata_exists = False + for metadata in dataset_metadata_list: + if ( + metadata["hash"] == dataset.hash + and metadata["name"] == dataset._user_specified_name + ): + metadata_exists = True + break + + if not metadata_exists: + dataset_metadata_list.append(dataset._metadata) + + dataset_metadata_str = json.dumps(dataset_metadata_list) + + metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( + model, model_type, dataset, evaluator_config, _run_id + ) + client.log_batch( + _run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics_dict.items() + ], + tags=[RunTag("mlflow.datasets", dataset_metadata_str)], + ) + + return EvaluationResult(metrics_dict, artifacts_dict) + + if mlflow.active_run() is not None: + return do_evaluate(mlflow.active_run().info.run_id) + else: + with mlflow.start_run(run_id=run_id) as run: + return do_evaluate(run.info.run_id) + + +_model_evaluation_registry = ModelEvaluatorRegistry() +_model_evaluation_registry.register_entrypoints() + + +def evaluate( + model: Union[str, PyFuncModel], + model_type, dataset, + run_id=None, + evaluators=None, + evaluator_config=None +) -> EvaluationResult: + """ + :param model: A pyfunc model instance, or a URI referring to such a model. + + :param model_type: A string describing the model type. The default evaluator + supports "regressor" and "classifier" as model types. + :param dataset: An instance of `EvaluationDataset` containing features + labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. If + unspecified, behavior depends on the specified `evaluator`. + When `run_id` is unspecified, the default evaluator logs + results to the current active run, creating a new active run if + one does not exist. + :param evaluators: The name of the evaluator to use for model evaluations, or + a list of evaluator names. 
If unspecified, all evaluators + capable of evaluating the specified model on the specified + dataset are used. The default evaluator can be referred to + by the name 'default'. + :param evaluator_config: A dictionary of additional configurations to supply + to the evaluator. If multiple evaluators are + specified, each configuration should be supplied as + a nested dictionary whose key is the evaluator name. + :return: An `EvaluationResult` instance containing evaluation results. + """ + if evaluators is None: + evaluators = "default" + + if not isinstance(evaluators, list): + evaluators = [evaluators] + evaluator_config = {evaluators[0]: evaluator_config} + + if isinstance(model, str): + model = mlflow.pyfunc.load_model(model) + + eval_results = [] + for evaluator_name in evaluators: + config = evaluator_config[evaluator_name] + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + continue + + if evaluator.can_evaluate(model_type, config): + result = evaluator.evaluate(model, model_type, dataset, run_id, config) + eval_results.append(result) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) + + return merged_eval_result From f7d673833c6f9067cd2f1264af30cae358194be9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 23 Nov 2021 21:37:33 +0800 Subject: [PATCH 022/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 389 +++++++++++++++++- mlflow/models/evaluation/base.py | 375 ----------------- .../models/evaluation/evaluator_registry.py | 12 + .../mlflow_test_plugin/dummy_evaluator.py | 16 +- tests/test_evaluation.py | 7 +- 5 files changed, 403 insertions(+), 396 deletions(-) delete mode 100644 mlflow/models/evaluation/base.py diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 476e0f28f6e63..beb42fda6735e 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,9 +1,380 @@ -from .base import ( - _model_evaluation_registry, - evaluate, - EvaluationDataset, - EvaluationArtifact, - EvaluationMetrics, - ModelEvaluator -) -from .default_evaluator import DefaultEvaluator +from typing import Dict, Union +import mlflow +import hashlib +import time +import numpy as np +import pandas as pd +import pickle +import json +import os +from mlflow.exceptions import MlflowException +from mlflow.utils.file_utils import TempDir +from mlflow.entities import Metric, RunTag +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name, load_class +from mlflow.pyfunc import PyFuncModel + + +class EvaluationMetrics(dict): + pass + + +class EvaluationArtifact: + def __init__(self, location, content=None): + self._content = content + self._location = location + + def _load_content_from_file(self, local_artifact_path): + raise NotImplementedError() + + def _save_content_to_file(self, output_artifact_path): + raise NotImplementedError() + + @property + def content(self): + """ + The content of the artifact (representation varies) + """ + if self._content is None: + with TempDir() as temp_dir: + local_artifact_file = temp_dir.path("local_artifact") + _download_artifact_from_uri(self._location, local_artifact_file) + self._load_content_from_file(local_artifact_file) + + return self._content + + @property + def 
location(self) -> str: + """ + The location of the artifact + """ + return self._location + + +class EvaluationResult: + def __init__(self, metrics, artifacts): + self._metrics = metrics + self._artifacts = artifacts + + @classmethod + def load(cls, path): + """Load the evaluation results from the specified local filesystem path""" + with open(os.path.join(path, "metrics.json"), "r") as fp: + metrics = EvaluationMetrics(json.load(fp)) + + with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: + artifacts_metadata = json.load(fp) + + artifacts = {} + + artifacts_dir = os.path.join(path, 'artifacts') + + for artifact_name, meta in artifacts_metadata: + location = meta["location"] + ArtifactCls = load_class(meta["class_name"]) + content = ArtifactCls.load_content_from_file( + os.path.join(artifacts_dir, artifact_name) + ) + artifacts[artifact_name] = ArtifactCls(location=location, content=content) + + return EvaluationResult(metrics=metrics, artifacts=artifacts) + + def save(self, path): + """Write the evaluation results to the specified local filesystem path""" + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, "metrics.json"), "w") as fp: + json.dump(self.metrics, fp) + + artifacts_metadata = { + artifact_name: { + "location": artifact.location, + "class_name": _get_fully_qualified_class_name(artifact), + } + for artifact_name, artifact in self.artifacts.items() + } + with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: + json.dump(artifacts_metadata, fp) + + artifacts_dir = os.path.join(path, 'artifacts') + os.mkdir(artifacts_dir) + + for artifact_name, artifact in self.artifacts.items(): + artifact._save_content_to_file( + os.path.join(artifacts_dir, artifact_name) + ) + + @property + def metrics(self) -> EvaluationMetrics: + """ + A dictionary mapping scalar metric names to scalar metric values + """ + return self._metrics + + @property + def artifacts(self) -> Dict[str, EvaluationArtifact]: + """ + A dictionary mapping standardized artifact names (e.g. "roc_data") to + artifact content and location information + """ + return self._artifacts + + +class EvaluationDataset: + """ + Represents an input dataset for model evaluation. This is intended for + use with the `mlflow.evaluate()`API. + """ + + NUM_SAMPLE_ROWS_FOR_HASH = 5 + + def __init__(self, data, labels, name=None, path=None): + """ + :param data: One of the following: + - A numpy array or list of evaluation features, excluding labels. + - A Pandas DataFrame, or the path to a serialized DataFrame, + containing evaluation features and labels. All columns will be regarded as feature + columns except the "labels" column. + + :param labels: One of the following: + - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. + - The string name of a column from `data` that contains evaluation labels, if `data` + is a DataFrame. + + :param name: (Optional) The name of the dataset (must not contain "). + + :param path: (Optional) the path to a serialized DataFrame (must not contain "). + (e.g. 
a delta table, parquet file) + """ + if name is not None and '"' in name: + raise ValueError(f'Dataset name cannot include " but got name {name}') + if path is not None and '"' in path: + raise ValueError(f'Dataset path cannot include " but got name {path}') + + if isinstance(data, (np.ndarray, list)): + if not isinstance(labels, (np.ndarray, list)): + raise ValueError( + 'If data is a numpy array or list of evaluation features, ' + 'labels must be a numpy array or list of evaluation labels' + ) + elif isinstance(data, pd.DataFrame): + if not isinstance(labels, str): + raise ValueError( + 'If data is a Pandas DataFrame, labels must be the string name of a column ' + 'from `data` that contains evaluation labels' + ) + else: + raise ValueError('The data argument must be a numpy array, a list or a ' + 'Pandas DataFrame.') + + self._user_specified_name = name + self.data = data + self.labels = labels + self.path = path + self._hash = None + + def _extract_features_and_labels(self): + if isinstance(self.data, np.ndarray): + return self.data, self.labels + elif isinstance(self.data, pd.DataFrame): + feature_cols = [x for x in self.data.columns if x != self.labels] + return self.data[feature_cols], self.data[self.labels] + else: + raise ValueError(f'Unsupported data type: {type(self.data)}') + + @staticmethod + def _gen_md5_for_arraylike_obj(md5_gen, data): + md5_gen.update(pickle.dumps(len(data))) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(pickle.dumps(data)) + else: + md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) + md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) + + @property + def name(self): + return self._user_specified_name if self._user_specified_name is not None else self.hash + + @property + def hash(self): + """ + Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, + dataset size and feeding them through a cheap, low-collision hash function + """ + if self._hash is None: + md5_gen = hashlib.md5() + if isinstance(self.data, np.ndarray): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) + elif isinstance(self.data, pd.DataFrame): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + md5_gen.update(self.labels.encode("UTF-8")) + self._hash = md5_gen.hexdigest() + return self._hash + + @property + def _metadata(self): + metadata = { + "name": self.name, + "hash": self.hash, + } + if self.path is not None: + metadata["path"] = self.path + return metadata + + +class ModelEvaluator: + def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: + """ + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: True if the evaluator can evaluate the specified model on the + specified dataset. False otherwise. 
+ """ + raise NotImplementedError() + + def compute_metrics_and_compute_and_log_artifacts( + self, model, model_type, dataset, evaluator_config, run_id + ): + """ + return an tuple of: + - an instance of EvaluationMetrics + - a dict of artifact_name -> instance_of_EvaluationArtifact + and log artifacts into run specified by run_id + """ + raise NotImplementedError() + + def evaluate( + self, + model: PyFuncModel, + model_type, + dataset, + run_id=None, + evaluator_config=None, + **kwargs + ) -> EvaluationResult: + """ + :param model: A pyfunc model instance. + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param dataset: An instance of `EvaluationDataset` containing features + and labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: An `EvaluationResult` instance containing evaluation results. + """ + client = mlflow.tracking.MlflowClient() + self.mlflow_client = client + + def do_evaluate(_run_id): + timestamp = int(time.time() * 1000) + existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") + if existing_dataset_metadata_str is not None: + dataset_metadata_list = json.loads(existing_dataset_metadata_str) + else: + dataset_metadata_list = [] + + metadata_exists = False + for metadata in dataset_metadata_list: + if ( + metadata["hash"] == dataset.hash + and metadata["name"] == dataset._user_specified_name + ): + metadata_exists = True + break + + if not metadata_exists: + dataset_metadata_list.append(dataset._metadata) + + dataset_metadata_str = json.dumps(dataset_metadata_list) + + metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( + model, model_type, dataset, evaluator_config, _run_id + ) + client.log_batch( + _run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics_dict.items() + ], + tags=[RunTag("mlflow.datasets", dataset_metadata_str)], + ) + + return EvaluationResult(metrics_dict, artifacts_dict) + + if mlflow.active_run() is not None: + return do_evaluate(mlflow.active_run().info.run_id) + else: + with mlflow.start_run(run_id=run_id) as run: + return do_evaluate(run.info.run_id) + + +def evaluate( + model: Union[str, PyFuncModel], + model_type, dataset, + run_id=None, + evaluators=None, + evaluator_config=None +) -> EvaluationResult: + """ + :param model: A pyfunc model instance, or a URI referring to such a model. + + :param model_type: A string describing the model type. The default evaluator + supports "regressor" and "classifier" as model types. + :param dataset: An instance of `EvaluationDataset` containing features + labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. If + unspecified, behavior depends on the specified `evaluator`. + When `run_id` is unspecified, the default evaluator logs + results to the current active run, creating a new active run if + one does not exist. + :param evaluators: The name of the evaluator to use for model evaluations, or + a list of evaluator names. If unspecified, all evaluators + capable of evaluating the specified model on the specified + dataset are used. 
The default evaluator can be referred to + by the name 'default'. + :param evaluator_config: A dictionary of additional configurations to supply + to the evaluator. If multiple evaluators are + specified, each configuration should be supplied as + a nested dictionary whose key is the evaluator name. + :return: An `EvaluationResult` instance containing evaluation results. + """ + # import _model_evaluation_registry inside function to avoid circuit importing + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + if evaluators is None: + evaluators = "default" + + if not isinstance(evaluators, list): + evaluators = [evaluators] + evaluator_config = {evaluators[0]: evaluator_config} + + if isinstance(model, str): + model = mlflow.pyfunc.load_model(model) + + eval_results = [] + for evaluator_name in evaluators: + config = evaluator_config[evaluator_name] + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + continue + + if evaluator.can_evaluate(model_type, config): + result = evaluator.evaluate(model, model_type, dataset, run_id, config) + eval_results.append(result) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) + + return merged_eval_result diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py deleted file mode 100644 index b451e42f81933..0000000000000 --- a/mlflow/models/evaluation/base.py +++ /dev/null @@ -1,375 +0,0 @@ -from typing import Dict, Union -import mlflow -import hashlib -import time -import numpy as np -import pandas as pd -import pickle -import json -import os -from mlflow.exceptions import MlflowException -from mlflow.utils.file_utils import TempDir -from mlflow.entities import Metric, RunTag -from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils import _get_fully_qualified_class_name, load_class -from mlflow.pyfunc import PyFuncModel -from mlflow.models.evaluation.evaluator_registry import ModelEvaluatorRegistry - - -class EvaluationMetrics(dict): - pass - - -class EvaluationArtifact: - def __init__(self, location, content=None): - self._content = content - self._location = location - - @classmethod - def load_content_from_file(cls, local_artifact_path): - raise NotImplementedError() - - @classmethod - def save_content_to_file(cls, content, output_artifact_path): - raise NotImplementedError() - - @property - def content(self): - """ - The content of the artifact (representation varies) - """ - if self._content is None: - with TempDir() as temp_dir: - local_artifact_file = temp_dir.path("local_artifact") - _download_artifact_from_uri(self._location, local_artifact_file) - self._content = self.load_content_from_file(local_artifact_file) - - return self._content - - @property - def location(self) -> str: - """ - The location of the artifact - """ - return self._location - - -class EvaluationResult: - def __init__(self, metrics, artifacts): - self._metrics = metrics - self._artifacts = artifacts - - @classmethod - def load(cls, path): - """Load the evaluation results from the specified local filesystem path""" - with open(os.path.join(path, "metrics.json"), "r") as fp: - metrics = EvaluationMetrics(json.load(fp)) - - with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: - artifacts_metadata = json.load(fp) - - artifacts = {} - - for 
artifact_name, meta in artifacts_metadata: - location = meta["location"] - ArtifactCls = load_class(meta["class_name"]) - content = ArtifactCls.load_content_from_file(os.path.join(path, artifact_name)) - artifacts[artifact_name] = ArtifactCls(location=location, content=content) - - return EvaluationResult(metrics=metrics, artifacts=artifacts) - - def save(self, path): - """Write the evaluation results to the specified local filesystem path""" - os.makedirs(path, exist_ok=True) - with open(os.path.join(path, "metrics.json"), "w") as fp: - json.dump(self.metrics, fp) - - artifacts_metadata = { - artifact_name: { - "location": artifact.location, - "class_name": _get_fully_qualified_class_name(artifact), - } - for artifact_name, artifact in self.artifacts.items() - } - with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: - json.dump(artifacts_metadata, fp) - - for artifact_name, artifact in self.artifacts.items(): - artifact.save_content_to_file(artifact.content, os.path.join(path, artifact_name)) - - @property - def metrics(self) -> EvaluationMetrics: - """ - A dictionary mapping scalar metric names to scalar metric values - """ - return self._metrics - - @property - def artifacts(self) -> Dict[str, EvaluationArtifact]: - """ - A dictionary mapping standardized artifact names (e.g. "roc_data") to - artifact content and location information - """ - return self._artifacts - - -class EvaluationDataset: - """ - Represents an input dataset for model evaluation. This is intended for - use with the `mlflow.evaluate()`API. - """ - - NUM_SAMPLE_ROWS_FOR_HASH = 5 - - def __init__(self, data, labels, name=None, path=None): - """ - :param data: One of the following: - - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or the path to a serialized DataFrame, - containing evaluation features and labels. All columns will be regarded as feature - columns except the "labels" column. - - :param labels: One of the following: - - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. - - The string name of a column from `data` that contains evaluation labels, if `data` - is a DataFrame. - - :param name: (Optional) The name of the dataset (must not contain "). - - :param path: (Optional) the path to a serialized DataFrame (must not contain "). - (e.g. 
a delta table, parquet file) - """ - if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include " but get name {name}') - if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include " but get name {path}') - - if isinstance(data, (np.ndarray, list)): - if not isinstance(labels, (np.ndarray, list)): - raise ValueError( - 'If data is a numpy array or list of evaluation features, ' - 'labels must be a numpy array or list of evaluation labels' - ) - elif isinstance(data, pd.DataFrame): - if not isinstance(labels, str): - raise ValueError( - 'If data is a Pandas DataFrame, labels must be the string name of a column ' - 'from `data` that contains evaluation labels' - ) - else: - raise ValueError('The data argument must be a numpy array, a list or a ' - 'Pandas DataFrame.') - - self._user_specified_name = name - self.data = data - self.labels = labels - self.path = path - self._hash = None - - def _extract_features_and_labels(self): - if isinstance(self.data, np.ndarray): - return self.data, self.labels - elif isinstance(self.data, pd.DataFrame): - feature_cols = [x for x in self.data.columns if x != self.labels] - return self.data[feature_cols], self.data[self.labels] - else: - raise ValueError(f'Unsupported data type: {type(self.data)}') - - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): - md5_gen.update(pickle.dumps(len(data))) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(pickle.dumps(data)) - else: - md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) - md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) - - @property - def name(self): - return self._user_specified_name if self._user_specified_name is not None else self.hash - - @property - def hash(self): - """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function - """ - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode("UTF-8")) - self._hash = md5_gen.hexdigest() - return self._hash - - @property - def _metadata(self): - metadata = { - "name": self.name, - "hash": self.hash, - } - if self.path is not None: - metadata["path"] = self.path - return metadata - - -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: - """ - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: True if the evaluator can evaluate the specified model on the - specified dataset. False otherwise. 
- """ - raise NotImplementedError() - - def compute_metrics_and_compute_and_log_artifacts( - self, model, model_type, dataset, evaluator_config, run_id - ): - """ - return an tuple of: - - an instance of EvaluationMetrics - - a dict of artifact_name -> instance_of_EvaluationArtifact - and log artifacts into run specified by run_id - """ - raise NotImplementedError() - - def evaluate( - self, - model: PyFuncModel, - model_type, - dataset, - run_id=None, - evaluator_config=None, - **kwargs - ) -> EvaluationResult: - """ - :param model: A pyfunc model instance. - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param dataset: An instance of `EvaluationDataset` containing features - and labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: An `EvaluationResult` instance containing evaluation results. - """ - client = mlflow.tracking.MlflowClient() - self.mlflow_client = client - - def do_evaluate(_run_id): - timestamp = int(time.time() * 1000) - existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") - if existing_dataset_metadata_str is not None: - dataset_metadata_list = json.loads(existing_dataset_metadata_str) - else: - dataset_metadata_list = [] - - metadata_exists = False - for metadata in dataset_metadata_list: - if ( - metadata["hash"] == dataset.hash - and metadata["name"] == dataset._user_specified_name - ): - metadata_exists = True - break - - if not metadata_exists: - dataset_metadata_list.append(dataset._metadata) - - dataset_metadata_str = json.dumps(dataset_metadata_list) - - metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model, model_type, dataset, evaluator_config, _run_id - ) - client.log_batch( - _run_id, - metrics=[ - Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) - for key, value in metrics_dict.items() - ], - tags=[RunTag("mlflow.datasets", dataset_metadata_str)], - ) - - return EvaluationResult(metrics_dict, artifacts_dict) - - if mlflow.active_run() is not None: - return do_evaluate(mlflow.active_run().info.run_id) - else: - with mlflow.start_run(run_id=run_id) as run: - return do_evaluate(run.info.run_id) - - -_model_evaluation_registry = ModelEvaluatorRegistry() -_model_evaluation_registry.register_entrypoints() - - -def evaluate( - model: Union[str, PyFuncModel], - model_type, dataset, - run_id=None, - evaluators=None, - evaluator_config=None -) -> EvaluationResult: - """ - :param model: A pyfunc model instance, or a URI referring to such a model. - - :param model_type: A string describing the model type. The default evaluator - supports "regressor" and "classifier" as model types. - :param dataset: An instance of `EvaluationDataset` containing features - labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. If - unspecified, behavior depends on the specified `evaluator`. - When `run_id` is unspecified, the default evaluator logs - results to the current active run, creating a new active run if - one does not exist. - :param evaluators: The name of the evaluator to use for model evaluations, or - a list of evaluator names. 
If unspecified, all evaluators - capable of evaluating the specified model on the specified - dataset are used. The default evaluator can be referred to - by the name 'default'. - :param evaluator_config: A dictionary of additional configurations to supply - to the evaluator. If multiple evaluators are - specified, each configuration should be supplied as - a nested dictionary whose key is the evaluator name. - :return: An `EvaluationResult` instance containing evaluation results. - """ - if evaluators is None: - evaluators = "default" - - if not isinstance(evaluators, list): - evaluators = [evaluators] - evaluator_config = {evaluators[0]: evaluator_config} - - if isinstance(model, str): - model = mlflow.pyfunc.load_model(model) - - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config[evaluator_name] - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - continue - - if evaluator.can_evaluate(model_type, config): - result = evaluator.evaluate(model, model_type, dataset, run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index b026898aecde9..b97c66424b122 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -1,6 +1,7 @@ import entrypoints import warnings from mlflow.exceptions import MlflowException +from mlflow.utils.import_hooks import register_post_import_hook class ModelEvaluatorRegistry: @@ -41,3 +42,14 @@ def get_evaluator(self, evaluator_name): ) ) return evaluator_cls() + + +_model_evaluation_registry = ModelEvaluatorRegistry() + + +def register_entrypoints(module): + module._model_evaluation_registry.register_entrypoints() + + +# Put it in post-importing hook to avoid circuit importing +register_post_import_hook(register_entrypoints, __name__, overwrite=True) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index b1fcdab974078..28040223136a1 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -14,14 +14,14 @@ class Array2DEvaluationArtifact(EvaluationArtifact): - @classmethod - def save_content_to_file(cls, content, output_artifact_path): - pd.DataFrame(content).to_csv(output_artifact_path, index=False) - @classmethod - def load_content_from_file(cls, local_artifact_path): + def _save_content_to_file(self, output_artifact_path): + pd.DataFrame(self._content).to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): pdf = pd.read_csv(local_artifact_path) - return pdf.to_numpy() + self._content = pdf.to_numpy() + return self._content class DummyEvaluator(ModelEvaluator): @@ -44,9 +44,7 @@ def compute_metrics_and_compute_and_log_artifacts( content=confusion_matrix, ) confusion_matrix_csv_buff = io.StringIO() - Array2DEvaluationArtifact.save_content_to_file( - confusion_matrix, confusion_matrix_csv_buff - ) + confusion_matrix_artifact._save_content_to_file(confusion_matrix_csv_buff) self.mlflow_client.log_text( run_id, 
confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name ) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index d79240f04d171..6231d0e722a9e 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -92,12 +92,13 @@ def test_classifier_evaluate(classifier_model, iris_dataset): assert eval_result.metrics == expected_metrics returned_confusion_matrix_artifact = eval_result.artifacts[artifact_name] assert np.array_equal(returned_confusion_matrix_artifact.content, expected_artifact) - assert np.array_equal( - Array2DEvaluationArtifact.load_content_from_file(saved_artifact_path), expected_artifact - ) assert returned_confusion_matrix_artifact.location == get_artifact_uri( run.info.run_id, artifact_name ) + assert np.array_equal( + returned_confusion_matrix_artifact._load_content_from_file(saved_artifact_path), + expected_artifact + ) def test_regressor_evaluate(regressor_model, iris_dataset): From c7f0360403c944c8fd882d99c75313e241a1d88d Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 23 Nov 2021 23:02:36 +0800 Subject: [PATCH 023/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index beb42fda6735e..2810899b3c8f3 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -181,14 +181,31 @@ def _extract_features_and_labels(self): else: raise ValueError(f'Unsupported data type: {type(self.data)}') + @staticmethod + def _array_like_obj_to_bytes(self, data): + if isinstance(data, pd.DataFrame): + return data.to_numpy().tobytes() + elif isinstance(data, np.ndarray): + return data.tobytes() + elif isinstance(data, list): + return np.array(data).tobytes() + @staticmethod def _gen_md5_for_arraylike_obj(md5_gen, data): md5_gen.update(pickle.dumps(len(data))) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(pickle.dumps(data)) + md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) else: - md5_gen.update(pickle.dumps(data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH])) - md5_gen.update(pickle.dumps(data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :])) + md5_gen.update( + EvaluationDataset._array_like_obj_to_bytes( + data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + ) + ) + md5_gen.update( + EvaluationDataset._array_like_obj_to_bytes( + data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] + ) + ) @property def name(self): From c2b673e050830a9f7dabbd6e984632e7e0fe53e6 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 23 Nov 2021 23:03:05 +0800 Subject: [PATCH 024/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 2810899b3c8f3..c55569263cff1 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -182,13 +182,15 @@ def _extract_features_and_labels(self): raise ValueError(f'Unsupported data type: {type(self.data)}') @staticmethod - def _array_like_obj_to_bytes(self, data): + def _array_like_obj_to_bytes(data): if isinstance(data, pd.DataFrame): return data.to_numpy().tobytes() elif isinstance(data, np.ndarray): return data.tobytes() elif isinstance(data, list): return np.array(data).tobytes() + else: + raise ValueError('Unsupported data type.') @staticmethod def 
_gen_md5_for_arraylike_obj(md5_gen, data): From b046af3dbb9324c3882ced8da04db9c56c4153fc Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 25 Nov 2021 22:50:11 +0800 Subject: [PATCH 025/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 214 +++++++++++------- .../models/evaluation/evaluator_registry.py | 2 +- .../mlflow_test_plugin/dummy_evaluator.py | 6 +- tests/test_evaluation.py | 2 +- 4 files changed, 133 insertions(+), 91 deletions(-) diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index c55569263cff1..e2c357ed7b09a 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -13,6 +13,11 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name, load_class from mlflow.pyfunc import PyFuncModel +import pyspark.sql +import logging + + +_logger = logging.getLogger(__name__) class EvaluationMetrics(dict): @@ -20,14 +25,23 @@ class EvaluationMetrics(dict): class EvaluationArtifact: - def __init__(self, location, content=None): + def __init__(self, uri, content=None): self._content = content - self._location = location + self._uri = uri def _load_content_from_file(self, local_artifact_path): raise NotImplementedError() - def _save_content_to_file(self, output_artifact_path): + def load(self, local_artifact_path=None): + if local_artifact_path is None: + return self._load_content_from_file(local_artifact_path) + else: + with TempDir() as temp_dir: + local_artifact_file = temp_dir.path("local_artifact") + _download_artifact_from_uri(self._uri, local_artifact_file) + self._load_content_from_file(local_artifact_file) + + def save(self, output_artifact_path): raise NotImplementedError() @property @@ -36,19 +50,15 @@ def content(self): The content of the artifact (representation varies) """ if self._content is None: - with TempDir() as temp_dir: - local_artifact_file = temp_dir.path("local_artifact") - _download_artifact_from_uri(self._location, local_artifact_file) - self._load_content_from_file(local_artifact_file) - + self.load() return self._content @property - def location(self) -> str: + def uri(self) -> str: """ - The location of the artifact + The URI of the artifact """ - return self._location + return self._uri class EvaluationResult: @@ -99,7 +109,7 @@ def save(self, path): os.mkdir(artifacts_dir) for artifact_name, artifact in self.artifacts.items(): - artifact._save_content_to_file( + artifact.save( os.path.join(artifacts_dir, artifact_name) ) @@ -119,6 +129,16 @@ def artifacts(self) -> Dict[str, EvaluationArtifact]: return self._artifacts +_cached_mlflow_client = None + + +def _get_mlflow_client(): + global _cached_mlflow_client + if _cached_mlflow_client is None: + _cached_mlflow_client = mlflow.tracking.MlflowClient() + return _cached_mlflow_client + + class EvaluationDataset: """ Represents an input dataset for model evaluation. This is intended for @@ -126,12 +146,13 @@ class EvaluationDataset: """ NUM_SAMPLE_ROWS_FOR_HASH = 5 + SPARK_DATAFRAME_LIMIT = 10000 def __init__(self, data, labels, name=None, path=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or the path to a serialized DataFrame, + - A Pandas DataFrame, or a spark DataFrame, containing evaluation features and labels. All columns will be regarded as feature columns except the "labels" column. 
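As a rough usage sketch only (not part of the patch): the hunks above extend EvaluationDataset so that `data` may be a pandas or Spark DataFrame with `labels` given as the name of the label column, while array-style inputs keep a separate label array. The feature values, column names, and dataset names below are illustrative assumptions, and the import path reflects the module as it stands in this patch.

    import numpy as np
    import pandas as pd
    from mlflow.models.evaluation import EvaluationDataset

    # Array-style input: features and labels are supplied separately.
    features = np.array([[5.1, 3.5], [4.9, 3.0], [6.2, 2.9]])
    labels = np.array([0, 0, 1])
    array_ds = EvaluationDataset(data=features, labels=labels, name="iris_sample")

    # DataFrame-style input: `labels` names the label column and every other
    # column is treated as a feature. A Spark DataFrame is handled the same way,
    # except that only the first SPARK_DATAFRAME_LIMIT (10000) rows are examined.
    frame = pd.DataFrame(
        {"sepal_len": [5.1, 4.9, 6.2], "sepal_wid": [3.5, 3.0, 2.9], "target": [0, 0, 1]}
    )
    frame_ds = EvaluationDataset(data=frame, labels="target", name="iris_frame")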
@@ -146,9 +167,9 @@ def __init__(self, data, labels, name=None, path=None): (e.g. a delta table, parquet file) """ if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include " but got name {name}') + raise ValueError(f'Dataset name cannot include a double quote (") but got name {name}') if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include " but got name {path}') + raise ValueError(f'Dataset path cannot include a double quote (") but got name {path}') if isinstance(data, (np.ndarray, list)): if not isinstance(labels, (np.ndarray, list)): @@ -156,7 +177,7 @@ def __init__(self, data, labels, name=None, path=None): 'If data is a numpy array or list of evaluation features, ' 'labels must be a numpy array or list of evaluation labels' ) - elif isinstance(data, pd.DataFrame): + elif isinstance(data, (pd.DataFrame, pyspark.sql.DataFrame)): if not isinstance(labels, str): raise ValueError( 'If data is a Pandas DataFrame, labels must be the string name of a column ' @@ -175,16 +196,23 @@ def __init__(self, data, labels, name=None, path=None): def _extract_features_and_labels(self): if isinstance(self.data, np.ndarray): return self.data, self.labels - elif isinstance(self.data, pd.DataFrame): - feature_cols = [x for x in self.data.columns if x != self.labels] - return self.data[feature_cols], self.data[self.labels] + elif isinstance(self.data, (pd.DataFrame, pyspark.sql.DataFrame)): + if isinstance(self.data, pyspark.sql.DataFrame): + data = self.data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + _logger.warning( + f'Only the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows in the ' + f'spark dataframe are examined.') + else: + data = self.data + feature_cols = [x for x in data.columns if x != self.labels] + return data[feature_cols], data[self.labels] else: raise ValueError(f'Unsupported data type: {type(self.data)}') @staticmethod def _array_like_obj_to_bytes(data): if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + return data.to_numpy().tobytes() + data.columns.to_numpy().tobytes() elif isinstance(data, np.ndarray): return data.tobytes() elif isinstance(data, list): @@ -194,7 +222,7 @@ def _array_like_obj_to_bytes(data): @staticmethod def _gen_md5_for_arraylike_obj(md5_gen, data): - md5_gen.update(pickle.dumps(len(data))) + md5_gen.update(np.int64(len(data)).tobytes()) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) else: @@ -240,6 +268,33 @@ def _metadata(self): metadata["path"] = self.path return metadata + def _log_dataset_tag(self, run_id): + client = _get_mlflow_client() + timestamp = int(time.time() * 1000) + existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") + if existing_dataset_metadata_str is not None: + dataset_metadata_list = json.loads(existing_dataset_metadata_str) + else: + dataset_metadata_list = [] + + metadata_exists = False + for metadata in dataset_metadata_list: + if ( + metadata["hash"] == self.hash + and metadata["name"] == self._user_specified_name + ): + metadata_exists = True + break + + if not metadata_exists: + dataset_metadata_list.append(self._metadata) + + dataset_metadata_str = json.dumps(dataset_metadata_list) + client.log_batch( + run_id, + tags=[RunTag("mlflow.datasets", dataset_metadata_str)], + ) + class ModelEvaluator: def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: @@ -256,17 +311,6 @@ def can_evaluate(self, 
model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() - def compute_metrics_and_compute_and_log_artifacts( - self, model, model_type, dataset, evaluator_config, run_id - ): - """ - return an tuple of: - - an instance of EvaluationMetrics - - a dict of artifact_name -> instance_of_EvaluationArtifact - and log artifacts into run specified by run_id - """ - raise NotImplementedError() - def evaluate( self, model: PyFuncModel, @@ -290,50 +334,43 @@ def evaluate( in the future. :return: An `EvaluationResult` instance containing evaluation results. """ - client = mlflow.tracking.MlflowClient() - self.mlflow_client = client - - def do_evaluate(_run_id): - timestamp = int(time.time() * 1000) - existing_dataset_metadata_str = client.get_run(_run_id).data.tags.get("mlflow.datasets") - if existing_dataset_metadata_str is not None: - dataset_metadata_list = json.loads(existing_dataset_metadata_str) - else: - dataset_metadata_list = [] + raise NotImplementedError() - metadata_exists = False - for metadata in dataset_metadata_list: - if ( - metadata["hash"] == dataset.hash - and metadata["name"] == dataset._user_specified_name - ): - metadata_exists = True - break - if not metadata_exists: - dataset_metadata_list.append(dataset._metadata) +def list_evaluators(): + """ + Return a name list for all available Evaluators. + """ + # import _model_evaluation_registry inside function to avoid circuit importing + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + return list(_model_evaluation_registry._registry.keys()) - dataset_metadata_str = json.dumps(dataset_metadata_list) - metrics_dict, artifacts_dict = self.compute_metrics_and_compute_and_log_artifacts( - model, model_type, dataset, evaluator_config, _run_id - ) - client.log_batch( - _run_id, - metrics=[ - Metric(key=f"{key}_on_{dataset.name}", value=value, timestamp=timestamp, step=0) - for key, value in metrics_dict.items() - ], - tags=[RunTag("mlflow.datasets", dataset_metadata_str)], - ) +class StartRunOrReuseActiveRun: - return EvaluationResult(metrics_dict, artifacts_dict) + def __init__(self, run_id): + self.user_specified_run_id = run_id + self.managed_run = None + def __enter__(self): if mlflow.active_run() is not None: - return do_evaluate(mlflow.active_run().info.run_id) + active_run_id = mlflow.active_run().info.run_id + if self.user_specified_run_id is not None and \ + self.user_specified_run_id != active_run_id: + raise ValueError("An active run exists, you cannot specify another run_id when " + "evaluating.") + return active_run_id else: - with mlflow.start_run(run_id=run_id) as run: - return do_evaluate(run.info.run_id) + if self.user_specified_run_id is None: + raise ValueError("Active run does not exist, you need specify a run_id when " + "evaluating.") + self.managed_run = \ + mlflow.start_run(run_id=self.user_specified_run_id).__enter__() + return self.user_specified_run_id + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.managed_run is not None: + return self.managed_run.__exit__(exc_type, exc_val, exc_tb) def evaluate( @@ -379,21 +416,26 @@ def evaluate( if isinstance(model, str): model = mlflow.pyfunc.load_model(model) - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config[evaluator_name] - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - continue - - if evaluator.can_evaluate(model_type, config): - result = evaluator.evaluate(model, model_type, dataset, run_id, config) - 
eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result + with StartRunOrReuseActiveRun(run_id) as actual_run_id: + dataset._log_dataset_tag(actual_run_id) + + eval_results = [] + for evaluator_name in evaluators: + config = evaluator_config[evaluator_name] + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") + continue + + if evaluator.can_evaluate(model_type, config): + _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") + result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) + eval_results.append(result) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) + + return merged_eval_result diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index b97c66424b122..1c2366bf9849f 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -17,7 +17,7 @@ def register(self, scheme, evaluator): self._registry[scheme] = evaluator def register_entrypoints(self): - # Register artifact repositories provided by other packages + # Register ModelEvaluator implementation provided by other packages for entrypoint in entrypoints.get_group_all("mlflow.model_evaluator"): try: self.register(entrypoint.name, entrypoint.load()) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 28040223136a1..a832710fc4a7b 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -15,10 +15,10 @@ class Array2DEvaluationArtifact(EvaluationArtifact): - def _save_content_to_file(self, output_artifact_path): + def save(self, output_artifact_path): pd.DataFrame(self._content).to_csv(output_artifact_path, index=False) - def _load_content_from_file(self, local_artifact_path): + def load(self, local_artifact_path): pdf = pd.read_csv(local_artifact_path) self._content = pdf.to_numpy() return self._content @@ -44,7 +44,7 @@ def compute_metrics_and_compute_and_log_artifacts( content=confusion_matrix, ) confusion_matrix_csv_buff = io.StringIO() - confusion_matrix_artifact._save_content_to_file(confusion_matrix_csv_buff) + confusion_matrix_artifact.save(confusion_matrix_csv_buff) self.mlflow_client.log_text( run_id, confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name ) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 6231d0e722a9e..1e9a5897d8ae8 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -96,7 +96,7 @@ def test_classifier_evaluate(classifier_model, iris_dataset): run.info.run_id, artifact_name ) assert np.array_equal( - returned_confusion_matrix_artifact._load_content_from_file(saved_artifact_path), + returned_confusion_matrix_artifact.load(saved_artifact_path), expected_artifact ) From 14c8e82a9a6ae8c49fbab3e0eedb4cad89ebe827 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 
2021 11:46:50 +0800 Subject: [PATCH 026/120] fix Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 34 +++++++++++-------- mlflow/utils/__init__.py | 9 ----- tests/{ => models}/test_evaluation.py | 2 +- .../mlflow_test_plugin/dummy_evaluator.py | 33 +++++++++++------- 4 files changed, 41 insertions(+), 37 deletions(-) rename tests/{ => models}/test_evaluation.py (98%) diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index e2c357ed7b09a..bff6e2fd39bcc 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -11,7 +11,8 @@ from mlflow.utils.file_utils import TempDir from mlflow.entities import Metric, RunTag from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils import _get_fully_qualified_class_name, load_class +from mlflow.utils import _get_fully_qualified_class_name +from mlflow.utils.class_utils import _get_class_from_string from mlflow.pyfunc import PyFuncModel import pyspark.sql import logging @@ -26,8 +27,8 @@ class EvaluationMetrics(dict): class EvaluationArtifact: def __init__(self, uri, content=None): - self._content = content self._uri = uri + self._content = content def _load_content_from_file(self, local_artifact_path): raise NotImplementedError() @@ -81,7 +82,7 @@ def load(cls, path): for artifact_name, meta in artifacts_metadata: location = meta["location"] - ArtifactCls = load_class(meta["class_name"]) + ArtifactCls = _get_class_from_string(meta["class_name"]) content = ArtifactCls.load_content_from_file( os.path.join(artifacts_dir, artifact_name) ) @@ -132,13 +133,6 @@ def artifacts(self) -> Dict[str, EvaluationArtifact]: _cached_mlflow_client = None -def _get_mlflow_client(): - global _cached_mlflow_client - if _cached_mlflow_client is None: - _cached_mlflow_client = mlflow.tracking.MlflowClient() - return _cached_mlflow_client - - class EvaluationDataset: """ Represents an input dataset for model evaluation. 
This is intended for @@ -268,9 +262,7 @@ def _metadata(self): metadata["path"] = self.path return metadata - def _log_dataset_tag(self, run_id): - client = _get_mlflow_client() - timestamp = int(time.time() * 1000) + def _log_dataset_tag(self, client, run_id): existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") if existing_dataset_metadata_str is not None: dataset_metadata_list = json.loads(existing_dataset_metadata_str) @@ -311,12 +303,23 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() + def _log_metrics(self, run_id, metrics, dataset_name): + client = mlflow.tracking.MlflowClient() + timestamp = int(time.time() * 1000) + client.log_batch( + run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics.items() + ], + ) + def evaluate( self, model: PyFuncModel, model_type, dataset, - run_id=None, + run_id, evaluator_config=None, **kwargs ) -> EvaluationResult: @@ -417,7 +420,8 @@ def evaluate( model = mlflow.pyfunc.load_model(model) with StartRunOrReuseActiveRun(run_id) as actual_run_id: - dataset._log_dataset_tag(actual_run_id) + client = mlflow.tracking.MlflowClient() + dataset._log_dataset_tag(client, actual_run_id) eval_results = [] for evaluator_name in evaluators: diff --git a/mlflow/utils/__init__.py b/mlflow/utils/__init__.py index 2bb183e281fd4..270637954e784 100644 --- a/mlflow/utils/__init__.py +++ b/mlflow/utils/__init__.py @@ -1,7 +1,6 @@ import logging from itertools import islice from sys import version_info -import importlib _logger = logging.getLogger(__name__) @@ -173,11 +172,3 @@ def _inspect_original_var_name(var, fallback_name): except Exception: return fallback_name - - -def load_class(kls): - parts = kls.split(".") - module = ".".join(parts[:-1]) - m = importlib.import_module(module) - name = parts[-1] - return getattr(m, name) diff --git a/tests/test_evaluation.py b/tests/models/test_evaluation.py similarity index 98% rename from tests/test_evaluation.py rename to tests/models/test_evaluation.py index 1e9a5897d8ae8..64fe528092f1d 100644 --- a/tests/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -92,7 +92,7 @@ def test_classifier_evaluate(classifier_model, iris_dataset): assert eval_result.metrics == expected_metrics returned_confusion_matrix_artifact = eval_result.artifacts[artifact_name] assert np.array_equal(returned_confusion_matrix_artifact.content, expected_artifact) - assert returned_confusion_matrix_artifact.location == get_artifact_uri( + assert returned_confusion_matrix_artifact.uri == get_artifact_uri( run.info.run_id, artifact_name ) assert np.array_equal( diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index a832710fc4a7b..15e53eda523e7 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -28,36 +28,45 @@ class DummyEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return evaluator_config.get("can_evaluate") and model_type in ["classifier", "regressor"] - def compute_metrics_and_compute_and_log_artifacts( - self, model, model_type, dataset, evaluator_config, run_id - ): + def evaluate( + self, + model, + model_type, + dataset, + run_id, + evaluator_config=None, + **kwargs + ) -> EvaluationResult: + client = 
mlflow.tracking.MlflowClient() X, y = dataset._extract_features_and_labels() y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) - metrics = EvaluationMetrics(accuracy_score=accuracy_score,) + metrics = EvaluationMetrics(accuracy_score=accuracy_score) + self._log_metrics(run_id, metrics, dataset.name) confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) confusion_matrix_artifact_name = f"confusion_matrix_on_{dataset.name}.csv" confusion_matrix_artifact = Array2DEvaluationArtifact( - location=get_artifact_uri(run_id, confusion_matrix_artifact_name), + uri=get_artifact_uri(run_id, confusion_matrix_artifact_name), content=confusion_matrix, ) confusion_matrix_csv_buff = io.StringIO() confusion_matrix_artifact.save(confusion_matrix_csv_buff) - self.mlflow_client.log_text( + client.log_text( run_id, confusion_matrix_csv_buff.getvalue(), confusion_matrix_artifact_name ) artifacts = {confusion_matrix_artifact_name: confusion_matrix_artifact} - return metrics, artifacts elif model_type == "regressor": mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) - return ( - EvaluationMetrics( - mean_absolute_error=mean_absolute_error, mean_squared_error=mean_squared_error - ), - {}, + metrics = EvaluationMetrics( + mean_absolute_error=mean_absolute_error, + mean_squared_error=mean_squared_error ) + self._log_metrics(run_id, metrics, dataset.name) + artifacts = {} else: raise ValueError(f"Unsupported model type {model_type}") + + return EvaluationResult(metrics=metrics, artifacts=artifacts) From 25101f965984ba5b1cdc052eb62c3c1308bfa69c Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 2021 12:01:46 +0800 Subject: [PATCH 027/120] refactor Signed-off-by: Weichen Xu --- mlflow/__init__.py | 4 +- mlflow/models/__init__.py | 2 + mlflow/models/evaluation/__init__.py | 454 +----------------- mlflow/models/evaluation/base.py | 438 +++++++++++++++++ tests/models/test_evaluation.py | 11 +- .../mlflow_test_plugin/dummy_evaluator.py | 12 +- 6 files changed, 461 insertions(+), 460 deletions(-) create mode 100644 mlflow/models/evaluation/base.py diff --git a/mlflow/__init__.py b/mlflow/__init__.py index 41291b8f6e833..c66ba9eff8201 100644 --- a/mlflow/__init__.py +++ b/mlflow/__init__.py @@ -31,6 +31,7 @@ from mlflow.utils.logging_utils import _configure_mlflow_loggers import mlflow.tracking._model_registry.fluent import mlflow.tracking.fluent +import mlflow.models # Filter annoying Cython warnings that serve no good purpose, and so before # importing other modules. 
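As a rough end-to-end sketch only (not part of the patch): with `import mlflow.models` added above and the `evaluate = mlflow.models.evaluate` alias added in the hunk below, callers are expected to invoke evaluation as `mlflow.evaluate(...)` under an active run (or with an explicit run_id), since StartRunOrReuseActiveRun raises otherwise. The model URI, data values, dataset name, and the availability of a registered 'default' evaluator are illustrative assumptions.

    import numpy as np
    import mlflow
    from mlflow.models.evaluation import EvaluationDataset

    eval_ds = EvaluationDataset(
        data=np.array([[5.1, 3.5], [6.2, 2.9]]),
        labels=np.array([0, 1]),
        name="eval_set",
    )

    # Either run inside an active run, as here, or pass run_id= explicitly.
    with mlflow.start_run():
        result = mlflow.evaluate(
            model="models:/my_classifier/1",  # or an already-loaded PyFuncModel
            model_type="classifier",
            dataset=eval_ds,
            evaluators="default",
        )
        print(result.metrics)    # merged metrics from every evaluator that ran
        print(result.artifacts)  # artifact name -> EvaluationArtifact mapping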
@@ -151,7 +152,7 @@ delete_run = mlflow.tracking.fluent.delete_run register_model = mlflow.tracking._model_registry.fluent.register_model autolog = mlflow.tracking.fluent.autolog - +evaluate = mlflow.models.evaluate run = projects.run @@ -191,4 +192,5 @@ "set_registry_uri", "list_run_infos", "autolog", + "evaluate", ] + _model_flavors_supported diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index a0f6f5136e0e6..9e36cb45fb3e0 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -26,6 +26,7 @@ from .signature import ModelSignature, infer_signature from .utils import ModelInputExample from ..utils.environment import infer_pip_requirements +from .evaluation import evaluate __all__ = [ "Model", @@ -34,4 +35,5 @@ "infer_signature", "FlavorBackend", "infer_pip_requirements", + "evaluate", ] diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index bff6e2fd39bcc..2ccda9894beca 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,445 +1,9 @@ -from typing import Dict, Union -import mlflow -import hashlib -import time -import numpy as np -import pandas as pd -import pickle -import json -import os -from mlflow.exceptions import MlflowException -from mlflow.utils.file_utils import TempDir -from mlflow.entities import Metric, RunTag -from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils import _get_fully_qualified_class_name -from mlflow.utils.class_utils import _get_class_from_string -from mlflow.pyfunc import PyFuncModel -import pyspark.sql -import logging - - -_logger = logging.getLogger(__name__) - - -class EvaluationMetrics(dict): - pass - - -class EvaluationArtifact: - def __init__(self, uri, content=None): - self._uri = uri - self._content = content - - def _load_content_from_file(self, local_artifact_path): - raise NotImplementedError() - - def load(self, local_artifact_path=None): - if local_artifact_path is None: - return self._load_content_from_file(local_artifact_path) - else: - with TempDir() as temp_dir: - local_artifact_file = temp_dir.path("local_artifact") - _download_artifact_from_uri(self._uri, local_artifact_file) - self._load_content_from_file(local_artifact_file) - - def save(self, output_artifact_path): - raise NotImplementedError() - - @property - def content(self): - """ - The content of the artifact (representation varies) - """ - if self._content is None: - self.load() - return self._content - - @property - def uri(self) -> str: - """ - The URI of the artifact - """ - return self._uri - - -class EvaluationResult: - def __init__(self, metrics, artifacts): - self._metrics = metrics - self._artifacts = artifacts - - @classmethod - def load(cls, path): - """Load the evaluation results from the specified local filesystem path""" - with open(os.path.join(path, "metrics.json"), "r") as fp: - metrics = EvaluationMetrics(json.load(fp)) - - with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: - artifacts_metadata = json.load(fp) - - artifacts = {} - - artifacts_dir = os.path.join(path, 'artifacts') - - for artifact_name, meta in artifacts_metadata: - location = meta["location"] - ArtifactCls = _get_class_from_string(meta["class_name"]) - content = ArtifactCls.load_content_from_file( - os.path.join(artifacts_dir, artifact_name) - ) - artifacts[artifact_name] = ArtifactCls(location=location, content=content) - - return EvaluationResult(metrics=metrics, artifacts=artifacts) - - def save(self, path): - """Write the 
evaluation results to the specified local filesystem path""" - os.makedirs(path, exist_ok=True) - with open(os.path.join(path, "metrics.json"), "w") as fp: - json.dump(self.metrics, fp) - - artifacts_metadata = { - artifact_name: { - "location": artifact.location, - "class_name": _get_fully_qualified_class_name(artifact), - } - for artifact_name, artifact in self.artifacts.items() - } - with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: - json.dump(artifacts_metadata, fp) - - artifacts_dir = os.path.join(path, 'artifacts') - os.mkdir(artifacts_dir) - - for artifact_name, artifact in self.artifacts.items(): - artifact.save( - os.path.join(artifacts_dir, artifact_name) - ) - - @property - def metrics(self) -> EvaluationMetrics: - """ - A dictionary mapping scalar metric names to scalar metric values - """ - return self._metrics - - @property - def artifacts(self) -> Dict[str, EvaluationArtifact]: - """ - A dictionary mapping standardized artifact names (e.g. "roc_data") to - artifact content and location information - """ - return self._artifacts - - -_cached_mlflow_client = None - - -class EvaluationDataset: - """ - Represents an input dataset for model evaluation. This is intended for - use with the `mlflow.evaluate()`API. - """ - - NUM_SAMPLE_ROWS_FOR_HASH = 5 - SPARK_DATAFRAME_LIMIT = 10000 - - def __init__(self, data, labels, name=None, path=None): - """ - :param data: One of the following: - - A numpy array or list of evaluation features, excluding labels. - - A Pandas DataFrame, or a spark DataFrame, - containing evaluation features and labels. All columns will be regarded as feature - columns except the "labels" column. - - :param labels: One of the following: - - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. - - The string name of a column from `data` that contains evaluation labels, if `data` - is a DataFrame. - - :param name: (Optional) The name of the dataset (must not contain "). - - :param path: (Optional) the path to a serialized DataFrame (must not contain "). - (e.g. 
a delta table, parquet file) - """ - if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include a double quote (") but got name {name}') - if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include a double quote (") but got name {path}') - - if isinstance(data, (np.ndarray, list)): - if not isinstance(labels, (np.ndarray, list)): - raise ValueError( - 'If data is a numpy array or list of evaluation features, ' - 'labels must be a numpy array or list of evaluation labels' - ) - elif isinstance(data, (pd.DataFrame, pyspark.sql.DataFrame)): - if not isinstance(labels, str): - raise ValueError( - 'If data is a Pandas DataFrame, labels must be the string name of a column ' - 'from `data` that contains evaluation labels' - ) - else: - raise ValueError('The data argument must be a numpy array, a list or a ' - 'Pandas DataFrame.') - - self._user_specified_name = name - self.data = data - self.labels = labels - self.path = path - self._hash = None - - def _extract_features_and_labels(self): - if isinstance(self.data, np.ndarray): - return self.data, self.labels - elif isinstance(self.data, (pd.DataFrame, pyspark.sql.DataFrame)): - if isinstance(self.data, pyspark.sql.DataFrame): - data = self.data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() - _logger.warning( - f'Only the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows in the ' - f'spark dataframe are examined.') - else: - data = self.data - feature_cols = [x for x in data.columns if x != self.labels] - return data[feature_cols], data[self.labels] - else: - raise ValueError(f'Unsupported data type: {type(self.data)}') - - @staticmethod - def _array_like_obj_to_bytes(data): - if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + data.columns.to_numpy().tobytes() - elif isinstance(data, np.ndarray): - return data.tobytes() - elif isinstance(data, list): - return np.array(data).tobytes() - else: - raise ValueError('Unsupported data type.') - - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): - md5_gen.update(np.int64(len(data)).tobytes()) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) - else: - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - ) - ) - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] - ) - ) - - @property - def name(self): - return self._user_specified_name if self._user_specified_name is not None else self.hash - - @property - def hash(self): - """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function - """ - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode("UTF-8")) - self._hash = md5_gen.hexdigest() - return self._hash - - @property - def _metadata(self): - metadata = { - "name": self.name, - "hash": self.hash, - } - if self.path is not None: - metadata["path"] = self.path - return metadata - - def _log_dataset_tag(self, client, run_id): - existing_dataset_metadata_str = 
client.get_run(run_id).data.tags.get("mlflow.datasets") - if existing_dataset_metadata_str is not None: - dataset_metadata_list = json.loads(existing_dataset_metadata_str) - else: - dataset_metadata_list = [] - - metadata_exists = False - for metadata in dataset_metadata_list: - if ( - metadata["hash"] == self.hash - and metadata["name"] == self._user_specified_name - ): - metadata_exists = True - break - - if not metadata_exists: - dataset_metadata_list.append(self._metadata) - - dataset_metadata_str = json.dumps(dataset_metadata_list) - client.log_batch( - run_id, - tags=[RunTag("mlflow.datasets", dataset_metadata_str)], - ) - - -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: - """ - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: True if the evaluator can evaluate the specified model on the - specified dataset. False otherwise. - """ - raise NotImplementedError() - - def _log_metrics(self, run_id, metrics, dataset_name): - client = mlflow.tracking.MlflowClient() - timestamp = int(time.time() * 1000) - client.log_batch( - run_id, - metrics=[ - Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) - for key, value in metrics.items() - ], - ) - - def evaluate( - self, - model: PyFuncModel, - model_type, - dataset, - run_id, - evaluator_config=None, - **kwargs - ) -> EvaluationResult: - """ - :param model: A pyfunc model instance. - :param model_type: A string describing the model type (e.g., "regressor", - "classifier", …). - :param dataset: An instance of `EvaluationDataset` containing features - and labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. - :param evaluator_config: A dictionary of additional configurations for - the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional - arguments that may be added to the evaluation interface - in the future. - :return: An `EvaluationResult` instance containing evaluation results. - """ - raise NotImplementedError() - - -def list_evaluators(): - """ - Return a name list for all available Evaluators. 
- """ - # import _model_evaluation_registry inside function to avoid circuit importing - from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry - return list(_model_evaluation_registry._registry.keys()) - - -class StartRunOrReuseActiveRun: - - def __init__(self, run_id): - self.user_specified_run_id = run_id - self.managed_run = None - - def __enter__(self): - if mlflow.active_run() is not None: - active_run_id = mlflow.active_run().info.run_id - if self.user_specified_run_id is not None and \ - self.user_specified_run_id != active_run_id: - raise ValueError("An active run exists, you cannot specify another run_id when " - "evaluating.") - return active_run_id - else: - if self.user_specified_run_id is None: - raise ValueError("Active run does not exist, you need specify a run_id when " - "evaluating.") - self.managed_run = \ - mlflow.start_run(run_id=self.user_specified_run_id).__enter__() - return self.user_specified_run_id - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.managed_run is not None: - return self.managed_run.__exit__(exc_type, exc_val, exc_tb) - - -def evaluate( - model: Union[str, PyFuncModel], - model_type, dataset, - run_id=None, - evaluators=None, - evaluator_config=None -) -> EvaluationResult: - """ - :param model: A pyfunc model instance, or a URI referring to such a model. - - :param model_type: A string describing the model type. The default evaluator - supports "regressor" and "classifier" as model types. - :param dataset: An instance of `EvaluationDataset` containing features - labels (optional) for model evaluation. - :param run_id: The ID of the MLflow Run to which to log results. If - unspecified, behavior depends on the specified `evaluator`. - When `run_id` is unspecified, the default evaluator logs - results to the current active run, creating a new active run if - one does not exist. - :param evaluators: The name of the evaluator to use for model evaluations, or - a list of evaluator names. If unspecified, all evaluators - capable of evaluating the specified model on the specified - dataset are used. The default evaluator can be referred to - by the name 'default'. - :param evaluator_config: A dictionary of additional configurations to supply - to the evaluator. If multiple evaluators are - specified, each configuration should be supplied as - a nested dictionary whose key is the evaluator name. - :return: An `EvaluationResult` instance containing evaluation results. 
- """ - # import _model_evaluation_registry inside function to avoid circuit importing - from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry - - if evaluators is None: - evaluators = "default" - - if not isinstance(evaluators, list): - evaluators = [evaluators] - evaluator_config = {evaluators[0]: evaluator_config} - - if isinstance(model, str): - model = mlflow.pyfunc.load_model(model) - - with StartRunOrReuseActiveRun(run_id) as actual_run_id: - client = mlflow.tracking.MlflowClient() - dataset._log_dataset_tag(client, actual_run_id) - - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config[evaluator_name] - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") - continue - - if evaluator.can_evaluate(model_type, config): - _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") - result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result +from .base import ( + ModelEvaluator, + EvaluationDataset, + EvaluationResult, + EvaluationMetrics, + EvaluationArtifact, + evaluate, + list_evaluators, +) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py new file mode 100644 index 0000000000000..8d4074410589a --- /dev/null +++ b/mlflow/models/evaluation/base.py @@ -0,0 +1,438 @@ +from typing import Dict, Union +import mlflow +import hashlib +import time +import numpy as np +import pandas as pd +import pickle +import json +import os +from mlflow.exceptions import MlflowException +from mlflow.utils.file_utils import TempDir +from mlflow.entities import Metric, RunTag +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name +from mlflow.utils.class_utils import _get_class_from_string +from mlflow.pyfunc import PyFuncModel +import pyspark.sql +import logging + + +_logger = logging.getLogger(__name__) + + +class EvaluationMetrics(dict): + pass + + +class EvaluationArtifact: + def __init__(self, uri, content=None): + self._uri = uri + self._content = content + + def _load_content_from_file(self, local_artifact_path): + raise NotImplementedError() + + def load(self, local_artifact_path=None): + if local_artifact_path is None: + return self._load_content_from_file(local_artifact_path) + else: + with TempDir() as temp_dir: + local_artifact_file = temp_dir.path("local_artifact") + _download_artifact_from_uri(self._uri, local_artifact_file) + self._load_content_from_file(local_artifact_file) + + def save(self, output_artifact_path): + raise NotImplementedError() + + @property + def content(self): + """ + The content of the artifact (representation varies) + """ + if self._content is None: + self.load() + return self._content + + @property + def uri(self) -> str: + """ + The URI of the artifact + """ + return self._uri + + +class EvaluationResult: + def __init__(self, metrics, artifacts): + self._metrics = metrics + self._artifacts = artifacts + + @classmethod + def load(cls, path): + """Load the evaluation results from the specified local filesystem path""" + with open(os.path.join(path, "metrics.json"), "r") as fp: + 
metrics = EvaluationMetrics(json.load(fp)) + + with open(os.path.join(path, "artifacts_metadata.json"), "r") as fp: + artifacts_metadata = json.load(fp) + + artifacts = {} + + artifacts_dir = os.path.join(path, "artifacts") + + for artifact_name, meta in artifacts_metadata: + location = meta["location"] + ArtifactCls = _get_class_from_string(meta["class_name"]) + content = ArtifactCls.load_content_from_file(os.path.join(artifacts_dir, artifact_name)) + artifacts[artifact_name] = ArtifactCls(location=location, content=content) + + return EvaluationResult(metrics=metrics, artifacts=artifacts) + + def save(self, path): + """Write the evaluation results to the specified local filesystem path""" + os.makedirs(path, exist_ok=True) + with open(os.path.join(path, "metrics.json"), "w") as fp: + json.dump(self.metrics, fp) + + artifacts_metadata = { + artifact_name: { + "location": artifact.location, + "class_name": _get_fully_qualified_class_name(artifact), + } + for artifact_name, artifact in self.artifacts.items() + } + with open(os.path.join(path, "artifacts_metadata.json"), "w") as fp: + json.dump(artifacts_metadata, fp) + + artifacts_dir = os.path.join(path, "artifacts") + os.mkdir(artifacts_dir) + + for artifact_name, artifact in self.artifacts.items(): + artifact.save(os.path.join(artifacts_dir, artifact_name)) + + @property + def metrics(self) -> EvaluationMetrics: + """ + A dictionary mapping scalar metric names to scalar metric values + """ + return self._metrics + + @property + def artifacts(self) -> Dict[str, EvaluationArtifact]: + """ + A dictionary mapping standardized artifact names (e.g. "roc_data") to + artifact content and location information + """ + return self._artifacts + + +_cached_mlflow_client = None + + +class EvaluationDataset: + """ + Represents an input dataset for model evaluation. This is intended for + use with the `mlflow.evaluate()`API. + """ + + NUM_SAMPLE_ROWS_FOR_HASH = 5 + SPARK_DATAFRAME_LIMIT = 10000 + + def __init__(self, data, labels, name=None, path=None): + """ + :param data: One of the following: + - A numpy array or list of evaluation features, excluding labels. + - A Pandas DataFrame, or a spark DataFrame, + containing evaluation features and labels. All columns will be regarded as feature + columns except the "labels" column. + + :param labels: One of the following: + - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. + - The string name of a column from `data` that contains evaluation labels, if `data` + is a DataFrame. + + :param name: (Optional) The name of the dataset (must not contain "). + + :param path: (Optional) the path to a serialized DataFrame (must not contain "). + (e.g. 
a delta table, parquet file) + """ + if name is not None and '"' in name: + raise ValueError(f'Dataset name cannot include a double quote (") but got name {name}') + if path is not None and '"' in path: + raise ValueError(f'Dataset path cannot include a double quote (") but got name {path}') + + if isinstance(data, (np.ndarray, list)): + if not isinstance(labels, (np.ndarray, list)): + raise ValueError( + "If data is a numpy array or list of evaluation features, " + "labels must be a numpy array or list of evaluation labels" + ) + elif isinstance(data, (pd.DataFrame, pyspark.sql.DataFrame)): + if not isinstance(labels, str): + raise ValueError( + "If data is a Pandas DataFrame, labels must be the string name of a column " + "from `data` that contains evaluation labels" + ) + else: + raise ValueError( + "The data argument must be a numpy array, a list or a " "Pandas DataFrame." + ) + + self._user_specified_name = name + self.data = data + self.labels = labels + self.path = path + self._hash = None + + def _extract_features_and_labels(self): + if isinstance(self.data, np.ndarray): + return self.data, self.labels + elif isinstance(self.data, (pd.DataFrame, pyspark.sql.DataFrame)): + if isinstance(self.data, pyspark.sql.DataFrame): + data = self.data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + _logger.warning( + f"Only the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows in the " + f"spark dataframe are examined." + ) + else: + data = self.data + feature_cols = [x for x in data.columns if x != self.labels] + return data[feature_cols], data[self.labels] + else: + raise ValueError(f"Unsupported data type: {type(self.data)}") + + @staticmethod + def _array_like_obj_to_bytes(data): + if isinstance(data, pd.DataFrame): + return data.to_numpy().tobytes() + data.columns.to_numpy().tobytes() + elif isinstance(data, np.ndarray): + return data.tobytes() + elif isinstance(data, list): + return np.array(data).tobytes() + else: + raise ValueError("Unsupported data type.") + + @staticmethod + def _gen_md5_for_arraylike_obj(md5_gen, data): + md5_gen.update(np.int64(len(data)).tobytes()) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) + else: + md5_gen.update( + EvaluationDataset._array_like_obj_to_bytes( + data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + ) + ) + md5_gen.update( + EvaluationDataset._array_like_obj_to_bytes( + data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + ) + ) + + @property + def name(self): + return self._user_specified_name if self._user_specified_name is not None else self.hash + + @property + def hash(self): + """ + Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, + dataset size and feeding them through a cheap, low-collision hash function + """ + if self._hash is None: + md5_gen = hashlib.md5() + if isinstance(self.data, np.ndarray): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) + elif isinstance(self.data, pd.DataFrame): + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + md5_gen.update(self.labels.encode("UTF-8")) + self._hash = md5_gen.hexdigest() + return self._hash + + @property + def _metadata(self): + metadata = { + "name": self.name, + "hash": self.hash, + } + if self.path is not None: + metadata["path"] = self.path + return metadata + + def _log_dataset_tag(self, client, run_id): + existing_dataset_metadata_str = 
client.get_run(run_id).data.tags.get("mlflow.datasets") + if existing_dataset_metadata_str is not None: + dataset_metadata_list = json.loads(existing_dataset_metadata_str) + else: + dataset_metadata_list = [] + + metadata_exists = False + for metadata in dataset_metadata_list: + if metadata["hash"] == self.hash and metadata["name"] == self._user_specified_name: + metadata_exists = True + break + + if not metadata_exists: + dataset_metadata_list.append(self._metadata) + + dataset_metadata_str = json.dumps(dataset_metadata_list) + client.log_batch( + run_id, + tags=[RunTag("mlflow.datasets", dataset_metadata_str)], + ) + + +class ModelEvaluator: + def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: + """ + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: True if the evaluator can evaluate the specified model on the + specified dataset. False otherwise. + """ + raise NotImplementedError() + + def _log_metrics(self, run_id, metrics, dataset_name): + client = mlflow.tracking.MlflowClient() + timestamp = int(time.time() * 1000) + client.log_batch( + run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics.items() + ], + ) + + def evaluate( + self, model: PyFuncModel, model_type, dataset, run_id, evaluator_config=None, **kwargs + ) -> EvaluationResult: + """ + :param model: A pyfunc model instance. + :param model_type: A string describing the model type (e.g., "regressor", + "classifier", …). + :param dataset: An instance of `EvaluationDataset` containing features + and labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. + :param evaluator_config: A dictionary of additional configurations for + the evaluator. + :param **kwargs: For forwards compatibility, a placeholder for additional + arguments that may be added to the evaluation interface + in the future. + :return: An `EvaluationResult` instance containing evaluation results. + """ + raise NotImplementedError() + + +def list_evaluators(): + """ + Return a name list for all available Evaluators. + """ + # import _model_evaluation_registry inside function to avoid circuit importing + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + return list(_model_evaluation_registry._registry.keys()) + + +class StartRunOrReuseActiveRun: + def __init__(self, run_id): + self.user_specified_run_id = run_id + self.managed_run = None + + def __enter__(self): + if mlflow.active_run() is not None: + active_run_id = mlflow.active_run().info.run_id + if ( + self.user_specified_run_id is not None + and self.user_specified_run_id != active_run_id + ): + raise ValueError( + "An active run exists, you cannot specify another run_id when " "evaluating." + ) + return active_run_id + else: + if self.user_specified_run_id is None: + raise ValueError( + "Active run does not exist, you need specify a run_id when " "evaluating." 
+ ) + self.managed_run = mlflow.start_run(run_id=self.user_specified_run_id).__enter__() + return self.user_specified_run_id + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.managed_run is not None: + return self.managed_run.__exit__(exc_type, exc_val, exc_tb) + + +def evaluate( + model: Union[str, PyFuncModel], + model_type, + dataset, + run_id=None, + evaluators=None, + evaluator_config=None, +) -> EvaluationResult: + """ + :param model: A pyfunc model instance, or a URI referring to such a model. + + :param model_type: A string describing the model type. The default evaluator + supports "regressor" and "classifier" as model types. + :param dataset: An instance of `EvaluationDataset` containing features + labels (optional) for model evaluation. + :param run_id: The ID of the MLflow Run to which to log results. If + unspecified, behavior depends on the specified `evaluator`. + When `run_id` is unspecified, the default evaluator logs + results to the current active run, creating a new active run if + one does not exist. + :param evaluators: The name of the evaluator to use for model evaluations, or + a list of evaluator names. If unspecified, all evaluators + capable of evaluating the specified model on the specified + dataset are used. The default evaluator can be referred to + by the name 'default'. + :param evaluator_config: A dictionary of additional configurations to supply + to the evaluator. If multiple evaluators are + specified, each configuration should be supplied as + a nested dictionary whose key is the evaluator name. + :return: An `EvaluationResult` instance containing evaluation results. + """ + # import _model_evaluation_registry inside function to avoid circuit importing + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + if evaluators is None: + evaluators = "default" + + if not isinstance(evaluators, list): + evaluators = [evaluators] + evaluator_config = {evaluators[0]: evaluator_config} + + if isinstance(model, str): + model = mlflow.pyfunc.load_model(model) + + with StartRunOrReuseActiveRun(run_id) as actual_run_id: + client = mlflow.tracking.MlflowClient() + dataset._log_dataset_tag(client, actual_run_id) + + eval_results = [] + for evaluator_name in evaluators: + config = evaluator_config[evaluator_name] + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") + continue + + if evaluator.can_evaluate(model_type, config): + _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") + result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) + eval_results.append(result) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) + + return merged_eval_result diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 64fe528092f1d..a29d7e00ca774 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -79,7 +79,9 @@ def test_classifier_evaluate(classifier_model, iris_dataset): iris_dataset, run_id=None, evaluators="dummy_evaluator", - evaluator_config={"can_evaluate": True,}, + evaluator_config={ + "can_evaluate": True, + }, ) artifact_name = "confusion_matrix_on_iris_dataset.csv" @@ -96,8 +98,7 @@ def test_classifier_evaluate(classifier_model, 
iris_dataset): run.info.run_id, artifact_name ) assert np.array_equal( - returned_confusion_matrix_artifact.load(saved_artifact_path), - expected_artifact + returned_confusion_matrix_artifact.load(saved_artifact_path), expected_artifact ) @@ -121,7 +122,9 @@ def test_regressor_evaluate(regressor_model, iris_dataset): iris_dataset, run_id=None, evaluators="dummy_evaluator", - evaluator_config={"can_evaluate": True,}, + evaluator_config={ + "can_evaluate": True, + }, ) _, saved_metrics, _, _ = get_run_data(run.info.run_id) assert saved_metrics == expected_saved_metrics diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 15e53eda523e7..9b6cd20e22920 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -14,7 +14,6 @@ class Array2DEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): pd.DataFrame(self._content).to_csv(output_artifact_path, index=False) @@ -29,13 +28,7 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return evaluator_config.get("can_evaluate") and model_type in ["classifier", "regressor"] def evaluate( - self, - model, - model_type, - dataset, - run_id, - evaluator_config=None, - **kwargs + self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs ) -> EvaluationResult: client = mlflow.tracking.MlflowClient() X, y = dataset._extract_features_and_labels() @@ -61,8 +54,7 @@ def evaluate( mean_absolute_error = sk_metrics.mean_absolute_error(y, y_pred) mean_squared_error = sk_metrics.mean_squared_error(y, y_pred) metrics = EvaluationMetrics( - mean_absolute_error=mean_absolute_error, - mean_squared_error=mean_squared_error + mean_absolute_error=mean_absolute_error, mean_squared_error=mean_squared_error ) self._log_metrics(run_id, metrics, dataset.name) artifacts = {} From a4db525e68f84dcb1683221926e594d46dda5eee Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 2021 12:13:25 +0800 Subject: [PATCH 028/120] lazy load pyspark Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 8d4074410589a..68fc26b25f674 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -14,7 +14,6 @@ from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string from mlflow.pyfunc import PyFuncModel -import pyspark.sql import logging @@ -156,6 +155,13 @@ def __init__(self, data, labels, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. 
a delta table, parquet file) """ + try: + from pyspark.sql import DataFrame as SparkDataFrame + + supported_dataframe_types = (pd.DataFrame, SparkDataFrame) + except ImportError: + supported_dataframe_types = (pd.DataFrame,) + if name is not None and '"' in name: raise ValueError(f'Dataset name cannot include a double quote (") but got name {name}') if path is not None and '"' in path: @@ -167,11 +173,11 @@ def __init__(self, data, labels, name=None, path=None): "If data is a numpy array or list of evaluation features, " "labels must be a numpy array or list of evaluation labels" ) - elif isinstance(data, (pd.DataFrame, pyspark.sql.DataFrame)): + elif isinstance(data, supported_dataframe_types): if not isinstance(labels, str): raise ValueError( - "If data is a Pandas DataFrame, labels must be the string name of a column " - "from `data` that contains evaluation labels" + "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " + "string name of a column from `data` that contains evaluation labels" ) else: raise ValueError( @@ -187,8 +193,8 @@ def __init__(self, data, labels, name=None, path=None): def _extract_features_and_labels(self): if isinstance(self.data, np.ndarray): return self.data, self.labels - elif isinstance(self.data, (pd.DataFrame, pyspark.sql.DataFrame)): - if isinstance(self.data, pyspark.sql.DataFrame): + else: + if not isinstance(self.data, pd.DataFrame): data = self.data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() _logger.warning( f"Only the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows in the " @@ -198,8 +204,6 @@ def _extract_features_and_labels(self): data = self.data feature_cols = [x for x in data.columns if x != self.labels] return data[feature_cols], data[self.labels] - else: - raise ValueError(f"Unsupported data type: {type(self.data)}") @staticmethod def _array_like_obj_to_bytes(data): From 035346f4dc9e34d4ac96afc47070abe042cef214 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 2021 17:26:27 +0800 Subject: [PATCH 029/120] revert export Signed-off-by: Weichen Xu --- mlflow/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mlflow/__init__.py b/mlflow/__init__.py index c66ba9eff8201..41291b8f6e833 100644 --- a/mlflow/__init__.py +++ b/mlflow/__init__.py @@ -31,7 +31,6 @@ from mlflow.utils.logging_utils import _configure_mlflow_loggers import mlflow.tracking._model_registry.fluent import mlflow.tracking.fluent -import mlflow.models # Filter annoying Cython warnings that serve no good purpose, and so before # importing other modules. 
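An aside on the "lazy load pyspark" patch above: moving the pyspark import inside the constructor keeps pyspark an optional dependency. A minimal standalone sketch of that pattern (illustrative only, not the exact MLflow helper; the function name is made up):

import pandas as pd

def _supported_dataframe_types():
    # Include the Spark DataFrame type only when pyspark is importable,
    # so pyspark stays an optional dependency of the evaluation API.
    try:
        from pyspark.sql import DataFrame as SparkDataFrame

        return (pd.DataFrame, SparkDataFrame)
    except ImportError:
        return (pd.DataFrame,)

# Usage: isinstance(data, _supported_dataframe_types())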
@@ -152,7 +151,7 @@ delete_run = mlflow.tracking.fluent.delete_run register_model = mlflow.tracking._model_registry.fluent.register_model autolog = mlflow.tracking.fluent.autolog -evaluate = mlflow.models.evaluate + run = projects.run @@ -192,5 +191,4 @@ "set_registry_uri", "list_run_infos", "autolog", - "evaluate", ] + _model_flavors_supported From 34227810c96422bb5733f9babc5722e89fed3cab Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 2021 20:44:33 +0800 Subject: [PATCH 030/120] fix curcit import Signed-off-by: Weichen Xu --- mlflow/models/evaluation/__init__.py | 12 ++++++++++- mlflow/models/evaluation/base.py | 30 +++++++++++++++++++++------- tests/models/test_evaluation.py | 23 +++++++++++++++++++++ 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/mlflow/models/evaluation/__init__.py b/mlflow/models/evaluation/__init__.py index 2ccda9894beca..cf64e897f622f 100644 --- a/mlflow/models/evaluation/__init__.py +++ b/mlflow/models/evaluation/__init__.py @@ -1,4 +1,4 @@ -from .base import ( +from mlflow.models.evaluation.base import ( ModelEvaluator, EvaluationDataset, EvaluationResult, @@ -7,3 +7,13 @@ evaluate, list_evaluators, ) + +__all__ = [ + "ModelEvaluator", + "EvaluationDataset", + "EvaluationResult", + "EvaluationMetrics", + "EvaluationArtifact", + "evaluate", + "list_evaluators", +] diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 68fc26b25f674..3ab2971d88fd4 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -4,7 +4,6 @@ import time import numpy as np import pandas as pd -import pickle import json import os from mlflow.exceptions import MlflowException @@ -13,7 +12,6 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string -from mlflow.pyfunc import PyFuncModel import logging @@ -205,10 +203,16 @@ def _extract_features_and_labels(self): feature_cols = [x for x in data.columns if x != self.labels] return data[feature_cols], data[self.labels] + @staticmethod + def _gen_md5(data): + md5_gen = hashlib.md5() + md5_gen.update(data) + return md5_gen.hexdigest() + @staticmethod def _array_like_obj_to_bytes(data): if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + data.columns.to_numpy().tobytes() + return data.to_numpy().tobytes() + ','.join(list(data.columns)).encode("UTF-8") elif isinstance(data, np.ndarray): return data.tobytes() elif isinstance(data, list): @@ -229,7 +233,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): ) md5_gen.update( EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] ) ) @@ -314,7 +318,13 @@ def _log_metrics(self, run_id, metrics, dataset_name): ) def evaluate( - self, model: PyFuncModel, model_type, dataset, run_id, evaluator_config=None, **kwargs + self, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config=None, + **kwargs ) -> EvaluationResult: """ :param model: A pyfunc model instance. @@ -373,7 +383,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def evaluate( - model: Union[str, PyFuncModel], + model: Union[str, "mlflow.pyfunc.PyFuncModel"], model_type, dataset, run_id=None, @@ -403,8 +413,9 @@ def evaluate( a nested dictionary whose key is the evaluator name. :return: An `EvaluationResult` instance containing evaluation results. 
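To make the evaluate() API described above concrete, here is a hedged end-to-end sketch. It mirrors the tests later in this series; the "dummy_evaluator" name assumes the mlflow-test-plugin is installed, and any registered evaluator name could be substituted:

import mlflow
import mlflow.sklearn
import sklearn.datasets
import sklearn.linear_model
from mlflow.models.evaluation import EvaluationDataset, evaluate

X, y = sklearn.datasets.load_iris(return_X_y=True)
model = sklearn.linear_model.LogisticRegression(max_iter=1000).fit(X, y)

with mlflow.start_run() as run:
    mlflow.sklearn.log_model(model, "model")
    eval_dataset = EvaluationDataset(data=X, labels=y, name="iris_eval")
    result = evaluate(
        model=f"runs:/{run.info.run_id}/model",  # or a loaded mlflow.pyfunc.PyFuncModel
        model_type="classifier",
        dataset=eval_dataset,
        evaluators="dummy_evaluator",  # assumes this evaluator is registered
    )

print(result.metrics)    # EvaluationMetrics (a dict subclass)
print(result.artifacts)  # dict of artifact name -> EvaluationArtifact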
""" - # import _model_evaluation_registry inside function to avoid circuit importing + # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + from mlflow.pyfunc import PyFuncModel if evaluators is None: evaluators = "default" @@ -415,6 +426,11 @@ def evaluate( if isinstance(model, str): model = mlflow.pyfunc.load_model(model) + elif isinstance(model, PyFuncModel): + pass + else: + raise ValueError('The model argument must be a URI str referring to mlflow model or ' + 'an instance of `mlflow.pyfunc.PyFuncModel`.') with StartRunOrReuseActiveRun(run_id) as actual_run_id: client = mlflow.tracking.MlflowClient() diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index a29d7e00ca774..d54e017f95f89 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -6,6 +6,7 @@ import sklearn.linear_model import pytest import numpy as np +import pandas as pd from sklearn.metrics import ( accuracy_score, @@ -59,6 +60,15 @@ def iris_dataset(): return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") +@pytest.fixture(scope="module") +def iris_pandas_df_dataset(): + X, y = get_iris() + eval_X, eval_y = X[0::3], y[0::3] + data = pd.DataFrame({'f1': eval_X[:, 0], 'f2': eval_X[:, 1], 'y': eval_y}) + labels = 'y' + return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") + + def test_classifier_evaluate(classifier_model, iris_dataset): y_true = iris_dataset.labels y_pred = classifier_model.predict(iris_dataset.data) @@ -130,3 +140,16 @@ def test_regressor_evaluate(regressor_model, iris_dataset): assert saved_metrics == expected_saved_metrics assert eval_result.metrics == expected_metrics + + +def test_dataset_name(): + X, y = get_iris() + d1 = EvaluationDataset(data=X, labels=y, name='a1') + assert d1.name == 'a1' + d2 = EvaluationDataset(data=X, labels=y) + d2.name == d2.hash + + +def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): + assert iris_dataset.hash == '49a04c127e5441e0f27e63a325b5fa69' + assert iris_pandas_df_dataset.hash == 'd6770fd5fffe651cb95e965854920df9' From 0ad527e5fd4fb5fcfac86c1d4b198f6fa75095c5 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 28 Nov 2021 22:00:51 +0800 Subject: [PATCH 031/120] update tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 23 +++++- tests/models/test_evaluation.py | 78 ++++++++++++------- .../mlflow_test_plugin/dummy_evaluator.py | 2 +- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3ab2971d88fd4..7c741098a0537 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -323,9 +323,9 @@ def evaluate( model_type, dataset, run_id, - evaluator_config=None, + evaluator_config, **kwargs - ) -> EvaluationResult: + ) -> "mlflow.models.evaluation.EvaluationResult": """ :param model: A pyfunc model instance. 
:param model_type: A string describing the model type (e.g., "regressor", @@ -420,9 +420,22 @@ def evaluate( if evaluators is None: evaluators = "default" - if not isinstance(evaluators, list): + if isinstance(evaluators, str): evaluators = [evaluators] + if not (evaluator_config is None or isinstance(evaluator_config, dict)): + raise ValueError('If `evaluators` argument is a str, evaluator_config must be None ' + 'or a dict.') evaluator_config = {evaluators[0]: evaluator_config} + elif isinstance(evaluators, list): + if not ( + isinstance(evaluator_config, dict) and all( + k in evaluators and isinstance(v, dict) + for k, v in evaluator_config.items() + ) + ): + raise ValueError('If `evaluators` argument is a evaluator name list, evaluator_config' + 'must be a dict contains mapping from evaluator name to individual ' + 'evaluator config dict.') if isinstance(model, str): model = mlflow.pyfunc.load_model(model) @@ -438,7 +451,9 @@ def evaluate( eval_results = [] for evaluator_name in evaluators: - config = evaluator_config[evaluator_name] + config = evaluator_config.get(evaluator_name) + if config is None: + config = {} try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) except MlflowException: diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index d54e017f95f89..159bcf7e33902 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -15,8 +15,8 @@ mean_squared_error, ) -from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from mlflow.tracking.artifact_utils import get_artifact_uri +import json def get_iris(): @@ -27,8 +27,7 @@ def get_iris(): def get_run_data(run_id): client = mlflow.tracking.MlflowClient() data = client.get_run(run_id).data - # Ignore tags mlflow logs by default (e.g. 
"mlflow.user") - tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")} + tags = {k: v for k, v in data.tags.items()} artifacts = [f.path for f in client.list_artifacts(run_id)] return data.params, data.metrics, tags, artifacts @@ -38,19 +37,25 @@ def get_local_artifact_path(run_id, artifact_path): @pytest.fixture(scope="module") -def regressor_model(): +def regressor_model_uri(): X, y = get_iris() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) - return reg + with mlflow.start_run() as run: + mlflow.sklearn.log_model(reg, 'reg_model') + + return get_artifact_uri(run.info.run_id, 'reg_model') @pytest.fixture(scope="module") -def classifier_model(): +def classifier_model_uri(): X, y = get_iris() clf = sklearn.linear_model.LogisticRegression() clf.fit(X, y) - return clf + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, 'clf_model') + + return get_artifact_uri(run.info.run_id, 'clf_model') @pytest.fixture(scope="module") @@ -69,8 +74,9 @@ def iris_pandas_df_dataset(): return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model, iris_dataset): +def test_classifier_evaluate(classifier_model_uri, iris_dataset): y_true = iris_dataset.labels + classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) y_pred = classifier_model.predict(iris_dataset.data) expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { @@ -89,9 +95,6 @@ def test_classifier_evaluate(classifier_model, iris_dataset): iris_dataset, run_id=None, evaluators="dummy_evaluator", - evaluator_config={ - "can_evaluate": True, - }, ) artifact_name = "confusion_matrix_on_iris_dataset.csv" @@ -112,8 +115,9 @@ def test_classifier_evaluate(classifier_model, iris_dataset): ) -def test_regressor_evaluate(regressor_model, iris_dataset): +def test_regressor_evaluate(regressor_model_uri, iris_dataset): y_true = iris_dataset.labels + regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) y_pred = regressor_model.predict(iris_dataset.data) expected_mae = mean_absolute_error(y_true, y_pred) expected_mse = mean_squared_error(y_true, y_pred) @@ -125,21 +129,19 @@ def test_regressor_evaluate(regressor_model, iris_dataset): "mean_absolute_error_on_iris_dataset": expected_mae, "mean_squared_error_on_iris_dataset": expected_mse, } - with mlflow.start_run() as run: - eval_result = evaluate( - regressor_model, - "regressor", - iris_dataset, - run_id=None, - evaluators="dummy_evaluator", - evaluator_config={ - "can_evaluate": True, - }, - ) - _, saved_metrics, _, _ = get_run_data(run.info.run_id) - assert saved_metrics == expected_saved_metrics - assert eval_result.metrics == expected_metrics + for model in [regressor_model, regressor_model_uri]: + with mlflow.start_run() as run: + eval_result = evaluate( + model, + "regressor", + iris_dataset, + run_id=None, + evaluators="dummy_evaluator", + ) + _, saved_metrics, _, _ = get_run_data(run.info.run_id) + assert saved_metrics == expected_saved_metrics + assert eval_result.metrics == expected_metrics def test_dataset_name(): @@ -153,3 +155,27 @@ def test_dataset_name(): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): assert iris_dataset.hash == '49a04c127e5441e0f27e63a325b5fa69' assert iris_pandas_df_dataset.hash == 'd6770fd5fffe651cb95e965854920df9' + + +def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): + with mlflow.start_run() as run: + client = mlflow.tracking.MlflowClient() + iris_dataset._log_dataset_tag(client, 
run.info.run_id) + _, _, tags, _ = get_run_data(run.info.run_id) + assert json.loads(tags['mlflow.datasets']) == [iris_dataset._metadata] + + # Test appending dataset tag + iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id) + _, _, tags, _ = get_run_data(run.info.run_id) + assert json.loads(tags['mlflow.datasets']) == [ + iris_dataset._metadata, iris_pandas_df_dataset._metadata + ] + + # Test log repetitive dataset + iris_dataset._log_dataset_tag(client, run.info.run_id) + _, _, tags, _ = get_run_data(run.info.run_id) + assert json.loads(tags['mlflow.datasets']) == [ + iris_dataset._metadata, iris_pandas_df_dataset._metadata + ] + + diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 9b6cd20e22920..700b705a37124 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -25,7 +25,7 @@ def load(self, local_artifact_path): class DummyEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): - return evaluator_config.get("can_evaluate") and model_type in ["classifier", "regressor"] + return model_type in ["classifier", "regressor"] def evaluate( self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs From 2ea29c62bfffc5461bf77f3da15b5c00f51de19b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 29 Nov 2021 19:59:33 +0800 Subject: [PATCH 032/120] fix conftest.py Signed-off-by: Weichen Xu --- tests/conftest.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 94df3fd2a807c..b3b213c85a4eb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import os import inspect +import shutil from unittest import mock import pytest @@ -96,3 +97,14 @@ def new_exception(msg, *_, **__): yield else: yield + + +@pytest.fixture(autouse=True, scope="module") +def clean_up_mlruns_direcotry(request): + """ + Clean up an `mlruns` directory on each test module teardown. + """ + yield + mlruns_dir = os.path.join(request.config.rootpath, "mlruns") + if os.path.exists(mlruns_dir): + shutil.rmtree(mlruns_dir) From 6bcbb0c4730d4665c3e8a8dda39eae8a4a22f4b1 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 29 Nov 2021 21:10:25 +0800 Subject: [PATCH 033/120] Revert "fix conftest.py" This reverts commit 2ea29c62bfffc5461bf77f3da15b5c00f51de19b. --- tests/conftest.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b3b213c85a4eb..94df3fd2a807c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import os import inspect -import shutil from unittest import mock import pytest @@ -97,14 +96,3 @@ def new_exception(msg, *_, **__): yield else: yield - - -@pytest.fixture(autouse=True, scope="module") -def clean_up_mlruns_direcotry(request): - """ - Clean up an `mlruns` directory on each test module teardown. 
- """ - yield - mlruns_dir = os.path.join(request.config.rootpath, "mlruns") - if os.path.exists(mlruns_dir): - shutil.rmtree(mlruns_dir) From 46f3264356d8caa1a77160b1a23008da2cb58011 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 29 Nov 2021 21:15:38 +0800 Subject: [PATCH 034/120] fix tests Signed-off-by: Weichen Xu --- tests/models/test_evaluation.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 159bcf7e33902..b4b43b413b540 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -37,25 +37,19 @@ def get_local_artifact_path(run_id, artifact_path): @pytest.fixture(scope="module") -def regressor_model_uri(): +def regressor_model(): X, y = get_iris() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) - with mlflow.start_run() as run: - mlflow.sklearn.log_model(reg, 'reg_model') - - return get_artifact_uri(run.info.run_id, 'reg_model') + return reg @pytest.fixture(scope="module") -def classifier_model_uri(): +def classifier_model(): X, y = get_iris() clf = sklearn.linear_model.LogisticRegression() clf.fit(X, y) - with mlflow.start_run() as run: - mlflow.sklearn.log_model(clf, 'clf_model') - - return get_artifact_uri(run.info.run_id, 'clf_model') + return clf @pytest.fixture(scope="module") @@ -74,7 +68,11 @@ def iris_pandas_df_dataset(): return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model_uri, iris_dataset): +def test_classifier_evaluate(classifier_model, iris_dataset): + with mlflow.start_run() as run: + mlflow.sklearn.log_model(classifier_model, 'clf_model') + classifier_model_uri = get_artifact_uri(run.info.run_id, 'clf_model') + y_true = iris_dataset.labels classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) y_pred = classifier_model.predict(iris_dataset.data) @@ -115,7 +113,11 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): ) -def test_regressor_evaluate(regressor_model_uri, iris_dataset): +def test_regressor_evaluate(regressor_model, iris_dataset): + with mlflow.start_run() as run: + mlflow.sklearn.log_model(regressor_model, 'reg_model') + regressor_model_uri = get_artifact_uri(run.info.run_id, 'reg_model') + y_true = iris_dataset.labels regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) y_pred = regressor_model.predict(iris_dataset.data) From d310359fe595d2de43671de1767ccc77e9bdfef9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 29 Nov 2021 22:39:55 +0800 Subject: [PATCH 035/120] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 110 +++++++++++++++++++++++-------- tests/models/test_evaluation.py | 34 +++++----- 2 files changed, 100 insertions(+), 44 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 7c741098a0537..6325055d034b0 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -19,10 +19,18 @@ class EvaluationMetrics(dict): + """ + Represent a dict of metrics. + """ + pass class EvaluationArtifact: + """ + Represent a artifact. Contains artifact uri and content. + """ + def __init__(self, uri, content=None): self._uri = uri self._content = content @@ -31,6 +39,10 @@ def _load_content_from_file(self, local_artifact_path): raise NotImplementedError() def load(self, local_artifact_path=None): + """ + If `local_artifact_path` is None, download artifact from the artifact uri and load it. 
+ otherwise load artifact content from specified path. + """ if local_artifact_path is None: return self._load_content_from_file(local_artifact_path) else: @@ -40,6 +52,7 @@ def load(self, local_artifact_path=None): self._load_content_from_file(local_artifact_file) def save(self, output_artifact_path): + """Save artifact content into specified path.""" raise NotImplementedError() @property @@ -60,6 +73,11 @@ def uri(self) -> str: class EvaluationResult: + """ + Represent an return value of `mlflow.evaluate()` API. Contains metrics dict and + artifact dict. + """ + def __init__(self, metrics, artifacts): self._metrics = metrics self._artifacts = artifacts @@ -189,6 +207,11 @@ def __init__(self, data, labels, name=None, path=None): self._hash = None def _extract_features_and_labels(self): + """ + Extract features data and labels data. + For spark dataframe, will only extract the first SPARK_DATAFRAME_LIMIT rows data + and emit warning. + """ if isinstance(self.data, np.ndarray): return self.data, self.labels else: @@ -203,16 +226,14 @@ def _extract_features_and_labels(self): feature_cols = [x for x in data.columns if x != self.labels] return data[feature_cols], data[self.labels] - @staticmethod - def _gen_md5(data): - md5_gen = hashlib.md5() - md5_gen.update(data) - return md5_gen.hexdigest() - @staticmethod def _array_like_obj_to_bytes(data): + """ + Helper method to convert pandas dataframe/numpy array/list into bytes for + MD5 calculation purpose. + """ if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + ','.join(list(data.columns)).encode("UTF-8") + return data.to_numpy().tobytes() + ",".join(list(data.columns)).encode("UTF-8") elif isinstance(data, np.ndarray): return data.tobytes() elif isinstance(data, list): @@ -222,6 +243,12 @@ def _array_like_obj_to_bytes(data): @staticmethod def _gen_md5_for_arraylike_obj(md5_gen, data): + """ + Helper method to generate MD5 hash array-like object, the MD5 will calculate over: + - array length + - first NUM_SAMPLE_ROWS_FOR_HASH rows content + - last NUM_SAMPLE_ROWS_FOR_HASH rows content + """ md5_gen.update(np.int64(len(data)).tobytes()) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) @@ -233,12 +260,16 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): ) md5_gen.update( EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH:] + data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] ) ) @property def name(self): + """ + Dataset name, which is specified dataset name or the dataset hash if user don't specify + name. + """ return self._user_specified_name if self._user_specified_name is not None else self.hash @property @@ -260,6 +291,9 @@ def hash(self): @property def _metadata(self): + """ + Return dataset metadata containing name, hash, and optional path. + """ metadata = { "name": self.name, "hash": self.hash, @@ -269,6 +303,10 @@ def _metadata(self): return metadata def _log_dataset_tag(self, client, run_id): + """ + Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will + append current dataset metadata into existing tag content. 
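For reference, the "mlflow.datasets" tag maintained by _log_dataset_tag ends up holding a JSON list of per-dataset metadata dicts. A small sketch of reading it back (the run id is a placeholder):

import json
import mlflow

client = mlflow.tracking.MlflowClient()
run_id = "<run id>"  # hypothetical
datasets_tag = client.get_run(run_id).data.tags.get("mlflow.datasets", "[]")
for meta in json.loads(datasets_tag):
    # Each entry carries the dataset name, hash, and (optionally) path.
    print(meta["name"], meta["hash"], meta.get("path"))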
+ """ existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") if existing_dataset_metadata_str is not None: dataset_metadata_list = json.loads(existing_dataset_metadata_str) @@ -307,6 +345,9 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: raise NotImplementedError() def _log_metrics(self, run_id, metrics, dataset_name): + """ + Helper method to log metrics into specified run. + """ client = mlflow.tracking.MlflowClient() timestamp = int(time.time() * 1000) client.log_batch( @@ -318,15 +359,17 @@ def _log_metrics(self, run_id, metrics, dataset_name): ) def evaluate( - self, - model: "mlflow.pyfunc.PyFuncModel", - model_type, - dataset, - run_id, - evaluator_config, - **kwargs + self, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config, + **kwargs, ) -> "mlflow.models.evaluation.EvaluationResult": """ + The abstract API to log metrics and artifacts, and return evaluation results. + :param model: A pyfunc model instance. :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). @@ -354,6 +397,12 @@ def list_evaluators(): class StartRunOrReuseActiveRun: + """ + A manager context return: + - If there's an active run, return the active run id. + - otherwise start a mflow run with the specified run_id. + """ + def __init__(self, run_id): self.user_specified_run_id = run_id self.managed_run = None @@ -389,8 +438,12 @@ def evaluate( run_id=None, evaluators=None, evaluator_config=None, -) -> EvaluationResult: +) -> "mlflow.models.evaluation.EvaluationResult": """ + Evaluate a pyfunc model on specified dataset, log evaluation results (metrics and + artifacts) into active run or specified mlflow run), and return evaluation results + containing metrics and artifacts. + :param model: A pyfunc model instance, or a URI referring to such a model. :param model_type: A string describing the model type. The default evaluator @@ -423,27 +476,30 @@ def evaluate( if isinstance(evaluators, str): evaluators = [evaluators] if not (evaluator_config is None or isinstance(evaluator_config, dict)): - raise ValueError('If `evaluators` argument is a str, evaluator_config must be None ' - 'or a dict.') + raise ValueError( + "If `evaluators` argument is a str, evaluator_config must be None " "or a dict." + ) evaluator_config = {evaluators[0]: evaluator_config} elif isinstance(evaluators, list): if not ( - isinstance(evaluator_config, dict) and all( - k in evaluators and isinstance(v, dict) - for k, v in evaluator_config.items() - ) + isinstance(evaluator_config, dict) + and all(k in evaluators and isinstance(v, dict) for k, v in evaluator_config.items()) ): - raise ValueError('If `evaluators` argument is a evaluator name list, evaluator_config' - 'must be a dict contains mapping from evaluator name to individual ' - 'evaluator config dict.') + raise ValueError( + "If `evaluators` argument is a evaluator name list, evaluator_config" + "must be a dict contains mapping from evaluator name to individual " + "evaluator config dict." + ) if isinstance(model, str): model = mlflow.pyfunc.load_model(model) elif isinstance(model, PyFuncModel): pass else: - raise ValueError('The model argument must be a URI str referring to mlflow model or ' - 'an instance of `mlflow.pyfunc.PyFuncModel`.') + raise ValueError( + "The model argument must be a URI str referring to mlflow model or " + "an instance of `mlflow.pyfunc.PyFuncModel`." 
+ ) with StartRunOrReuseActiveRun(run_id) as actual_run_id: client = mlflow.tracking.MlflowClient() diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index b4b43b413b540..5b4b3658d46ac 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -63,15 +63,15 @@ def iris_dataset(): def iris_pandas_df_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - data = pd.DataFrame({'f1': eval_X[:, 0], 'f2': eval_X[:, 1], 'y': eval_y}) - labels = 'y' + data = pd.DataFrame({"f1": eval_X[:, 0], "f2": eval_X[:, 1], "y": eval_y}) + labels = "y" return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") def test_classifier_evaluate(classifier_model, iris_dataset): with mlflow.start_run() as run: - mlflow.sklearn.log_model(classifier_model, 'clf_model') - classifier_model_uri = get_artifact_uri(run.info.run_id, 'clf_model') + mlflow.sklearn.log_model(classifier_model, "clf_model") + classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") y_true = iris_dataset.labels classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) @@ -115,8 +115,8 @@ def test_classifier_evaluate(classifier_model, iris_dataset): def test_regressor_evaluate(regressor_model, iris_dataset): with mlflow.start_run() as run: - mlflow.sklearn.log_model(regressor_model, 'reg_model') - regressor_model_uri = get_artifact_uri(run.info.run_id, 'reg_model') + mlflow.sklearn.log_model(regressor_model, "reg_model") + regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") y_true = iris_dataset.labels regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) @@ -148,15 +148,15 @@ def test_regressor_evaluate(regressor_model, iris_dataset): def test_dataset_name(): X, y = get_iris() - d1 = EvaluationDataset(data=X, labels=y, name='a1') - assert d1.name == 'a1' + d1 = EvaluationDataset(data=X, labels=y, name="a1") + assert d1.name == "a1" d2 = EvaluationDataset(data=X, labels=y) d2.name == d2.hash def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): - assert iris_dataset.hash == '49a04c127e5441e0f27e63a325b5fa69' - assert iris_pandas_df_dataset.hash == 'd6770fd5fffe651cb95e965854920df9' + assert iris_dataset.hash == "49a04c127e5441e0f27e63a325b5fa69" + assert iris_pandas_df_dataset.hash == "d6770fd5fffe651cb95e965854920df9" def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): @@ -164,20 +164,20 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): client = mlflow.tracking.MlflowClient() iris_dataset._log_dataset_tag(client, run.info.run_id) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags['mlflow.datasets']) == [iris_dataset._metadata] + assert json.loads(tags["mlflow.datasets"]) == [iris_dataset._metadata] # Test appending dataset tag iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags['mlflow.datasets']) == [ - iris_dataset._metadata, iris_pandas_df_dataset._metadata + assert json.loads(tags["mlflow.datasets"]) == [ + iris_dataset._metadata, + iris_pandas_df_dataset._metadata, ] # Test log repetitive dataset iris_dataset._log_dataset_tag(client, run.info.run_id) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags['mlflow.datasets']) == [ - iris_dataset._metadata, iris_pandas_df_dataset._metadata + assert json.loads(tags["mlflow.datasets"]) == [ + iris_dataset._metadata, + iris_pandas_df_dataset._metadata, ] - - From 1b3519b8b1311fcd6dd0686d77700a4de3689f42 Mon Sep 
17 00:00:00 2001 From: Weichen Xu Date: Tue, 30 Nov 2021 17:49:46 +0800 Subject: [PATCH 036/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 3 ++- tests/models/test_evaluation.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 6325055d034b0..ff7860aed83e8 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -470,7 +470,7 @@ def evaluate( from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry from mlflow.pyfunc import PyFuncModel - if evaluators is None: + if not evaluators: evaluators = "default" if isinstance(evaluators, str): @@ -481,6 +481,7 @@ def evaluate( ) evaluator_config = {evaluators[0]: evaluator_config} elif isinstance(evaluators, list): + evaluators = set(evaluators) if not ( isinstance(evaluator_config, dict) and all(k in evaluators and isinstance(v, dict) for k, v in evaluator_config.items()) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 5b4b3658d46ac..91e1b534bfe9e 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -24,6 +24,11 @@ def get_iris(): return iris.data[:, :2], iris.target +def get_diabetes_dataset(): + data = sklearn.datasets.load_diabetes() + return data.data[:, :2], data.target + + def get_run_data(run_id): client = mlflow.tracking.MlflowClient() data = client.get_run(run_id).data @@ -38,7 +43,7 @@ def get_local_artifact_path(run_id, artifact_path): @pytest.fixture(scope="module") def regressor_model(): - X, y = get_iris() + X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) return reg From 09099deedca96d8e0b6a423fa58fe4f499b8c585 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 30 Nov 2021 21:33:08 +0800 Subject: [PATCH 037/120] default evaluator Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 +- mlflow/models/evaluation/default_evaluator.py | 245 ++++++++++++++++++ .../models/evaluation/evaluator_registry.py | 7 +- 3 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 mlflow/models/evaluation/default_evaluator.py diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index ff7860aed83e8..3aa9916f898a1 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -49,7 +49,7 @@ def load(self, local_artifact_path=None): with TempDir() as temp_dir: local_artifact_file = temp_dir.path("local_artifact") _download_artifact_from_uri(self._uri, local_artifact_file) - self._load_content_from_file(local_artifact_file) + return self._load_content_from_file(local_artifact_file) def save(self, output_artifact_path): """Save artifact content into specified path.""" diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py new file mode 100644 index 0000000000000..972fccc2ee421 --- /dev/null +++ b/mlflow/models/evaluation/default_evaluator.py @@ -0,0 +1,245 @@ +import mlflow +from mlflow.models.evaluation.base import ( + ModelEvaluator, + EvaluationMetrics, + EvaluationArtifact, +) +from mlflow.utils.file_utils import TempDir +from mlflow.tracking.artifact_utils import get_artifact_uri +from sklearn import metrics as sk_metrics +import matplotlib.pyplot as pyplot +import scikitplot +import shap +import math +import pandas as pd + +shap.initjs() + +""" +[P0] Accuracy: Calculates how often predictions equal labels. 
+[P0] BinaryCrossentropy: Computes the crossentropy metric between the labels and predictions. +[P0] Hinge: Computes the hinge metric between y_true and y_pred. +[P0] Sum: Computes the (weighted) sum of the given values. +[P0] Mean: Computes the (weighted) mean of the given values. +[P0] ExampleCount: Computes the total number of evaluation examples. +[P0] MeanAbsoluteError: Computes the mean absolute error between the labels and predictions. +[P0] MeanSquaredError: Computes the mean squared error between y_true and y_pred. +[P0] RootMeanSquaredError: Computes root mean squared error metric between y_true and y_pred. + +[P0] TrueNegatives: Calculates the number of true negatives. +[P0] TruePositives: Calculates the number of true positives. +[P0] FalseNegatives: Calculates the number of false negatives. +[P0] FalsePositives: Calculates the number of false positives. +https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix + +[P0] Precision: Computes the precision of the predictions with respect to the labels. +[P0] Recall: Computes the recall of the predictions with respect to the labels. +[P0] AUC: Approximates the AUC (Area under the curve) of the ROC or PR curves. +[P0] F1 Score: 2*precision*recall / (precision+recall) + +[P0] BinaryClassConfusionMatrix + +Plots +[P0] Confusion matrix +[P0] Interactive ROC curve with metrics (TP/TN/FP/FN/Acc/F1/AUC), binary classification +[P0] Lift chart + +Global explainability +[P0] Model built-in feature importance (supported models) +[P0] SHAP explainers + [P0] Summary plot +""" + +from PIL.Image import Image, open as open_image + + +class ImageEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.save(output_artifact_path) + + def _load_content_from_file(self, local_artifact_path): + self._content = open_image(local_artifact_path) + return self._content + + +class CsvEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): + self._content = pd.read_csv(local_artifact_path) + return self._content + + +class DefaultEvaluator(ModelEvaluator): + def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + return model_type in ["classifier", "regressor"] + + def _log_image_artifact( + self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name + ): + client = mlflow.tracking.MlflowClient() + pyplot.clf() + do_plot() + artifact_file_name = f"{artifact_name}_on_{dataset_name}.png" + artifact_file_local_path = temp_dir.path(artifact_file_name) + pyplot.savefig(artifact_file_local_path) + client.log_artifact(run_id, artifact_file_local_path) + artifact = ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) + artifact.load(artifact_file_local_path) + artifacts[artifact_file_name] = artifact + + def _log_pandas_df_artifact( + self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name + ): + client = mlflow.tracking.MlflowClient() + artifact_file_name = f"{artifact_name}_on_{dataset_name}.csv" + artifact_file_local_path = temp_dir.path(artifact_file_name) + pandas_df.to_csv(artifact_file_local_path, index=False) + client.log_artifact(run_id, artifact_file_local_path) + artifact = CsvEvaluationArtifact( + uri=get_artifact_uri(run_id, artifact_file_name), + content=pandas_df, + ) + artifact.load(artifact_file_local_path) + artifacts[artifact_file_name] = 
artifact + + def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id): + explainer = shap.Explainer(model, X) + shap_values = explainer(X) + + def plot_summary(): + shap.plots.beeswarm(shap_values, show=False) + + self._log_image_artifact( + artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name + ) + + def plot_feature_importance(): + shap.plots.bar(shap_values, show=False) + + self._log_image_artifact( + artifacts, + temp_dir, + plot_feature_importance, + run_id, + "shap_feature_importance", + dataset_name, + ) + + def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 + label_list = sorted(list(set(y))) + assert label_list[0] >= 0, "Evaluation dataset labels must be positive integers." + max_label = label_list[-1] + num_classes = max_label + 1 + + y_pred = model.predict(X) + + is_binomial = num_classes <= 2 + + metrics = EvaluationMetrics() + artifacts = {} + metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) + metrics["example_count"] = len(X) + + # TODO: sum/mean on what data ? + # [P0] Sum: Computes the (weighted) sum of the given values. + # [P0] Mean: Computes the (weighted) mean of the given values. + + if is_binomial: + if hasattr(model, "predict_proba"): + y_probs = model.predict_proba(X) + y_prob = y_probs[:, 1] + else: + y_probs = None + y_prob = None + + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + + # TODO: + # compute hinge loss, this requires calling decision_function of the model + # e.g., see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function + + if y_probs is not None: + metrics["roc_auc"] = sk_metrics.roc_auc_score(y, y_prob) + fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_prob) + roc_curve_pandas_df = pd.DataFrame( + {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} + ) + self._log_pandas_df_artifact( + artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name + ) + + roc_auc = sk_metrics.auc(fpr, tpr) + metrics["precision_recall_auc"] = roc_auc + + def plot_roc_curve(): + sk_metrics.RocCurveDisplay( + fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator" + ).plot() + + self._log_image_artifact( + artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name + ) + + def plot_lift_curve(): + scikitplot.metrics.plot_lift_curve(y, y_probs) + + self._log_image_artifact( + artifacts, temp_dir, plot_lift_curve, run_id, "lift_curve", dataset_name + ) + + def plot_confusion_matrix(): + sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred) + + self._log_image_artifact( + artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name + ) + + self._log_metrics(run_id, metrics, dataset_name) + self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) + + return metrics, artifacts + + def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + metrics = EvaluationMetrics() + artifacts = {} + y_pred = model.predict(X) + metrics["example_count"] = len(X) + 
metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) + metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) + metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) + self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) + self._log_metrics(run_id, metrics, dataset_name) + return metrics, artifacts + + def evaluate( + self, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config, + **kwargs, + ): + with TempDir() as temp_dir: + X, y = dataset._extract_features_and_labels() + if model_type == "classifier": + return self._evaluate_classifier( + temp_dir, model, X, y, dataset.name, run_id, evaluator_config + ) + elif model_type == "regressor": + return self._evaluate_regressor( + temp_dir, model, X, y, dataset.name, run_id, evaluator_config + ) + else: + raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index 1c2366bf9849f..1822e313460ea 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -47,9 +47,12 @@ def get_evaluator(self, evaluator_name): _model_evaluation_registry = ModelEvaluatorRegistry() -def register_entrypoints(module): +def register_evaluators(module): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + + module._model_evaluation_registry.register("default", DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() # Put it in post-importing hook to avoid circuit importing -register_post_import_hook(register_entrypoints, __name__, overwrite=True) +register_post_import_hook(register_evaluators, __name__, overwrite=True) From 2f3faa69fb1b262274464a73ca627a53741e9837 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 30 Nov 2021 22:06:42 +0800 Subject: [PATCH 038/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 5 +++-- mlflow/models/evaluation/default_evaluator.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3aa9916f898a1..e91ed3951b448 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -47,8 +47,9 @@ def load(self, local_artifact_path=None): return self._load_content_from_file(local_artifact_path) else: with TempDir() as temp_dir: - local_artifact_file = temp_dir.path("local_artifact") - _download_artifact_from_uri(self._uri, local_artifact_file) + temp_dir_path = temp_dir.path() + _download_artifact_from_uri(self._uri, temp_dir_path) + local_artifact_file = temp_dir.path(os.listdir(temp_dir_path)[0]) return self._load_content_from_file(local_artifact_file) def save(self, output_artifact_path): diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 972fccc2ee421..7d290f0c04b02 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -105,7 +105,7 @@ def _log_pandas_df_artifact( artifacts[artifact_file_name] = artifact def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id): - explainer = shap.Explainer(model, X) + explainer = shap.Explainer(model.predict, X) shap_values = explainer(X) def plot_summary(): From 9aff5e827f3a341cd44b11d4717465deae696354 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 30 Nov 2021 22:46:16 +0800 
Subject: [PATCH 039/120] fix Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 7d290f0c04b02..63c97fba50715 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -3,6 +3,7 @@ ModelEvaluator, EvaluationMetrics, EvaluationArtifact, + EvaluationResult, ) from mlflow.utils.file_utils import TempDir from mlflow.tracking.artifact_utils import get_artifact_uri @@ -105,7 +106,7 @@ def _log_pandas_df_artifact( artifacts[artifact_file_name] = artifact def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id): - explainer = shap.Explainer(model.predict, X) + explainer = shap.Explainer(model._model_impl, X) shap_values = explainer(X) def plot_summary(): @@ -208,7 +209,7 @@ def plot_confusion_matrix(): self._log_metrics(run_id, metrics, dataset_name) self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) - return metrics, artifacts + return EvaluationResult(metrics, artifacts) def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): metrics = EvaluationMetrics() @@ -220,7 +221,7 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evalu metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) self._log_metrics(run_id, metrics, dataset_name) - return metrics, artifacts + return EvaluationResult(metrics, artifacts) def evaluate( self, From 1dd06174e9275bb254c75fb20067e6091555c809 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 08:43:38 +0800 Subject: [PATCH 040/120] fix Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 63c97fba50715..27b8034cc70cb 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -200,7 +200,7 @@ def plot_lift_curve(): ) def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred) + sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred, normalize='all') self._log_image_artifact( artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name From 57012d322d99402501e780b4854c6abeee051439 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 16:25:20 +0800 Subject: [PATCH 041/120] address comments Signed-off-by: Weichen Xu --- mlflow/__init__.py | 3 +- mlflow/models/__init__.py | 30 ++- mlflow/models/evaluation/base.py | 149 ++++++----- mlflow/models/evaluation/default_evaluator.py | 246 ------------------ .../models/evaluation/evaluator_registry.py | 3 - tests/models/test_evaluation.py | 11 +- .../mlflow_test_plugin/dummy_evaluator.py | 18 +- 7 files changed, 133 insertions(+), 327 deletions(-) delete mode 100644 mlflow/models/evaluation/default_evaluator.py diff --git a/mlflow/__init__.py b/mlflow/__init__.py index 41291b8f6e833..4ca31a1d8d705 100644 --- a/mlflow/__init__.py +++ b/mlflow/__init__.py @@ -42,6 +42,7 @@ import mlflow.projects as projects # noqa: E402 import mlflow.tracking as tracking # noqa: E402 +import mlflow.models # model flavors _model_flavors_supported = [] @@ 
-151,7 +152,7 @@ delete_run = mlflow.tracking.fluent.delete_run register_model = mlflow.tracking._model_registry.fluent.register_model autolog = mlflow.tracking.fluent.autolog - +evaluate = mlflow.models.evaluate run = projects.run diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index 9e36cb45fb3e0..20dd48fe0222c 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -21,19 +21,27 @@ For details, see `MLflow Models <../models.html>`_. """ -from .model import Model -from .flavor_backend import FlavorBackend -from .signature import ModelSignature, infer_signature -from .utils import ModelInputExample -from ..utils.environment import infer_pip_requirements from .evaluation import evaluate __all__ = [ - "Model", - "ModelSignature", - "ModelInputExample", - "infer_signature", - "FlavorBackend", - "infer_pip_requirements", "evaluate", ] + +try: + from .model import Model + from .flavor_backend import FlavorBackend + from .signature import ModelSignature, infer_signature + from .utils import ModelInputExample + from ..utils.environment import infer_pip_requirements + + __all__ += [ + "Model", + "ModelSignature", + "ModelInputExample", + "infer_signature", + "FlavorBackend", + "infer_pip_requirements", + "evaluate", + ] +except ImportError: + pass diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index e91ed3951b448..7288a70cf1ff8 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -1,9 +1,6 @@ from typing import Dict, Union import mlflow import hashlib -import time -import numpy as np -import pandas as pd import json import os from mlflow.exceptions import MlflowException @@ -20,7 +17,7 @@ class EvaluationMetrics(dict): """ - Represent a dict of metrics. + A dictionary of model evaluation metrics. """ pass @@ -28,7 +25,7 @@ class EvaluationMetrics(dict): class EvaluationArtifact: """ - Represent a artifact. Contains artifact uri and content. + A model evaluation artifact containing an artifact uri and content. """ def __init__(self, uri, content=None): @@ -36,21 +33,27 @@ def __init__(self, uri, content=None): self._content = content def _load_content_from_file(self, local_artifact_path): + """ + Abstract interface to load the content from local artifact file path, + assign the loaded content to `self._content`, and return the loaded content. + """ raise NotImplementedError() def load(self, local_artifact_path=None): """ - If `local_artifact_path` is None, download artifact from the artifact uri and load it. + If `local_artifact_path` is None, download artifact from the artifact uri, otherwise load artifact content from specified path. + then assign the loaded content to `self._content`, and return the loaded content. 
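A concrete EvaluationArtifact subclass only needs save() and _load_content_from_file(); the following CSV-backed sketch mirrors the CsvEvaluationArtifact used by the default evaluator earlier in this series (the class name here is made up):

import pandas as pd
from mlflow.models.evaluation import EvaluationArtifact

class CsvArtifactSketch(EvaluationArtifact):
    def save(self, output_artifact_path):
        # Persist the in-memory DataFrame content to a local CSV file.
        self._content.to_csv(output_artifact_path, index=False)

    def _load_content_from_file(self, local_artifact_path):
        # Parse the CSV and cache it as the artifact content.
        self._content = pd.read_csv(local_artifact_path)
        return self._content

# Per the docstring above, calling load() with no argument downloads the file
# from `uri` into a temp dir and then parses it via _load_content_from_file.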
""" if local_artifact_path is None: - return self._load_content_from_file(local_artifact_path) + self._load_content_from_file(local_artifact_path) else: with TempDir() as temp_dir: temp_dir_path = temp_dir.path() _download_artifact_from_uri(self._uri, temp_dir_path) local_artifact_file = temp_dir.path(os.listdir(temp_dir_path)[0]) - return self._load_content_from_file(local_artifact_file) + self._load_content_from_file(local_artifact_file) + return self._content def save(self, output_artifact_path): """Save artifact content into specified path.""" @@ -97,10 +100,11 @@ def load(cls, path): artifacts_dir = os.path.join(path, "artifacts") for artifact_name, meta in artifacts_metadata: - location = meta["location"] + uri = meta["uri"] ArtifactCls = _get_class_from_string(meta["class_name"]) - content = ArtifactCls.load_content_from_file(os.path.join(artifacts_dir, artifact_name)) - artifacts[artifact_name] = ArtifactCls(location=location, content=content) + artifact = ArtifactCls(uri=uri) + artifact.load(os.path.join(artifacts_dir, artifact_name)) + artifacts[artifact_name] = artifact return EvaluationResult(metrics=metrics, artifacts=artifacts) @@ -112,7 +116,7 @@ def save(self, path): artifacts_metadata = { artifact_name: { - "location": artifact.location, + "uri": artifact.uri, "class_name": _get_fully_qualified_class_name(artifact), } for artifact_name, artifact in self.artifacts.items() @@ -147,8 +151,8 @@ def artifacts(self) -> Dict[str, EvaluationArtifact]: class EvaluationDataset: """ - Represents an input dataset for model evaluation. This is intended for - use with the `mlflow.evaluate()`API. + An input dataset for model evaluation. This is intended for use with the `mlflow.evaluate()` + API. """ NUM_SAMPLE_ROWS_FOR_HASH = 5 @@ -172,6 +176,9 @@ def __init__(self, data, labels, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) """ + import numpy as np + import pandas as pd + try: from pyspark.sql import DataFrame as SparkDataFrame @@ -180,9 +187,9 @@ def __init__(self, data, labels, name=None, path=None): supported_dataframe_types = (pd.DataFrame,) if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include a double quote (") but got name {name}') + raise ValueError(f'Dataset name cannot include a double quote (") but got {name}') if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include a double quote (") but got name {path}') + raise ValueError(f'Dataset path cannot include a double quote (") but got {path}') if isinstance(data, (np.ndarray, list)): if not isinstance(labels, (np.ndarray, list)): @@ -198,34 +205,62 @@ def __init__(self, data, labels, name=None, path=None): ) else: raise ValueError( - "The data argument must be a numpy array, a list or a " "Pandas DataFrame." + "The data argument must be a numpy array, a list or a Pandas DataFrame, or " + "spark DataFrame if pyspark package installed." ) self._user_specified_name = name - self.data = data + self._original_data = data + self._data = None self.labels = labels self.path = path self._hash = None + @property + def data(self): + """ + Return original data if data is numpy array or pandas dataframe, + For spark dataframe, will only return the first SPARK_DATAFRAME_LIMIT rows as pandas + dataframe and emit warning. 
+ """ + if self._data is not None: + return self._data + + try: + from pyspark.sql import DataFrame as SparkDataFrame + + spark_df_type = SparkDataFrame + except ImportError: + spark_df_type = None + + if spark_df_type and isinstance(self._original_data, spark_df_type): + self._data = self._original_data.limit( + EvaluationDataset.SPARK_DATAFRAME_LIMIT + ).toPandas() + _logger.warning( + f"Specified Spark DataFrame is too large for model evaluation. Only " + f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + ) + else: + self._data = self._original_data + + return self._data + def _extract_features_and_labels(self): """ Extract features data and labels data. For spark dataframe, will only extract the first SPARK_DATAFRAME_LIMIT rows data and emit warning. """ + import numpy as np + if isinstance(self.data, np.ndarray): return self.data, self.labels else: - if not isinstance(self.data, pd.DataFrame): - data = self.data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() - _logger.warning( - f"Only the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows in the " - f"spark dataframe are examined." - ) - else: - data = self.data - feature_cols = [x for x in data.columns if x != self.labels] - return data[feature_cols], data[self.labels] + return ( + self.data.drop(self.labels, axis=1, inplace=False), + self.data[self.labels].to_numpy(), + ) @staticmethod def _array_like_obj_to_bytes(data): @@ -233,8 +268,11 @@ def _array_like_obj_to_bytes(data): Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. """ + import numpy as np + import pandas as pd + if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + ",".join(list(data.columns)).encode("UTF-8") + return data.to_numpy().tobytes() + ",".join(data.columns).encode("UTF-8") elif isinstance(data, np.ndarray): return data.tobytes() elif isinstance(data, list): @@ -250,6 +288,8 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): - first NUM_SAMPLE_ROWS_FOR_HASH rows content - last NUM_SAMPLE_ROWS_FOR_HASH rows content """ + import numpy as np + md5_gen.update(np.int64(len(data)).tobytes()) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) @@ -279,6 +319,9 @@ def hash(self): Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, dataset size and feeding them through a cheap, low-collision hash function """ + import numpy as np + import pandas as pd + if self._hash is None: md5_gen = hashlib.md5() if isinstance(self.data, np.ndarray): @@ -308,19 +351,15 @@ def _log_dataset_tag(self, client, run_id): Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will append current dataset metadata into existing tag content. 
""" - existing_dataset_metadata_str = client.get_run(run_id).data.tags.get("mlflow.datasets") - if existing_dataset_metadata_str is not None: - dataset_metadata_list = json.loads(existing_dataset_metadata_str) - else: - dataset_metadata_list = [] + existing_dataset_metadata_str = client.get_run(run_id).data.tags.get( + "mlflow.datasets", "[]" + ) + dataset_metadata_list = json.loads(existing_dataset_metadata_str) - metadata_exists = False for metadata in dataset_metadata_list: if metadata["hash"] == self.hash and metadata["name"] == self._user_specified_name: - metadata_exists = True break - - if not metadata_exists: + else: dataset_metadata_list.append(self._metadata) dataset_metadata_str = json.dumps(dataset_metadata_list) @@ -345,29 +384,15 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() - def _log_metrics(self, run_id, metrics, dataset_name): - """ - Helper method to log metrics into specified run. - """ - client = mlflow.tracking.MlflowClient() - timestamp = int(time.time() * 1000) - client.log_batch( - run_id, - metrics=[ - Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) - for key, value in metrics.items() - ], - ) - def evaluate( self, - model: "mlflow.pyfunc.PyFuncModel", + model, model_type, dataset, run_id, evaluator_config, **kwargs, - ) -> "mlflow.models.evaluation.EvaluationResult": + ): """ The abstract API to log metrics and artifacts, and return evaluation results. @@ -397,7 +422,7 @@ def list_evaluators(): return list(_model_evaluation_registry._registry.keys()) -class StartRunOrReuseActiveRun: +class _StartRunOrReuseActiveRun: """ A manager context return: - If there's an active run, return the active run id. @@ -420,12 +445,8 @@ def __enter__(self): ) return active_run_id else: - if self.user_specified_run_id is None: - raise ValueError( - "Active run does not exist, you need specify a run_id when " "evaluating." - ) self.managed_run = mlflow.start_run(run_id=self.user_specified_run_id).__enter__() - return self.user_specified_run_id + return self.managed_run.info.run_id def __exit__(self, exc_type, exc_val, exc_tb): if self.managed_run is not None: @@ -434,8 +455,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): def evaluate( model: Union[str, "mlflow.pyfunc.PyFuncModel"], - model_type, - dataset, + model_type: str, + dataset: "mlflow.models.evaluation.EvaluationDataset", run_id=None, evaluators=None, evaluator_config=None, @@ -499,11 +520,11 @@ def evaluate( pass else: raise ValueError( - "The model argument must be a URI str referring to mlflow model or " + "The model argument must be a string URI referring to an MLflow model or " "an instance of `mlflow.pyfunc.PyFuncModel`." 
) - with StartRunOrReuseActiveRun(run_id) as actual_run_id: + with _StartRunOrReuseActiveRun(run_id) as actual_run_id: client = mlflow.tracking.MlflowClient() dataset._log_dataset_tag(client, actual_run_id) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py deleted file mode 100644 index 27b8034cc70cb..0000000000000 --- a/mlflow/models/evaluation/default_evaluator.py +++ /dev/null @@ -1,246 +0,0 @@ -import mlflow -from mlflow.models.evaluation.base import ( - ModelEvaluator, - EvaluationMetrics, - EvaluationArtifact, - EvaluationResult, -) -from mlflow.utils.file_utils import TempDir -from mlflow.tracking.artifact_utils import get_artifact_uri -from sklearn import metrics as sk_metrics -import matplotlib.pyplot as pyplot -import scikitplot -import shap -import math -import pandas as pd - -shap.initjs() - -""" -[P0] Accuracy: Calculates how often predictions equal labels. -[P0] BinaryCrossentropy: Computes the crossentropy metric between the labels and predictions. -[P0] Hinge: Computes the hinge metric between y_true and y_pred. -[P0] Sum: Computes the (weighted) sum of the given values. -[P0] Mean: Computes the (weighted) mean of the given values. -[P0] ExampleCount: Computes the total number of evaluation examples. -[P0] MeanAbsoluteError: Computes the mean absolute error between the labels and predictions. -[P0] MeanSquaredError: Computes the mean squared error between y_true and y_pred. -[P0] RootMeanSquaredError: Computes root mean squared error metric between y_true and y_pred. - -[P0] TrueNegatives: Calculates the number of true negatives. -[P0] TruePositives: Calculates the number of true positives. -[P0] FalseNegatives: Calculates the number of false negatives. -[P0] FalsePositives: Calculates the number of false positives. -https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix - -[P0] Precision: Computes the precision of the predictions with respect to the labels. -[P0] Recall: Computes the recall of the predictions with respect to the labels. -[P0] AUC: Approximates the AUC (Area under the curve) of the ROC or PR curves. 
-[P0] F1 Score: 2*precision*recall / (precision+recall) - -[P0] BinaryClassConfusionMatrix - -Plots -[P0] Confusion matrix -[P0] Interactive ROC curve with metrics (TP/TN/FP/FN/Acc/F1/AUC), binary classification -[P0] Lift chart - -Global explainability -[P0] Model built-in feature importance (supported models) -[P0] SHAP explainers - [P0] Summary plot -""" - -from PIL.Image import Image, open as open_image - - -class ImageEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): - self._content.save(output_artifact_path) - - def _load_content_from_file(self, local_artifact_path): - self._content = open_image(local_artifact_path) - return self._content - - -class CsvEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): - self._content.to_csv(output_artifact_path, index=False) - - def _load_content_from_file(self, local_artifact_path): - self._content = pd.read_csv(local_artifact_path) - return self._content - - -class DefaultEvaluator(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): - return model_type in ["classifier", "regressor"] - - def _log_image_artifact( - self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name - ): - client = mlflow.tracking.MlflowClient() - pyplot.clf() - do_plot() - artifact_file_name = f"{artifact_name}_on_{dataset_name}.png" - artifact_file_local_path = temp_dir.path(artifact_file_name) - pyplot.savefig(artifact_file_local_path) - client.log_artifact(run_id, artifact_file_local_path) - artifact = ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) - artifact.load(artifact_file_local_path) - artifacts[artifact_file_name] = artifact - - def _log_pandas_df_artifact( - self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name - ): - client = mlflow.tracking.MlflowClient() - artifact_file_name = f"{artifact_name}_on_{dataset_name}.csv" - artifact_file_local_path = temp_dir.path(artifact_file_name) - pandas_df.to_csv(artifact_file_local_path, index=False) - client.log_artifact(run_id, artifact_file_local_path) - artifact = CsvEvaluationArtifact( - uri=get_artifact_uri(run_id, artifact_file_name), - content=pandas_df, - ) - artifact.load(artifact_file_local_path) - artifacts[artifact_file_name] = artifact - - def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id): - explainer = shap.Explainer(model._model_impl, X) - shap_values = explainer(X) - - def plot_summary(): - shap.plots.beeswarm(shap_values, show=False) - - self._log_image_artifact( - artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name - ) - - def plot_feature_importance(): - shap.plots.bar(shap_values, show=False) - - self._log_image_artifact( - artifacts, - temp_dir, - plot_feature_importance, - run_id, - "shap_feature_importance", - dataset_name, - ) - - def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): - # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 - label_list = sorted(list(set(y))) - assert label_list[0] >= 0, "Evaluation dataset labels must be positive integers." - max_label = label_list[-1] - num_classes = max_label + 1 - - y_pred = model.predict(X) - - is_binomial = num_classes <= 2 - - metrics = EvaluationMetrics() - artifacts = {} - metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) - metrics["example_count"] = len(X) - - # TODO: sum/mean on what data ? - # [P0] Sum: Computes the (weighted) sum of the given values. 
- # [P0] Mean: Computes the (weighted) mean of the given values. - - if is_binomial: - if hasattr(model, "predict_proba"): - y_probs = model.predict_proba(X) - y_prob = y_probs[:, 1] - else: - y_probs = None - y_prob = None - - confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) - tn, fp, fn, tp = confusion_matrix.ravel() - metrics["true_negatives"] = tn - metrics["false_positives"] = fp - metrics["false_negatives"] = fn - metrics["true_positives"] = tp - metrics["recall"] = sk_metrics.recall_score(y, y_pred) - metrics["precision"] = sk_metrics.precision_score(y, y_pred) - metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) - - # TODO: - # compute hinge loss, this requires calling decision_function of the model - # e.g., see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function - - if y_probs is not None: - metrics["roc_auc"] = sk_metrics.roc_auc_score(y, y_prob) - fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_prob) - roc_curve_pandas_df = pd.DataFrame( - {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} - ) - self._log_pandas_df_artifact( - artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name - ) - - roc_auc = sk_metrics.auc(fpr, tpr) - metrics["precision_recall_auc"] = roc_auc - - def plot_roc_curve(): - sk_metrics.RocCurveDisplay( - fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator" - ).plot() - - self._log_image_artifact( - artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name - ) - - def plot_lift_curve(): - scikitplot.metrics.plot_lift_curve(y, y_probs) - - self._log_image_artifact( - artifacts, temp_dir, plot_lift_curve, run_id, "lift_curve", dataset_name - ) - - def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred, normalize='all') - - self._log_image_artifact( - artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name - ) - - self._log_metrics(run_id, metrics, dataset_name) - self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) - - return EvaluationResult(metrics, artifacts) - - def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): - metrics = EvaluationMetrics() - artifacts = {} - y_pred = model.predict(X) - metrics["example_count"] = len(X) - metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) - metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) - metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) - self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) - self._log_metrics(run_id, metrics, dataset_name) - return EvaluationResult(metrics, artifacts) - - def evaluate( - self, - model: "mlflow.pyfunc.PyFuncModel", - model_type, - dataset, - run_id, - evaluator_config, - **kwargs, - ): - with TempDir() as temp_dir: - X, y = dataset._extract_features_and_labels() - if model_type == "classifier": - return self._evaluate_classifier( - temp_dir, model, X, y, dataset.name, run_id, evaluator_config - ) - elif model_type == "regressor": - return self._evaluate_regressor( - temp_dir, model, X, y, dataset.name, run_id, evaluator_config - ) - else: - raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index 1822e313460ea..9e7f027f0496e 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ 
b/mlflow/models/evaluation/evaluator_registry.py @@ -48,9 +48,6 @@ def get_evaluator(self, evaluator_name): def register_evaluators(module): - from mlflow.models.evaluation.default_evaluator import DefaultEvaluator - - module._model_evaluation_registry.register("default", DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 91e1b534bfe9e..e37fece833b11 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -156,7 +156,7 @@ def test_dataset_name(): d1 = EvaluationDataset(data=X, labels=y, name="a1") assert d1.name == "a1" d2 = EvaluationDataset(data=X, labels=y) - d2.name == d2.hash + assert d2.name == d2.hash def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): @@ -186,3 +186,12 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): iris_dataset._metadata, iris_pandas_df_dataset._metadata, ] + + +# test hash with data change +# test hash logged in evaluator +# test artifact save/load +# test Start/Create Run +# test dataset with spark +# test extract features label +# test evalute correct args passing to Evaluator, multi evals, config, etc. diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 700b705a37124..2a048ee17c162 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -7,7 +7,9 @@ EvaluationDataset, ) from mlflow.tracking.artifact_utils import get_artifact_uri +from mlflow.entities import Metric, RunTag from sklearn import metrics as sk_metrics +import time import numpy as np import pandas as pd import io @@ -17,7 +19,7 @@ class Array2DEvaluationArtifact(EvaluationArtifact): def save(self, output_artifact_path): pd.DataFrame(self._content).to_csv(output_artifact_path, index=False) - def load(self, local_artifact_path): + def _load_content_from_file(self, local_artifact_path): pdf = pd.read_csv(local_artifact_path) self._content = pdf.to_numpy() return self._content @@ -27,6 +29,20 @@ class DummyEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] + def _log_metrics(self, run_id, metrics, dataset_name): + """ + Helper method to log metrics into specified run. 
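# Illustrative sketch (not part of the patch series): with the hard-coded default evaluator gone,
# evaluators are discovered purely through entry points, so a plugin package like the test plugin
# could declare its evaluator in setup.py as below. The entry-point group and evaluator name are
# assumptions made for this example only.
from setuptools import setup

setup(
    name="mlflow-test-plugin",
    packages=["mlflow_test_plugin"],
    entry_points={
        "mlflow.model_evaluator": [
            "dummy_evaluator = mlflow_test_plugin.dummy_evaluator:DummyEvaluator",
        ],
    },
)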
+ """ + client = mlflow.tracking.MlflowClient() + timestamp = int(time.time() * 1000) + client.log_batch( + run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics.items() + ], + ) + def evaluate( self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs ) -> EvaluationResult: From 21538cbd81016a94af8d20acf19ade43ee8faee2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 16:44:25 +0800 Subject: [PATCH 042/120] fix doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 7288a70cf1ff8..4a88c5a6bca31 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -131,14 +131,14 @@ def save(self, path): artifact.save(os.path.join(artifacts_dir, artifact_name)) @property - def metrics(self) -> EvaluationMetrics: + def metrics(self) -> "mlflow.models.evaluation.EvaluationMetrics": """ A dictionary mapping scalar metric names to scalar metric values """ return self._metrics @property - def artifacts(self) -> Dict[str, EvaluationArtifact]: + def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: """ A dictionary mapping standardized artifact names (e.g. "roc_data") to artifact content and location information @@ -151,7 +151,8 @@ def artifacts(self) -> Dict[str, EvaluationArtifact]: class EvaluationDataset: """ - An input dataset for model evaluation. This is intended for use with the `mlflow.evaluate()` + An input dataset for model evaluation. This is intended for use with the + :py:func:`mlflow.evaluate()` API. """ @@ -399,15 +400,16 @@ def evaluate( :param model: A pyfunc model instance. :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). - :param dataset: An instance of `EvaluationDataset` containing features - and labels (optional) for model evaluation. + :param dataset: An instance of :py:class:`mlflow.models.evaluation.EvaluationDataset` + containing features and labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. :param evaluator_config: A dictionary of additional configurations for the evaluator. :param **kwargs: For forwards compatibility, a placeholder for additional arguments that may be added to the evaluation interface in the future. - :return: An `EvaluationResult` instance containing evaluation results. + :return: An :py:class:`mlflow.models.evaluation.EvaluationResult` instance containing + evaluation results. """ raise NotImplementedError() @@ -470,8 +472,8 @@ def evaluate( :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. - :param dataset: An instance of `EvaluationDataset` containing features - labels (optional) for model evaluation. + :param dataset: An instance of :py:class:`mlflow.models.evaluation.base.EvaluationDataset` + containing features labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. If unspecified, behavior depends on the specified `evaluator`. When `run_id` is unspecified, the default evaluator logs @@ -486,7 +488,8 @@ def evaluate( to the evaluator. If multiple evaluators are specified, each configuration should be supplied as a nested dictionary whose key is the evaluator name. 
- :return: An `EvaluationResult` instance containing evaluation results. + :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing + evaluation results. """ # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry From 812f0df2514b49f5abf0eb3701c53d8d9e080577 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 17:03:11 +0800 Subject: [PATCH 043/120] fix doc Signed-off-by: Weichen Xu --- docs/source/python_api/mlflow.models.evaluation.base.rst | 8 ++++++++ docs/source/python_api/mlflow.models.evaluation.rst | 8 ++++++++ mlflow/models/evaluation/base.py | 8 ++++---- 3 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 docs/source/python_api/mlflow.models.evaluation.base.rst create mode 100644 docs/source/python_api/mlflow.models.evaluation.rst diff --git a/docs/source/python_api/mlflow.models.evaluation.base.rst b/docs/source/python_api/mlflow.models.evaluation.base.rst new file mode 100644 index 0000000000000..36b8a49222645 --- /dev/null +++ b/docs/source/python_api/mlflow.models.evaluation.base.rst @@ -0,0 +1,8 @@ +mlflow.models.evaluation.base +============================= + +.. automodule:: mlflow.models.evaluation.base + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/python_api/mlflow.models.evaluation.rst b/docs/source/python_api/mlflow.models.evaluation.rst new file mode 100644 index 0000000000000..4034f6a5d0c4c --- /dev/null +++ b/docs/source/python_api/mlflow.models.evaluation.rst @@ -0,0 +1,8 @@ +mlflow.models.evaluation +======================== + +.. automodule:: mlflow.models.evaluation + :members: + :undoc-members: + :show-inheritance: + diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 4a88c5a6bca31..8dc6d563c8307 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -152,7 +152,7 @@ def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: class EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the - :py:func:`mlflow.evaluate()` + :py:func:`mlflow.models.evaluation.evaluate()` API. """ @@ -377,7 +377,7 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: "classifier", …). :param evaluator_config: A dictionary of additional configurations for the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional + :param kwargs: For forwards compatibility, a placeholder for additional arguments that may be added to the evaluation interface in the future. :return: True if the evaluator can evaluate the specified model on the @@ -405,7 +405,7 @@ def evaluate( :param run_id: The ID of the MLflow Run to which to log results. :param evaluator_config: A dictionary of additional configurations for the evaluator. - :param **kwargs: For forwards compatibility, a placeholder for additional + :param kwargs: For forwards compatibility, a placeholder for additional arguments that may be added to the evaluation interface in the future. :return: An :py:class:`mlflow.models.evaluation.EvaluationResult` instance containing @@ -472,7 +472,7 @@ def evaluate( :param model_type: A string describing the model type. The default evaluator supports "regressor" and "classifier" as model types. 
- :param dataset: An instance of :py:class:`mlflow.models.evaluation.base.EvaluationDataset` + :param dataset: An instance of :py:class:`mlflow.models.evaluation.EvaluationDataset` containing features labels (optional) for model evaluation. :param run_id: The ID of the MLflow Run to which to log results. If unspecified, behavior depends on the specified `evaluator`. From cc2ac8e0aee36d918bc42416ec4c162b0b1182c3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 17:07:25 +0800 Subject: [PATCH 044/120] update import Signed-off-by: Weichen Xu --- docs/source/python_api/mlflow.models.evaluation.base.rst | 8 -------- tests/models/test_evaluation.py | 9 --------- 2 files changed, 17 deletions(-) delete mode 100644 docs/source/python_api/mlflow.models.evaluation.base.rst diff --git a/docs/source/python_api/mlflow.models.evaluation.base.rst b/docs/source/python_api/mlflow.models.evaluation.base.rst deleted file mode 100644 index 36b8a49222645..0000000000000 --- a/docs/source/python_api/mlflow.models.evaluation.base.rst +++ /dev/null @@ -1,8 +0,0 @@ -mlflow.models.evaluation.base -============================= - -.. automodule:: mlflow.models.evaluation.base - :members: - :undoc-members: - :show-inheritance: - diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index e37fece833b11..60976ca38db2e 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -186,12 +186,3 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): iris_dataset._metadata, iris_pandas_df_dataset._metadata, ] - - -# test hash with data change -# test hash logged in evaluator -# test artifact save/load -# test Start/Create Run -# test dataset with spark -# test extract features label -# test evalute correct args passing to Evaluator, multi evals, config, etc. From 226d54b6349ba3cdd62b2ca7c199b8116f7b1d22 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 17:14:24 +0800 Subject: [PATCH 045/120] fix doc Signed-off-by: Weichen Xu --- docs/source/python_api/mlflow.models.evaluation.base.rst | 7 +++++++ mlflow/__init__.py | 1 + mlflow/models/__init__.py | 1 - 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 docs/source/python_api/mlflow.models.evaluation.base.rst diff --git a/docs/source/python_api/mlflow.models.evaluation.base.rst b/docs/source/python_api/mlflow.models.evaluation.base.rst new file mode 100644 index 0000000000000..501303a39735a --- /dev/null +++ b/docs/source/python_api/mlflow.models.evaluation.base.rst @@ -0,0 +1,7 @@ +mlflow.models.evaluation.base +============================= + +.. 
automodule:: mlflow.models.evaluation.base + :members: + :undoc-members: + :show-inheritance: diff --git a/mlflow/__init__.py b/mlflow/__init__.py index 0758b9714dbde..648c0ee693060 100644 --- a/mlflow/__init__.py +++ b/mlflow/__init__.py @@ -193,4 +193,5 @@ "set_registry_uri", "list_run_infos", "autolog", + "evaluate", ] + _model_flavors_supported diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index 20dd48fe0222c..b6fd6083d7db2 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -41,7 +41,6 @@ "infer_signature", "FlavorBackend", "infer_pip_requirements", - "evaluate", ] except ImportError: pass From c20ccdc2cb976142b39ecc899425a9cc315801d8 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 1 Dec 2021 22:54:23 +0800 Subject: [PATCH 046/120] update hash algo Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 24 ++++++++++++++++++++---- tests/models/test_evaluation.py | 4 ++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 8dc6d563c8307..06fa29d012f0b 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -10,6 +10,7 @@ from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string import logging +import struct _logger = logging.getLogger(__name__) @@ -263,21 +264,33 @@ def _extract_features_and_labels(self): self.data[self.labels].to_numpy(), ) + @staticmethod + def _convert_uint64_ndarray_to_bytes(array): + assert len(array.shape) == 1 + # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings + return struct.pack(f'>{array.size}Q', *array) + @staticmethod def _array_like_obj_to_bytes(data): """ Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. 
""" + from pandas.util import hash_pandas_object, hash_array import numpy as np import pandas as pd if isinstance(data, pd.DataFrame): - return data.to_numpy().tobytes() + ",".join(data.columns).encode("UTF-8") + return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) + \ + ",".join(data.columns).encode("UTF-8") elif isinstance(data, np.ndarray): - return data.tobytes() + return EvaluationDataset._convert_uint64_ndarray_to_bytes( + hash_array(data.flatten(order='C')) + ) elif isinstance(data, list): - return np.array(data).tobytes() + return EvaluationDataset._convert_uint64_ndarray_to_bytes( + hash_array(np.array(data)) + ) else: raise ValueError("Unsupported data type.") @@ -291,7 +304,10 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): """ import numpy as np - md5_gen.update(np.int64(len(data)).tobytes()) + len_bytes = EvaluationDataset._convert_uint64_ndarray_to_bytes( + np.array([len(data)], dtype='uint64') + ) + md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) else: diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 60976ca38db2e..c4ddbd4f229fd 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -160,8 +160,8 @@ def test_dataset_name(): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): - assert iris_dataset.hash == "49a04c127e5441e0f27e63a325b5fa69" - assert iris_pandas_df_dataset.hash == "d6770fd5fffe651cb95e965854920df9" + assert iris_dataset.hash == "c7417e63a9ce038a32f37ecd7fb829f6" + assert iris_pandas_df_dataset.hash == "e796a0b1e0bef0fc06b4b1ad62e3ea63" def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): From 68842fe4a86023488ae1f5d4950a90b18580ca34 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 2 Dec 2021 18:13:53 +0800 Subject: [PATCH 047/120] update import Signed-off-by: Weichen Xu --- mlflow/models/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index b6fd6083d7db2..b1f73ec16abe6 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -21,26 +21,29 @@ For details, see `MLflow Models <../models.html>`_. 
""" +from .model import Model +from .flavor_backend import FlavorBackend +from ..utils.environment import infer_pip_requirements from .evaluation import evaluate __all__ = [ + "Model", + "FlavorBackend", + "infer_pip_requirements", "evaluate", ] + +# Under skinny-mlflow requirements, the following packages cannot be imported +# because of lack of numpy/pandas library, so wrap them with try...except block try: - from .model import Model - from .flavor_backend import FlavorBackend from .signature import ModelSignature, infer_signature from .utils import ModelInputExample - from ..utils.environment import infer_pip_requirements __all__ += [ - "Model", "ModelSignature", "ModelInputExample", "infer_signature", - "FlavorBackend", - "infer_pip_requirements", ] except ImportError: pass From 755639bc1fc78b911f98a515308e0f7d77a30e91 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 2 Dec 2021 18:24:22 +0800 Subject: [PATCH 048/120] address comment Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 43 +++++++++++++------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 06fa29d012f0b..8d84ff4886c0d 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -3,6 +3,7 @@ import hashlib import json import os +from contextlib import contextmanager from mlflow.exceptions import MlflowException from mlflow.utils.file_utils import TempDir from mlflow.entities import Metric, RunTag @@ -440,35 +441,25 @@ def list_evaluators(): return list(_model_evaluation_registry._registry.keys()) -class _StartRunOrReuseActiveRun: +@contextmanager +def _start_run_or_reuse_active_run(run_id): """ A manager context return: - If there's an active run, return the active run id. - - otherwise start a mflow run with the specified run_id. + - otherwise start a mflow run with the specified run_id, + if specified run_id is None, start a new run. """ - - def __init__(self, run_id): - self.user_specified_run_id = run_id - self.managed_run = None - - def __enter__(self): - if mlflow.active_run() is not None: - active_run_id = mlflow.active_run().info.run_id - if ( - self.user_specified_run_id is not None - and self.user_specified_run_id != active_run_id - ): - raise ValueError( - "An active run exists, you cannot specify another run_id when " "evaluating." - ) - return active_run_id - else: - self.managed_run = mlflow.start_run(run_id=self.user_specified_run_id).__enter__() - return self.managed_run.info.run_id - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.managed_run is not None: - return self.managed_run.__exit__(exc_type, exc_val, exc_tb) + active_run = mlflow.active_run() + if not active_run: + # Note `mlflow.start_run` throws if `run_id` is not found. + with mlflow.start_run(run_id=run_id) as run: + yield run.info.run_id + else: + if run_id and active_run.info.run_id != run_id: + raise ValueError( + "An active run exists, you cannot specify another run_id when " "evaluating." + ) + yield active_run.info.run_id def evaluate( @@ -543,7 +534,7 @@ def evaluate( "an instance of `mlflow.pyfunc.PyFuncModel`." 
) - with _StartRunOrReuseActiveRun(run_id) as actual_run_id: + with _start_run_or_reuse_active_run(run_id) as actual_run_id: client = mlflow.tracking.MlflowClient() dataset._log_dataset_tag(client, actual_run_id) From 73cd7045cde9786246f19a064ce400f0729e45a0 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 2 Dec 2021 22:43:20 +0800 Subject: [PATCH 049/120] add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +- tests/models/test_evaluation.py | 79 ++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 8d84ff4886c0d..5f15f84788932 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -47,7 +47,7 @@ def load(self, local_artifact_path=None): otherwise load artifact content from specified path. then assign the loaded content to `self._content`, and return the loaded content. """ - if local_artifact_path is None: + if local_artifact_path is not None: self._load_content_from_file(local_artifact_path) else: with TempDir() as temp_dir: @@ -101,7 +101,7 @@ def load(cls, path): artifacts_dir = os.path.join(path, "artifacts") - for artifact_name, meta in artifacts_metadata: + for artifact_name, meta in artifacts_metadata.items(): uri = meta["uri"] ArtifactCls = _get_class_from_string(meta["class_name"]) artifact = ArtifactCls(uri=uri) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index c4ddbd4f229fd..3731b6e19b92f 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,12 +1,16 @@ import mlflow -from mlflow.models.evaluation import evaluate, EvaluationDataset +from mlflow.models.evaluation import evaluate, EvaluationDataset, EvaluationResult import sklearn +import os import sklearn.datasets import sklearn.linear_model import pytest import numpy as np import pandas as pd +from unittest import mock +from mlflow.utils.file_utils import TempDir +from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from sklearn.metrics import ( accuracy_score, @@ -15,6 +19,8 @@ mean_squared_error, ) +from pyspark.sql import SparkSession + from mlflow.tracking.artifact_utils import get_artifact_uri import json @@ -41,6 +47,13 @@ def get_local_artifact_path(run_id, artifact_path): return get_artifact_uri(run_id, artifact_path).replace("file://", "") +@pytest.fixture(scope="module") +def spark_session(): + session = SparkSession.builder.master("local[*]").getOrCreate() + yield session + session.stop() + + @pytest.fixture(scope="module") def regressor_model(): X, y = get_diabetes_dataset() @@ -108,15 +121,49 @@ def test_classifier_evaluate(classifier_model, iris_dataset): assert saved_artifacts == [artifact_name] assert eval_result.metrics == expected_metrics - returned_confusion_matrix_artifact = eval_result.artifacts[artifact_name] - assert np.array_equal(returned_confusion_matrix_artifact.content, expected_artifact) - assert returned_confusion_matrix_artifact.uri == get_artifact_uri( + confusion_matrix_artifact = eval_result.artifacts[artifact_name] + assert np.array_equal(confusion_matrix_artifact.content, expected_artifact) + assert confusion_matrix_artifact.uri == get_artifact_uri( run.info.run_id, artifact_name ) assert np.array_equal( - returned_confusion_matrix_artifact.load(saved_artifact_path), expected_artifact + confusion_matrix_artifact.load(saved_artifact_path), expected_artifact ) + with TempDir() as temp_dir: + temp_dir_path = temp_dir.path() + 
eval_result.save(temp_dir_path) + + with open(temp_dir.path('metrics.json'), 'r') as fp: + assert json.load(fp) == eval_result.metrics + + with open(temp_dir.path('artifacts_metadata.json'), 'r') as fp: + assert json.load(fp) == { + 'confusion_matrix_on_iris_dataset.csv': { + 'uri': confusion_matrix_artifact.uri, + 'class_name': 'mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact' + } + } + + assert os.listdir(temp_dir.path('artifacts')) == ['confusion_matrix_on_iris_dataset.csv'] + + loaded_eval_result = EvaluationResult.load(temp_dir_path) + assert loaded_eval_result.metrics == eval_result.metrics + loaded_confusion_matrix_artifact = loaded_eval_result.artifacts[artifact_name] + assert confusion_matrix_artifact.uri == loaded_confusion_matrix_artifact.uri + assert np.array_equal( + confusion_matrix_artifact.content, + loaded_confusion_matrix_artifact.content, + ) + + new_confusion_matrix_artifact = \ + Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri) + new_confusion_matrix_artifact.load() + assert np.array_equal( + confusion_matrix_artifact.content, + new_confusion_matrix_artifact.content, + ) + def test_regressor_evaluate(regressor_model, iris_dataset): with mlflow.start_run() as run: @@ -164,6 +211,28 @@ def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): assert iris_pandas_df_dataset.hash == "e796a0b1e0bef0fc06b4b1ad62e3ea63" +def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): + X1, y1 = iris_dataset._extract_features_and_labels() + assert np.array_equal(X1, iris_dataset.data) + assert np.array_equal(y1, iris_dataset.labels) + + X2, y2 = iris_pandas_df_dataset._extract_features_and_labels() + assert list(X2.columns) == ['f1', 'f2'] + assert np.array_equal(X2['f1'], X1[:, 0]) + assert np.array_equal(X2['f2'], X1[:, 1]) + assert np.array_equal(y2, y1) + + +def test_spark_df_dataset(spark_session): + spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ['f1', 'f2', 'y']) + with mock.patch.object(EvaluationDataset, 'SPARK_DATAFRAME_LIMIT', 5): + dataset = EvaluationDataset(spark_df, 'y') + assert list(dataset.data.columns) == ['f1', 'f2', 'y'] + assert list(dataset.data['f1']) == [1.0] * 5 + assert list(dataset.data['f2']) == [2.0] * 5 + assert list(dataset.data['y']) == [3.0] * 5 + + def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): with mlflow.start_run() as run: client = mlflow.tracking.MlflowClient() From 010f225b67bbd1c583b9aa631a300141714e59d4 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 2 Dec 2021 22:46:34 +0800 Subject: [PATCH 050/120] fix lint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 15 ++++++----- tests/models/test_evaluation.py | 43 ++++++++++++++------------------ 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 5f15f84788932..aab2b93960a96 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -269,7 +269,7 @@ def _extract_features_and_labels(self): def _convert_uint64_ndarray_to_bytes(array): assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings - return struct.pack(f'>{array.size}Q', *array) + return struct.pack(f">{array.size}Q", *array) @staticmethod def _array_like_obj_to_bytes(data): @@ -282,16 +282,15 @@ def _array_like_obj_to_bytes(data): import pandas as pd if isinstance(data, pd.DataFrame): - return 
EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) + \ - ",".join(data.columns).encode("UTF-8") + return EvaluationDataset._convert_uint64_ndarray_to_bytes( + hash_pandas_object(data) + ) + ",".join(data.columns).encode("UTF-8") elif isinstance(data, np.ndarray): return EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_array(data.flatten(order='C')) + hash_array(data.flatten(order="C")) ) elif isinstance(data, list): - return EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_array(np.array(data)) - ) + return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_array(np.array(data))) else: raise ValueError("Unsupported data type.") @@ -306,7 +305,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): import numpy as np len_bytes = EvaluationDataset._convert_uint64_ndarray_to_bytes( - np.array([len(data)], dtype='uint64') + np.array([len(data)], dtype="uint64") ) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 3731b6e19b92f..8b4ea3cf84ce4 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -123,29 +123,25 @@ def test_classifier_evaluate(classifier_model, iris_dataset): assert eval_result.metrics == expected_metrics confusion_matrix_artifact = eval_result.artifacts[artifact_name] assert np.array_equal(confusion_matrix_artifact.content, expected_artifact) - assert confusion_matrix_artifact.uri == get_artifact_uri( - run.info.run_id, artifact_name - ) - assert np.array_equal( - confusion_matrix_artifact.load(saved_artifact_path), expected_artifact - ) + assert confusion_matrix_artifact.uri == get_artifact_uri(run.info.run_id, artifact_name) + assert np.array_equal(confusion_matrix_artifact.load(saved_artifact_path), expected_artifact) with TempDir() as temp_dir: temp_dir_path = temp_dir.path() eval_result.save(temp_dir_path) - with open(temp_dir.path('metrics.json'), 'r') as fp: + with open(temp_dir.path("metrics.json"), "r") as fp: assert json.load(fp) == eval_result.metrics - with open(temp_dir.path('artifacts_metadata.json'), 'r') as fp: + with open(temp_dir.path("artifacts_metadata.json"), "r") as fp: assert json.load(fp) == { - 'confusion_matrix_on_iris_dataset.csv': { - 'uri': confusion_matrix_artifact.uri, - 'class_name': 'mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact' + "confusion_matrix_on_iris_dataset.csv": { + "uri": confusion_matrix_artifact.uri, + "class_name": "mlflow_test_plugin.dummy_evaluator.Array2DEvaluationArtifact", } } - assert os.listdir(temp_dir.path('artifacts')) == ['confusion_matrix_on_iris_dataset.csv'] + assert os.listdir(temp_dir.path("artifacts")) == ["confusion_matrix_on_iris_dataset.csv"] loaded_eval_result = EvaluationResult.load(temp_dir_path) assert loaded_eval_result.metrics == eval_result.metrics @@ -156,8 +152,7 @@ def test_classifier_evaluate(classifier_model, iris_dataset): loaded_confusion_matrix_artifact.content, ) - new_confusion_matrix_artifact = \ - Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri) + new_confusion_matrix_artifact = Array2DEvaluationArtifact(uri=confusion_matrix_artifact.uri) new_confusion_matrix_artifact.load() assert np.array_equal( confusion_matrix_artifact.content, @@ -217,20 +212,20 @@ def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): assert np.array_equal(y1, iris_dataset.labels) X2, y2 = iris_pandas_df_dataset._extract_features_and_labels() - assert list(X2.columns) == 
['f1', 'f2'] - assert np.array_equal(X2['f1'], X1[:, 0]) - assert np.array_equal(X2['f2'], X1[:, 1]) + assert list(X2.columns) == ["f1", "f2"] + assert np.array_equal(X2["f1"], X1[:, 0]) + assert np.array_equal(X2["f2"], X1[:, 1]) assert np.array_equal(y2, y1) def test_spark_df_dataset(spark_session): - spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ['f1', 'f2', 'y']) - with mock.patch.object(EvaluationDataset, 'SPARK_DATAFRAME_LIMIT', 5): - dataset = EvaluationDataset(spark_df, 'y') - assert list(dataset.data.columns) == ['f1', 'f2', 'y'] - assert list(dataset.data['f1']) == [1.0] * 5 - assert list(dataset.data['f2']) == [2.0] * 5 - assert list(dataset.data['y']) == [3.0] * 5 + spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) + with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): + dataset = EvaluationDataset(spark_df, "y") + assert list(dataset.data.columns) == ["f1", "f2", "y"] + assert list(dataset.data["f1"]) == [1.0] * 5 + assert list(dataset.data["f2"]) == [2.0] * 5 + assert list(dataset.data["y"]) == [3.0] * 5 def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): From e2f9c898b8139f832dd483545d4a036b3069b662 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 3 Dec 2021 14:28:00 +0800 Subject: [PATCH 051/120] add tests Signed-off-by: Weichen Xu --- tests/models/test_evaluation.py | 98 ++++++++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 15 deletions(-) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 8b4ea3cf84ce4..f3f72ba610d2e 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,6 +1,8 @@ import mlflow -from mlflow.models.evaluation import evaluate, EvaluationDataset, EvaluationResult +from mlflow.models.evaluation import \ + evaluate, EvaluationDataset, EvaluationResult, ModelEvaluator, EvaluationArtifact, \ + EvaluationMetrics import sklearn import os import sklearn.datasets @@ -11,6 +13,7 @@ from unittest import mock from mlflow.utils.file_utils import TempDir from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact +from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry from sklearn.metrics import ( accuracy_score, @@ -55,19 +58,29 @@ def spark_session(): @pytest.fixture(scope="module") -def regressor_model(): +def regressor_model_uri(): X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) - return reg + + with mlflow.start_run() as run: + mlflow.sklearn.log_model(reg, "reg_model") + regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") + + return regressor_model_uri @pytest.fixture(scope="module") -def classifier_model(): +def classifier_model_uri(): X, y = get_iris() clf = sklearn.linear_model.LogisticRegression() clf.fit(X, y) - return clf + + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "clf_model") + classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") + + return classifier_model_uri @pytest.fixture(scope="module") @@ -86,11 +99,7 @@ def iris_pandas_df_dataset(): return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model, iris_dataset): - with mlflow.start_run() as run: - mlflow.sklearn.log_model(classifier_model, "clf_model") - classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") - +def test_classifier_evaluate(classifier_model_uri, iris_dataset): y_true = iris_dataset.labels 
classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) y_pred = classifier_model.predict(iris_dataset.data) @@ -160,11 +169,7 @@ def test_classifier_evaluate(classifier_model, iris_dataset): ) -def test_regressor_evaluate(regressor_model, iris_dataset): - with mlflow.start_run() as run: - mlflow.sklearn.log_model(regressor_model, "reg_model") - regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") - +def test_regressor_evaluate(regressor_model_uri, iris_dataset): y_true = iris_dataset.labels regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) y_pred = regressor_model.predict(iris_dataset.data) @@ -250,3 +255,66 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): iris_dataset._metadata, iris_pandas_df_dataset._metadata, ] + + +class TestEvauator1(ModelEvaluator): + pass + + +class TestEvauator2(ModelEvaluator): + pass + + +class TestArtifact1(EvaluationArtifact): + pass + + +class TestArtifact2(EvaluationArtifact): + pass + + +def test_evaluator_interface(classifier_model_uri, iris_dataset): + + with mock.patch.object( + _model_evaluation_registry, + '_registry', {'test_evaluator1': TestEvauator1, 'test_evaluator2': TestEvauator2} + ): + with mlflow.start_run() as run: + mlflow.sklearn.log_model(classifier_model_uri, "clf_model") + classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") + + evaluator1_config = {'eval1_confg_a': 3, 'eval1_confg_b': 4} + evaluator1_return_value = EvaluationResult( + metrics=EvaluationMetrics({'m1': 5, 'm2': 6}), + artifacts={'a1': TestArtifact1(), 'a2': TestArtifact2()} + ) + with mock.patch.object( + TestEvauator1, 'can_evaluate', return_value=False + ) as mock_can_evaluate, mock.patch.object( + TestEvauator1, 'evaluate', return_value=evaluator1_return_value + ) as mock_evaluate: + with mlflow.start_run() as run: + evaluate( + classifier_model_uri, 'classifier', iris_dataset, + run_id=None, evaluators='test_evaluator1', evaluator_config=evaluator1_config) + mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) + mock_evaluate.assert_not_called() + with mock.patch.object( + TestEvauator1, 'can_evaluate', return_value=True + ) as mock_can_evaluate, mock.patch.object( + TestEvauator1, 'evaluate' + ) as mock_evaluate: + classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + with mlflow.start_run() as run: + eval1_result = evaluate( + classifier_model, 'classifier', iris_dataset, + run_id=None, evaluators='test_evaluator1', evaluator_config=evaluator1_config) + mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) + mock_evaluate.assert_called_once_with( + classifier_model, 'classifier', iris_dataset, run.info.run_id, + evaluator1_config + ) + assert eval1_result.metrics == evaluator1_return_value.metrics + assert eval1_result.artifacts == evaluator1_return_value.artifacts + + From b1b34f8738eb9f8bdc5f268339c03ecfd5244b3e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 3 Dec 2021 14:58:23 +0800 Subject: [PATCH 052/120] add more tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +- tests/models/test_evaluation.py | 115 ++++++++++++++++++++++++++----- 2 files changed, 97 insertions(+), 22 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index aab2b93960a96..bdee56ea542c9 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -539,9 +539,7 @@ def evaluate( eval_results = [] for evaluator_name in evaluators: - config = 
evaluator_config.get(evaluator_name) - if config is None: - config = {} + config = evaluator_config.get(evaluator_name) or {} try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) except MlflowException: diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index f3f72ba610d2e..7f7e8f96ba765 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -3,6 +3,7 @@ from mlflow.models.evaluation import \ evaluate, EvaluationDataset, EvaluationResult, ModelEvaluator, EvaluationArtifact, \ EvaluationMetrics +from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn import os import sklearn.datasets @@ -257,64 +258,140 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): ] -class TestEvauator1(ModelEvaluator): +class FakeEvauator1(ModelEvaluator): pass -class TestEvauator2(ModelEvaluator): +class FakeEvauator2(ModelEvaluator): pass -class TestArtifact1(EvaluationArtifact): +class FakeArtifact1(EvaluationArtifact): pass -class TestArtifact2(EvaluationArtifact): +class FakeArtifact2(EvaluationArtifact): pass def test_evaluator_interface(classifier_model_uri, iris_dataset): - with mock.patch.object( _model_evaluation_registry, - '_registry', {'test_evaluator1': TestEvauator1, 'test_evaluator2': TestEvauator2} + '_registry', {'test_evaluator1': FakeEvauator1} ): - with mlflow.start_run() as run: - mlflow.sklearn.log_model(classifier_model_uri, "clf_model") - classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") - evaluator1_config = {'eval1_confg_a': 3, 'eval1_confg_b': 4} evaluator1_return_value = EvaluationResult( metrics=EvaluationMetrics({'m1': 5, 'm2': 6}), - artifacts={'a1': TestArtifact1(), 'a2': TestArtifact2()} + artifacts={'a1': FakeArtifact1(uri='uri1'), 'a2': FakeArtifact2(uri='uri2')} ) with mock.patch.object( - TestEvauator1, 'can_evaluate', return_value=False + FakeEvauator1, 'can_evaluate', return_value=False ) as mock_can_evaluate, mock.patch.object( - TestEvauator1, 'evaluate', return_value=evaluator1_return_value + FakeEvauator1, 'evaluate', return_value=evaluator1_return_value ) as mock_evaluate: - with mlflow.start_run() as run: + with mlflow.start_run(): evaluate( classifier_model_uri, 'classifier', iris_dataset, run_id=None, evaluators='test_evaluator1', evaluator_config=evaluator1_config) mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) mock_evaluate.assert_not_called() with mock.patch.object( - TestEvauator1, 'can_evaluate', return_value=True + FakeEvauator1, 'can_evaluate', return_value=True ) as mock_can_evaluate, mock.patch.object( - TestEvauator1, 'evaluate' + FakeEvauator1, 'evaluate', return_value=evaluator1_return_value ) as mock_evaluate: classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( classifier_model, 'classifier', iris_dataset, - run_id=None, evaluators='test_evaluator1', evaluator_config=evaluator1_config) + run_id=None, evaluators='test_evaluator1', + evaluator_config=evaluator1_config + ) + assert eval1_result.metrics == evaluator1_return_value.metrics + assert eval1_result.artifacts == evaluator1_return_value.artifacts + mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) mock_evaluate.assert_called_once_with( classifier_model, 'classifier', iris_dataset, run.info.run_id, evaluator1_config ) - assert eval1_result.metrics == evaluator1_return_value.metrics - assert eval1_result.artifacts == 
evaluator1_return_value.artifacts +def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): + with mock.patch.object( + _model_evaluation_registry, + '_registry', {'test_evaluator1': FakeEvauator1, 'test_evaluator2': FakeEvauator2} + ): + evaluator1_config = {'eval1_confg': 3} + evaluator2_config = {'eval2_confg': 4} + evaluator1_return_value = EvaluationResult( + metrics=EvaluationMetrics({'m1': 5}), + artifacts={'a1': FakeArtifact1(uri='uri1')} + ) + evaluator2_return_value = EvaluationResult( + metrics=EvaluationMetrics({'m2': 6}), + artifacts={'a2': FakeArtifact2(uri='uri2')} + ) + with mock.patch.object( + FakeEvauator1, 'can_evaluate', return_value=True + ) as mock_can_evaluate1, mock.patch.object( + FakeEvauator1, 'evaluate', return_value=evaluator1_return_value + ) as mock_evaluate1, mock.patch.object( + FakeEvauator2, 'can_evaluate', return_value=True + ) as mock_can_evaluate2, mock.patch.object( + FakeEvauator2, 'evaluate', return_value=evaluator2_return_value + ) as mock_evaluate2: + classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + with mlflow.start_run() as run: + eval_result = evaluate( + classifier_model, 'classifier', iris_dataset, + run_id=None, evaluators=['test_evaluator1', 'test_evaluator2'], + evaluator_config={ + 'test_evaluator1': evaluator1_config, + 'test_evaluator2': evaluator2_config + } + ) + assert eval_result.metrics == { + **evaluator1_return_value.metrics, + **evaluator2_return_value.metrics, + } + assert eval_result.artifacts == { + **evaluator1_return_value.artifacts, + **evaluator2_return_value.artifacts, + } + mock_can_evaluate1.assert_called_once_with('classifier', evaluator1_config) + mock_evaluate1.assert_called_once_with( + classifier_model, 'classifier', iris_dataset, run.info.run_id, + evaluator1_config + ) + mock_can_evaluate2.assert_called_once_with('classifier', evaluator2_config) + mock_evaluate2.assert_called_once_with( + classifier_model, 'classifier', iris_dataset, run.info.run_id, + evaluator2_config + ) + + +def test_start_run_or_reuse_active_run(): + with _start_run_or_reuse_active_run(run_id=None) as run_id: + assert mlflow.active_run().info.run_id == run_id + + with mlflow.start_run() as run: + pass + previous_run_id = run.info.run_id + + with _start_run_or_reuse_active_run(run_id=previous_run_id) as run_id: + assert previous_run_id == run_id + assert mlflow.active_run().info.run_id == run_id + + with mlflow.start_run() as run: + active_run_id = run.info.run_id + + with _start_run_or_reuse_active_run(run_id=None) as run_id: + assert run_id == active_run_id + + with _start_run_or_reuse_active_run(run_id=active_run_id) as run_id: + assert run_id == active_run_id + + with pytest.raises(ValueError, match='An active run exists'): + with _start_run_or_reuse_active_run(run_id=previous_run_id): + pass From a51bd1f6070f42e9c5ce1131180a5e5a6d1d04d8 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 3 Dec 2021 17:23:26 +0800 Subject: [PATCH 053/120] add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 8 +- tests/models/test_evaluation.py | 121 +++++++++++++++++++------------ 2 files changed, 79 insertions(+), 50 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index bdee56ea542c9..3c7dcdbd12df3 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -282,9 +282,7 @@ def _array_like_obj_to_bytes(data): import pandas as pd if isinstance(data, pd.DataFrame): - return 
EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_pandas_object(data) - ) + ",".join(data.columns).encode("UTF-8") + return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) elif isinstance(data, np.ndarray): return EvaluationDataset._convert_uint64_ndarray_to_bytes( hash_array(data.flatten(order="C")) @@ -345,8 +343,10 @@ def hash(self): EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) elif isinstance(self.data, pd.DataFrame): + column_names = ",".join(self.data.columns) + meta_str = f"columns={column_names}\nlabels={self.labels}" + md5_gen.update(meta_str.encode("UTF-8")) EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - md5_gen.update(self.labels.encode("UTF-8")) self._hash = md5_gen.hexdigest() return self._hash diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 7f7e8f96ba765..cb7121fdadd71 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,8 +1,14 @@ import mlflow -from mlflow.models.evaluation import \ - evaluate, EvaluationDataset, EvaluationResult, ModelEvaluator, EvaluationArtifact, \ - EvaluationMetrics +from mlflow.models.evaluation import ( + evaluate, + EvaluationDataset, + EvaluationResult, + ModelEvaluator, + EvaluationArtifact, + EvaluationMetrics, +) +import hashlib from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn import os @@ -207,9 +213,26 @@ def test_dataset_name(): assert d2.name == d2.hash +def test_gen_md5_for_arraylike_obj(): + def get_md5(data): + md5_gen = hashlib.md5() + EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data) + return md5_gen.hexdigest() + + list0 = list(range(20)) + list1 = [100] + list0[1:] + list2 = list0[:-1] + [100] + list3 = list0[:10] + [100] + list0[10:] + + assert 4 == len({get_md5(list0), get_md5(list1), get_md5(list2), get_md5(list3)}) + + list4 = list0[:10] + [99] + list0[10:] + assert get_md5(list3) == get_md5(list4) + + def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): assert iris_dataset.hash == "c7417e63a9ce038a32f37ecd7fb829f6" - assert iris_pandas_df_dataset.hash == "e796a0b1e0bef0fc06b4b1ad62e3ea63" + assert iris_pandas_df_dataset.hash == "d06cfb6352dba29afe514d9be87021aa" def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): @@ -276,80 +299,88 @@ class FakeArtifact2(EvaluationArtifact): def test_evaluator_interface(classifier_model_uri, iris_dataset): with mock.patch.object( - _model_evaluation_registry, - '_registry', {'test_evaluator1': FakeEvauator1} + _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1} ): - evaluator1_config = {'eval1_confg_a': 3, 'eval1_confg_b': 4} + evaluator1_config = {"eval1_confg_a": 3, "eval1_confg_b": 4} evaluator1_return_value = EvaluationResult( - metrics=EvaluationMetrics({'m1': 5, 'm2': 6}), - artifacts={'a1': FakeArtifact1(uri='uri1'), 'a2': FakeArtifact2(uri='uri2')} + metrics=EvaluationMetrics({"m1": 5, "m2": 6}), + artifacts={"a1": FakeArtifact1(uri="uri1"), "a2": FakeArtifact2(uri="uri2")}, ) with mock.patch.object( - FakeEvauator1, 'can_evaluate', return_value=False + FakeEvauator1, "can_evaluate", return_value=False ) as mock_can_evaluate, mock.patch.object( - FakeEvauator1, 'evaluate', return_value=evaluator1_return_value + FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: with mlflow.start_run(): evaluate( - classifier_model_uri, 'classifier', 
iris_dataset, - run_id=None, evaluators='test_evaluator1', evaluator_config=evaluator1_config) - mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) + classifier_model_uri, + "classifier", + iris_dataset, + run_id=None, + evaluators="test_evaluator1", + evaluator_config=evaluator1_config, + ) + mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) mock_evaluate.assert_not_called() with mock.patch.object( - FakeEvauator1, 'can_evaluate', return_value=True + FakeEvauator1, "can_evaluate", return_value=True ) as mock_can_evaluate, mock.patch.object( - FakeEvauator1, 'evaluate', return_value=evaluator1_return_value + FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( - classifier_model, 'classifier', iris_dataset, - run_id=None, evaluators='test_evaluator1', - evaluator_config=evaluator1_config + classifier_model, + "classifier", + iris_dataset, + run_id=None, + evaluators="test_evaluator1", + evaluator_config=evaluator1_config, ) assert eval1_result.metrics == evaluator1_return_value.metrics assert eval1_result.artifacts == evaluator1_return_value.artifacts - mock_can_evaluate.assert_called_once_with('classifier', evaluator1_config) + mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) mock_evaluate.assert_called_once_with( - classifier_model, 'classifier', iris_dataset, run.info.run_id, - evaluator1_config + classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator1_config ) def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): with mock.patch.object( - _model_evaluation_registry, - '_registry', {'test_evaluator1': FakeEvauator1, 'test_evaluator2': FakeEvauator2} + _model_evaluation_registry, + "_registry", + {"test_evaluator1": FakeEvauator1, "test_evaluator2": FakeEvauator2}, ): - evaluator1_config = {'eval1_confg': 3} - evaluator2_config = {'eval2_confg': 4} + evaluator1_config = {"eval1_confg": 3} + evaluator2_config = {"eval2_confg": 4} evaluator1_return_value = EvaluationResult( - metrics=EvaluationMetrics({'m1': 5}), - artifacts={'a1': FakeArtifact1(uri='uri1')} + metrics=EvaluationMetrics({"m1": 5}), artifacts={"a1": FakeArtifact1(uri="uri1")} ) evaluator2_return_value = EvaluationResult( - metrics=EvaluationMetrics({'m2': 6}), - artifacts={'a2': FakeArtifact2(uri='uri2')} + metrics=EvaluationMetrics({"m2": 6}), artifacts={"a2": FakeArtifact2(uri="uri2")} ) with mock.patch.object( - FakeEvauator1, 'can_evaluate', return_value=True + FakeEvauator1, "can_evaluate", return_value=True ) as mock_can_evaluate1, mock.patch.object( - FakeEvauator1, 'evaluate', return_value=evaluator1_return_value + FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate1, mock.patch.object( - FakeEvauator2, 'can_evaluate', return_value=True + FakeEvauator2, "can_evaluate", return_value=True ) as mock_can_evaluate2, mock.patch.object( - FakeEvauator2, 'evaluate', return_value=evaluator2_return_value + FakeEvauator2, "evaluate", return_value=evaluator2_return_value ) as mock_evaluate2: classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, 'classifier', iris_dataset, - run_id=None, evaluators=['test_evaluator1', 'test_evaluator2'], + classifier_model, + "classifier", + iris_dataset, + run_id=None, + evaluators=["test_evaluator1", 
"test_evaluator2"], evaluator_config={ - 'test_evaluator1': evaluator1_config, - 'test_evaluator2': evaluator2_config - } + "test_evaluator1": evaluator1_config, + "test_evaluator2": evaluator2_config, + }, ) assert eval_result.metrics == { **evaluator1_return_value.metrics, @@ -359,15 +390,13 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): **evaluator1_return_value.artifacts, **evaluator2_return_value.artifacts, } - mock_can_evaluate1.assert_called_once_with('classifier', evaluator1_config) + mock_can_evaluate1.assert_called_once_with("classifier", evaluator1_config) mock_evaluate1.assert_called_once_with( - classifier_model, 'classifier', iris_dataset, run.info.run_id, - evaluator1_config + classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator1_config ) - mock_can_evaluate2.assert_called_once_with('classifier', evaluator2_config) + mock_can_evaluate2.assert_called_once_with("classifier", evaluator2_config) mock_evaluate2.assert_called_once_with( - classifier_model, 'classifier', iris_dataset, run.info.run_id, - evaluator2_config + classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator2_config ) @@ -392,6 +421,6 @@ def test_start_run_or_reuse_active_run(): with _start_run_or_reuse_active_run(run_id=active_run_id) as run_id: assert run_id == active_run_id - with pytest.raises(ValueError, match='An active run exists'): + with pytest.raises(ValueError, match="An active run exists"): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass From c82eaa6ae05b9b644293ad13d0c9712bd14b723c Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 3 Dec 2021 17:51:41 +0800 Subject: [PATCH 054/120] fix lint Signed-off-by: Weichen Xu --- mlflow/models/__init__.py | 4 ++-- mlflow/models/evaluation/base.py | 2 +- tests/models/test_evaluation.py | 24 +++++++++++++++---- .../mlflow_test_plugin/dummy_evaluator.py | 4 +--- tests/resources/mlflow-test-plugin/setup.py | 2 +- 5 files changed, 25 insertions(+), 11 deletions(-) diff --git a/mlflow/models/__init__.py b/mlflow/models/__init__.py index b1f73ec16abe6..c40f249b23cc0 100644 --- a/mlflow/models/__init__.py +++ b/mlflow/models/__init__.py @@ -37,8 +37,8 @@ # Under skinny-mlflow requirements, the following packages cannot be imported # because of lack of numpy/pandas library, so wrap them with try...except block try: - from .signature import ModelSignature, infer_signature - from .utils import ModelInputExample + from .signature import ModelSignature, infer_signature # pylint: disable=unused-import + from .utils import ModelInputExample # pylint: disable=unused-import __all__ += [ "ModelSignature", diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3c7dcdbd12df3..d5605e05041c2 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -6,7 +6,7 @@ from contextlib import contextmanager from mlflow.exceptions import MlflowException from mlflow.utils.file_utils import TempDir -from mlflow.entities import Metric, RunTag +from mlflow.entities import RunTag from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index cb7121fdadd71..440576d5d9341 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -282,19 +282,35 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): 
class FakeEvauator1(ModelEvaluator): - pass + def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + raise RuntimeError() + + def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + raise RuntimeError() class FakeEvauator2(ModelEvaluator): - pass + def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + raise RuntimeError() + + def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + raise RuntimeError() class FakeArtifact1(EvaluationArtifact): - pass + def save(self, output_artifact_path): + raise RuntimeError() + + def _load_content_from_file(self, local_artifact_path): + raise RuntimeError() class FakeArtifact2(EvaluationArtifact): - pass + def save(self, output_artifact_path): + raise RuntimeError() + + def _load_content_from_file(self, local_artifact_path): + raise RuntimeError() def test_evaluator_interface(classifier_model_uri, iris_dataset): diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 2a048ee17c162..d463dfbde1603 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -4,13 +4,11 @@ EvaluationMetrics, EvaluationArtifact, EvaluationResult, - EvaluationDataset, ) from mlflow.tracking.artifact_utils import get_artifact_uri -from mlflow.entities import Metric, RunTag +from mlflow.entities import Metric from sklearn import metrics as sk_metrics import time -import numpy as np import pandas as pd import io diff --git a/tests/resources/mlflow-test-plugin/setup.py b/tests/resources/mlflow-test-plugin/setup.py index b5ac9cfa49f36..8bad5b86d53f9 100644 --- a/tests/resources/mlflow-test-plugin/setup.py +++ b/tests/resources/mlflow-test-plugin/setup.py @@ -26,6 +26,6 @@ "mlflow.project_backend": "dummy-backend=mlflow_test_plugin.dummy_backend:PluginDummyProjectBackend", # pylint: disable=line-too-long # Define a MLflow model deployment plugin for target 'faketarget' "mlflow.deployments": "faketarget=mlflow_test_plugin.fake_deployment_plugin", - "mlflow.model_evaluator": "dummy_evaluator=mlflow_test_plugin.dummy_evaluator:DummyEvaluator", + "mlflow.model_evaluator": "dummy_evaluator=mlflow_test_plugin.dummy_evaluator:DummyEvaluator", # pylint: disable=line-too-long }, ) From 96709eb8793bf5f8e0416be163174e6baa0583f9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 3 Dec 2021 23:15:19 +0800 Subject: [PATCH 055/120] update shap explainer Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 300 ++++++++++++++++++ .../models/evaluation/evaluator_registry.py | 2 + 2 files changed, 302 insertions(+) create mode 100644 mlflow/models/evaluation/default_evaluator.py diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py new file mode 100644 index 0000000000000..32e2ca99ba81b --- /dev/null +++ b/mlflow/models/evaluation/default_evaluator.py @@ -0,0 +1,300 @@ +import mlflow +from mlflow.models.evaluation.base import ( + ModelEvaluator, + EvaluationMetrics, + EvaluationArtifact, + EvaluationResult, +) +from mlflow.entities.metric import Metric +from mlflow.utils.file_utils import TempDir +from mlflow.tracking.artifact_utils import get_artifact_uri +from sklearn import metrics as sk_metrics +import matplotlib.pyplot as pyplot +import scikitplot +import shap +import shap.maskers +import math +import pandas as 
pd +import time + +shap.initjs() + +""" +[P0] Accuracy: Calculates how often predictions equal labels. +[P0] BinaryCrossentropy: Computes the crossentropy metric between the labels and predictions. +[P0] Hinge: Computes the hinge metric between y_true and y_pred. +[P0] Sum: Computes the (weighted) sum of the given values. +[P0] Mean: Computes the (weighted) mean of the given values. +[P0] ExampleCount: Computes the total number of evaluation examples. +[P0] MeanAbsoluteError: Computes the mean absolute error between the labels and predictions. +[P0] MeanSquaredError: Computes the mean squared error between y_true and y_pred. +[P0] RootMeanSquaredError: Computes root mean squared error metric between y_true and y_pred. + +[P0] TrueNegatives: Calculates the number of true negatives. +[P0] TruePositives: Calculates the number of true positives. +[P0] FalseNegatives: Calculates the number of false negatives. +[P0] FalsePositives: Calculates the number of false positives. +https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix + +[P0] Precision: Computes the precision of the predictions with respect to the labels. +[P0] Recall: Computes the recall of the predictions with respect to the labels. +[P0] AUC: Approximates the AUC (Area under the curve) of the ROC or PR curves. +[P0] F1 Score: 2*precision*recall / (precision+recall) + +[P0] BinaryClassConfusionMatrix + +Plots +[P0] Confusion matrix +[P0] Interactive ROC curve with metrics (TP/TN/FP/FN/Acc/F1/AUC), binary classification +[P0] Lift chart + +Global explainability +[P0] Model built-in feature importance (supported models) +[P0] SHAP explainers + [P0] Summary plot +""" + +from PIL.Image import Image, open as open_image + + +class ImageEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.save(output_artifact_path) + + def _load_content_from_file(self, local_artifact_path): + self._content = open_image(local_artifact_path) + return self._content + + +class CsvEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): + self._content = pd.read_csv(local_artifact_path) + return self._content + + +_MIN_SAMPLE_ROWS_FOR_SHAP = 2000 + + +class DefaultEvaluator(ModelEvaluator): + def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + return model_type in ["classifier", "regressor"] + + def _log_metrics(self, run_id, metrics, dataset_name): + """ + Helper method to log metrics into specified run. 
+ """ + client = mlflow.tracking.MlflowClient() + timestamp = int(time.time() * 1000) + client.log_batch( + run_id, + metrics=[ + Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) + for key, value in metrics.items() + ], + ) + + def _log_image_artifact( + self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name + ): + client = mlflow.tracking.MlflowClient() + pyplot.clf() + do_plot() + artifact_file_name = f"{artifact_name}_on_{dataset_name}.png" + artifact_file_local_path = temp_dir.path(artifact_file_name) + pyplot.savefig(artifact_file_local_path) + client.log_artifact(run_id, artifact_file_local_path) + artifact = ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) + artifact.load(artifact_file_local_path) + artifacts[artifact_file_name] = artifact + + def _log_pandas_df_artifact( + self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name + ): + client = mlflow.tracking.MlflowClient() + artifact_file_name = f"{artifact_name}_on_{dataset_name}.csv" + artifact_file_local_path = temp_dir.path(artifact_file_name) + pandas_df.to_csv(artifact_file_local_path, index=False) + client.log_artifact(run_id, artifact_file_local_path) + artifact = CsvEvaluationArtifact( + uri=get_artifact_uri(run_id, artifact_file_name), + content=pandas_df, + ) + artifact.load(artifact_file_local_path) + artifacts[artifact_file_name] = artifact + + def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio): + model_loader_module = model.metadata.flavors['python_function']["loader_module"] + if model_loader_module == 'mlflow.sklearn': + raw_model = model._model_impl + elif model_loader_module == 'mlflow.lightgbm': + raw_model = model.lgb_model + elif model_loader_module == 'mlflow.xgboost': + raw_model = model.xgb_model + else: + raw_model = None + + if len(X) < _MIN_SAMPLE_ROWS_FOR_SHAP: + sample_rows = len(X) + else: + sample_rows = max(int(len(X) * sample_ratio), _MIN_SAMPLE_ROWS_FOR_SHAP) + + sampled_X = shap.sample(X, sample_rows) + if raw_model: + maskers = shap.maskers.Independent(sampled_X) + if shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Linear(raw_model, maskers) + elif shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Tree(raw_model, maskers) + elif shap.explainers.Additive.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Additive(raw_model, maskers) + else: + explainer = shap.explainers.Sampling(model.predict, X) + else: + explainer = shap.explainers.Sampling(model.predict, X) + + if isinstance(explainer, shap.explainers.Sampling): + shap_values = explainer(X, sample_rows) + else: + shap_values = explainer(sampled_X) + + def plot_summary(): + shap.plots.beeswarm(shap_values, show=False) + + self._log_image_artifact( + artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name + ) + + def plot_feature_importance(): + shap.plots.bar(shap_values, show=False) + + self._log_image_artifact( + artifacts, + temp_dir, + plot_feature_importance, + run_id, + "shap_feature_importance", + dataset_name, + ) + + def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 + label_list = sorted(list(set(y))) + assert label_list[0] >= 0, "Evaluation dataset labels must be positive integers." 
+ max_label = label_list[-1] + num_classes = max_label + 1 + + y_pred = model.predict(X) + + is_binomial = num_classes <= 2 + + metrics = EvaluationMetrics() + artifacts = {} + metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) + metrics["example_count"] = len(X) + + # TODO: sum/mean on what data ? + # [P0] Sum: Computes the (weighted) sum of the given values. + # [P0] Mean: Computes the (weighted) mean of the given values. + + if is_binomial: + if hasattr(model, "predict_proba"): + y_probs = model.predict_proba(X) + y_prob = y_probs[:, 1] + else: + y_probs = None + y_prob = None + + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + + # TODO: + # compute hinge loss, this requires calling decision_function of the model + # e.g., see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function + + if y_probs is not None: + metrics["roc_auc"] = sk_metrics.roc_auc_score(y, y_prob) + fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_prob) + roc_curve_pandas_df = pd.DataFrame( + {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} + ) + self._log_pandas_df_artifact( + artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name + ) + + roc_auc = sk_metrics.auc(fpr, tpr) + metrics["precision_recall_auc"] = roc_auc + + def plot_roc_curve(): + sk_metrics.RocCurveDisplay( + fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator" + ).plot() + + self._log_image_artifact( + artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name + ) + + def plot_lift_curve(): + scikitplot.metrics.plot_lift_curve(y, y_probs) + + self._log_image_artifact( + artifacts, temp_dir, plot_lift_curve, run_id, "lift_curve", dataset_name + ) + + def plot_confusion_matrix(): + sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred, normalize='all') + + self._log_image_artifact( + artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name + ) + + self._log_metrics(run_id, metrics, dataset_name) + + if evaluator_config.get('log_model_explainality', True): + sample_ratio = evaluator_config.get('sample_ratio_for_calc_model_explainality', 1.0) + self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio) + + return EvaluationResult(metrics, artifacts) + + def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + metrics = EvaluationMetrics() + artifacts = {} + y_pred = model.predict(X) + metrics["example_count"] = len(X) + metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) + metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) + metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) + self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) + self._log_metrics(run_id, metrics, dataset_name) + return EvaluationResult(metrics, artifacts) + + def evaluate( + self, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config, + **kwargs, + ): + with TempDir() as temp_dir: + X, y = dataset._extract_features_and_labels() + if model_type == "classifier": + 
return self._evaluate_classifier( + temp_dir, model, X, y, dataset.name, run_id, evaluator_config + ) + elif model_type == "regressor": + return self._evaluate_regressor( + temp_dir, model, X, y, dataset.name, run_id, evaluator_config + ) + else: + raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index 9e7f027f0496e..cb10fc128226a 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -48,6 +48,8 @@ def get_evaluator(self, evaluator_name): def register_evaluators(module): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + module._model_evaluation_registry.register('default', DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() From dc1b32d2edfcd8fcb36f794da52b325252eefd32 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 8 Dec 2021 21:41:55 +0800 Subject: [PATCH 056/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 32e2ca99ba81b..69893de7758fb 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -125,7 +125,9 @@ def _log_pandas_df_artifact( artifact.load(artifact_file_local_path) artifacts[artifact_file_name] = artifact - def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio): + def _log_model_explainality( + self, artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio, algorithm + ): model_loader_module = model.metadata.flavors['python_function']["loader_module"] if model_loader_module == 'mlflow.sklearn': raw_model = model._model_impl @@ -142,18 +144,22 @@ def _log_model_explainality(self, artifacts, temp_dir, model, X, dataset_name, r sample_rows = max(int(len(X) * sample_ratio), _MIN_SAMPLE_ROWS_FOR_SHAP) sampled_X = shap.sample(X, sample_rows) - if raw_model: - maskers = shap.maskers.Independent(sampled_X) - if shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Linear(raw_model, maskers) - elif shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Tree(raw_model, maskers) - elif shap.explainers.Additive.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Additive(raw_model, maskers) + + if algorithm: + explainer = shap.Explainer(model.predict, X, algorithm=algorithm) + else: + if raw_model: + maskers = shap.maskers.Independent(sampled_X) + if shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Linear(raw_model, maskers) + elif shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Tree(raw_model, maskers) + elif shap.explainers.Additive.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Additive(raw_model, maskers) + else: + explainer = shap.explainers.Sampling(model.predict, X) else: explainer = shap.explainers.Sampling(model.predict, X) - else: - explainer = shap.explainers.Sampling(model.predict, X) if isinstance(explainer, shap.explainers.Sampling): shap_values = explainer(X, sample_rows) @@ -182,9 +188,8 @@ def plot_feature_importance(): def _evaluate_classifier(self, 
temp_dir, model, X, y, dataset_name, run_id, evaluator_config): # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 label_list = sorted(list(set(y))) - assert label_list[0] >= 0, "Evaluation dataset labels must be positive integers." - max_label = label_list[-1] - num_classes = max_label + 1 + assert label_list[0] == 0, "Label values must being at '0'." + num_classes = len(label_list) y_pred = model.predict(X) @@ -251,7 +256,7 @@ def plot_lift_curve(): ) def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred, normalize='all') + sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred) self._log_image_artifact( artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name From 11a388b23997a9dbfbc055dbe7152813cd3a55f9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 9 Dec 2021 22:06:03 +0800 Subject: [PATCH 057/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 52 +++++++++++++++---- mlflow/pyfunc/__init__.py | 4 +- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 69893de7758fb..81cccfa9714da 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -16,6 +16,7 @@ import math import pandas as pd import time +from functools import partial shap.initjs() @@ -126,9 +127,15 @@ def _log_pandas_df_artifact( artifacts[artifact_file_name] = artifact def _log_model_explainality( - self, artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio, algorithm + self, artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config ): + sample_ratio = evaluator_config.get('explainality.sample_ratio', 1.0) + algorithm = evaluator_config.get('explainality.algorithm', None) + model_loader_module = model.metadata.flavors['python_function']["loader_module"] + + predict_fn = model.predict + if model_loader_module == 'mlflow.sklearn': raw_model = model._model_impl elif model_loader_module == 'mlflow.lightgbm': @@ -138,6 +145,13 @@ def _log_model_explainality( else: raw_model = None + try: + import xgboost + if raw_model is not None and isinstance(raw_model, xgboost.XGBModel): + predict_fn = partial(predict_fn, validate_features=False) + except ImportError: + pass + if len(X) < _MIN_SAMPLE_ROWS_FOR_SHAP: sample_rows = len(X) else: @@ -145,21 +159,34 @@ def _log_model_explainality( sampled_X = shap.sample(X, sample_rows) + feature_names = list(X.columns) if algorithm: - explainer = shap.Explainer(model.predict, X, algorithm=algorithm) + explainer = shap.Explainer( + predict_fn, sampled_X, feature_names=feature_names, algorithm=algorithm + ) else: if raw_model: maskers = shap.maskers.Independent(sampled_X) if shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Linear(raw_model, maskers) + explainer = shap.explainers.Linear( + raw_model, maskers, feature_names=feature_names + ) elif shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Tree(raw_model, maskers) + explainer = shap.explainers.Tree( + raw_model, maskers, feature_names=feature_names + ) elif shap.explainers.Additive.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Additive(raw_model, maskers) + explainer = shap.explainers.Additive( + raw_model, maskers, feature_names=feature_names + ) else: - explainer = 
shap.explainers.Sampling(model.predict, X) - else: - explainer = shap.explainers.Sampling(model.predict, X) + # fallback to default sampling explainer + pass + + if not explainer: + explainer = shap.explainers.Sampling( + predict_fn, X, feature_names=feature_names + ) if isinstance(explainer, shap.explainers.Sampling): shap_values = explainer(X, sample_rows) @@ -265,8 +292,9 @@ def plot_confusion_matrix(): self._log_metrics(run_id, metrics, dataset_name) if evaluator_config.get('log_model_explainality', True): - sample_ratio = evaluator_config.get('sample_ratio_for_calc_model_explainality', 1.0) - self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id, sample_ratio) + self._log_model_explainality( + artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config + ) return EvaluationResult(metrics, artifacts) @@ -278,7 +306,9 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evalu metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) - self._log_model_explainality(artifacts, temp_dir, model, X, dataset_name, run_id) + self._log_model_explainality( + artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config + ) self._log_metrics(run_id, metrics, dataset_name) return EvaluationResult(metrics, artifacts) diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index a5d772a9d54fd..5f3a0d03a261f 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -586,7 +586,7 @@ def __init__(self, model_meta: Model, model_impl: Any): self._model_meta = model_meta self._model_impl = model_impl - def predict(self, data: PyFuncInput) -> PyFuncOutput: + def predict(self, data: PyFuncInput, **kwargs) -> PyFuncOutput: """ Generate model predictions. 
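# Editor's note (illustrative usage, not part of the patch): with the configuration keys
# this revision reads, a caller could steer the SHAP step roughly as below. The key names
# match this revision only (later patches in the series rename them, e.g. to
# "explainability_nsamples"), and "permutation" is just one valid shap.Explainer algorithm.
#
#   mlflow.models.evaluation.evaluate(
#       model, "classifier", dataset, run_id=None, evaluators="default",
#       evaluator_config={
#           "log_model_explainality": True,
#           "explainality.algorithm": "permutation",
#           "explainality.sample_ratio": 0.5,
#       },
#   )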
@@ -602,7 +602,7 @@ def predict(self, data: PyFuncInput) -> PyFuncOutput: input_schema = self.metadata.get_input_schema() if input_schema is not None: data = _enforce_schema(data, input_schema) - return self._model_impl.predict(data) + return self._model_impl.predict(data, **kwargs) @property def metadata(self): From be4724f3ee7180521dedadeb168acec76029c451 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 9 Dec 2021 22:55:23 +0800 Subject: [PATCH 058/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 8 ++ mlflow/models/evaluation/default_evaluator.py | 91 +++++++++---------- mlflow/pyfunc/__init__.py | 4 +- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1c88ecd821c89..6ba067ed2d4f8 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -320,6 +320,14 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): ) ) + @property + def feature_names(self): + import numpy as np + if isinstance(self.data, np.ndarray): + return [f'f{i}' for i in range(self.data.shape[1])] + else: + return [c for c in self.data.columns if c != self.labels] + @property def name(self): """ diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 81cccfa9714da..aa53e33efefd4 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -76,7 +76,7 @@ def _load_content_from_file(self, local_artifact_path): return self._content -_MIN_SAMPLE_ROWS_FOR_SHAP = 2000 +_DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 class DefaultEvaluator(ModelEvaluator): @@ -127,9 +127,9 @@ def _log_pandas_df_artifact( artifacts[artifact_file_name] = artifact def _log_model_explainality( - self, artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config + self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ): - sample_ratio = evaluator_config.get('explainality.sample_ratio', 1.0) + sample_rows = evaluator_config.get('explainality.nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) algorithm = evaluator_config.get('explainality.algorithm', None) model_loader_module = model.metadata.flavors['python_function']["loader_module"] @@ -139,59 +139,54 @@ def _log_model_explainality( if model_loader_module == 'mlflow.sklearn': raw_model = model._model_impl elif model_loader_module == 'mlflow.lightgbm': - raw_model = model.lgb_model + raw_model = model._model_impl.lgb_model elif model_loader_module == 'mlflow.xgboost': - raw_model = model.xgb_model + raw_model = model._model_impl.xgb_model else: raw_model = None - try: - import xgboost - if raw_model is not None and isinstance(raw_model, xgboost.XGBModel): - predict_fn = partial(predict_fn, validate_features=False) - except ImportError: - pass - - if len(X) < _MIN_SAMPLE_ROWS_FOR_SHAP: - sample_rows = len(X) - else: - sample_rows = max(int(len(X) * sample_ratio), _MIN_SAMPLE_ROWS_FOR_SHAP) + if raw_model: + predict_fn = raw_model.predict + try: + import xgboost + if isinstance(raw_model, xgboost.XGBModel): + # Because shap evaluation will pass evaluation data in ndarray format + # (without feature names), if set validate_features=True it will raise error. 
+ predict_fn = partial(predict_fn, validate_features=False) + except ImportError: + pass sampled_X = shap.sample(X, sample_rows) - - feature_names = list(X.columns) if algorithm: explainer = shap.Explainer( predict_fn, sampled_X, feature_names=feature_names, algorithm=algorithm ) + shap_values = explainer(sampled_X) else: - if raw_model: - maskers = shap.maskers.Independent(sampled_X) - if shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Linear( - raw_model, maskers, feature_names=feature_names - ) - elif shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Tree( - raw_model, maskers, feature_names=feature_names - ) - elif shap.explainers.Additive.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Additive( - raw_model, maskers, feature_names=feature_names - ) - else: - # fallback to default sampling explainer - pass - - if not explainer: + maskers = shap.maskers.Independent(sampled_X) + if raw_model and shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Linear( + raw_model, maskers, feature_names=feature_names + ) + shap_values = explainer(X) + elif raw_model and shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): + explainer = shap.explainers.Tree( + raw_model, maskers, feature_names=feature_names + ) + shap_values = explainer(X) + elif raw_model and shap.explainers.Additive.supports_model_with_masker( + raw_model, maskers + ): + explainer = shap.explainers.Additive( + raw_model, maskers, feature_names=feature_names + ) + shap_values = explainer(X) + else: + # fallback to default sampling explainer explainer = shap.explainers.Sampling( predict_fn, X, feature_names=feature_names ) - - if isinstance(explainer, shap.explainers.Sampling): - shap_values = explainer(X, sample_rows) - else: - shap_values = explainer(sampled_X) + shap_values = explainer(X, sample_rows) def plot_summary(): shap.plots.beeswarm(shap_values, show=False) @@ -212,7 +207,7 @@ def plot_feature_importance(): dataset_name, ) - def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 label_list = sorted(list(set(y))) assert label_list[0] == 0, "Label values must being at '0'." 
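# Editor's note: a quick illustration (not part of the patch) of the feature_names
# defaulting rule the EvaluationDataset property added earlier in this patch applies:
# ndarray inputs get synthetic names f0..f{n-1}, DataFrame inputs reuse their columns
# minus the label column. Column names here are illustrative.
import numpy as np
import pandas as pd

data = np.arange(12).reshape(4, 3)
print([f"f{i}" for i in range(data.shape[1])])    # ['f0', 'f1', 'f2']

df = pd.DataFrame({"sepal_len": [5.1], "sepal_wid": [3.5], "label": [0]})
print([c for c in df.columns if c != "label"])    # ['sepal_len', 'sepal_wid']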
@@ -293,12 +288,12 @@ def plot_confusion_matrix(): if evaluator_config.get('log_model_explainality', True): self._log_model_explainality( - artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ) return EvaluationResult(metrics, artifacts) - def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evaluator_config): + def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): metrics = EvaluationMetrics() artifacts = {} y_pred = model.predict(X) @@ -307,7 +302,7 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, run_id, evalu metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) self._log_model_explainality( - artifacts, temp_dir, model, X, dataset_name, run_id, evaluator_config + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ) self._log_metrics(run_id, metrics, dataset_name) return EvaluationResult(metrics, artifacts) @@ -325,11 +320,11 @@ def evaluate( X, y = dataset._extract_features_and_labels() if model_type == "classifier": return self._evaluate_classifier( - temp_dir, model, X, y, dataset.name, run_id, evaluator_config + temp_dir, model, X, y, dataset.name, dataset.feature_names, run_id, evaluator_config ) elif model_type == "regressor": return self._evaluate_regressor( - temp_dir, model, X, y, dataset.name, run_id, evaluator_config + temp_dir, model, X, y, dataset.name, dataset.feature_names, run_id, evaluator_config ) else: raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index 5f3a0d03a261f..a5d772a9d54fd 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -586,7 +586,7 @@ def __init__(self, model_meta: Model, model_impl: Any): self._model_meta = model_meta self._model_impl = model_impl - def predict(self, data: PyFuncInput, **kwargs) -> PyFuncOutput: + def predict(self, data: PyFuncInput) -> PyFuncOutput: """ Generate model predictions. 
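# Editor's note: the two mlflow/pyfunc/__init__.py hunks around this point undo the
# **kwargs pass-through introduced by the previous patch; with the evaluator now calling
# the raw model's predict directly (via functools.partial above), the public
# PyFuncModel.predict signature can stay unchanged.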
@@ -602,7 +602,7 @@ def predict(self, data: PyFuncInput, **kwargs) -> PyFuncOutput: input_schema = self.metadata.get_input_schema() if input_schema is not None: data = _enforce_schema(data, input_schema) - return self._model_impl.predict(data, **kwargs) + return self._model_impl.predict(data) @property def metadata(self): From 14adf5da6bb78395aacd90b6bbf1fc67d0856556 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 9 Dec 2021 23:06:19 +0800 Subject: [PATCH 059/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index aa53e33efefd4..e57a7a55e1b37 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -163,7 +163,7 @@ def _log_model_explainality( ) shap_values = explainer(sampled_X) else: - maskers = shap.maskers.Independent(sampled_X) + maskers = shap.maskers.Independent(X) if raw_model and shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Linear( raw_model, maskers, feature_names=feature_names From 4c3865186cce4ad348f80cbcf88a45edefe5f33c Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 12 Dec 2021 19:18:36 +0800 Subject: [PATCH 060/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 + mlflow/models/evaluation/default_evaluator.py | 37 +++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 6ba067ed2d4f8..a12ea628c1325 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -10,6 +10,7 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string +from mlflow.utils.annotations import experimental import logging import struct @@ -469,6 +470,7 @@ def _start_run_or_reuse_active_run(run_id): yield active_run.info.run_id +@experimental def evaluate( model: Union[str, "mlflow.pyfunc.PyFuncModel"], model_type: str, diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index e57a7a55e1b37..5bb1468169648 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -18,8 +18,6 @@ import time from functools import partial -shap.initjs() - """ [P0] Accuracy: Calculates how often predictions equal labels. [P0] BinaryCrossentropy: Computes the crossentropy metric between the labels and predictions. @@ -83,7 +81,14 @@ class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] - def _log_metrics(self, run_id, metrics, dataset_name): + @staticmethod + def _gen_log_key(key, dataset_name, model): + if hasattr(model.metadata, 'model_uuid'): + return f'{key}_on_data_{dataset_name}_model_{model.metadata.model_uuid}' + else: + return f'{key}_on_data_{dataset_name}' + + def _log_metrics(self, run_id, metrics, dataset_name, model): """ Helper method to log metrics into specified run. 
""" @@ -92,7 +97,8 @@ def _log_metrics(self, run_id, metrics, dataset_name): client.log_batch( run_id, metrics=[ - Metric(key=f"{key}_on_{dataset_name}", value=value, timestamp=timestamp, step=0) + Metric(key=DefaultEvaluator._gen_log_key(key, dataset_name, model), + value=value, timestamp=timestamp, step=0) for key, value in metrics.items() ], ) @@ -126,11 +132,11 @@ def _log_pandas_df_artifact( artifact.load(artifact_file_local_path) artifacts[artifact_file_name] = artifact - def _log_model_explainality( + def _log_model_explainability( self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ): - sample_rows = evaluator_config.get('explainality.nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) - algorithm = evaluator_config.get('explainality.algorithm', None) + sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) + algorithm = evaluator_config.get('explainality_algorithm', None) model_loader_module = model.metadata.flavors['python_function']["loader_module"] @@ -284,10 +290,10 @@ def plot_confusion_matrix(): artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name ) - self._log_metrics(run_id, metrics, dataset_name) + self._log_metrics(run_id, metrics, dataset_name, model) - if evaluator_config.get('log_model_explainality', True): - self._log_model_explainality( + if evaluator_config.get('log_model_explainability', True): + self._log_model_explainability( artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ) @@ -301,10 +307,16 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) - self._log_model_explainality( + metrics['sum_on_label'] = sum(y) + metrics['mean_on_label'] = metrics['sum_on_label'] / metrics["example_count"] + metrics['r2_score'] = sk_metrics.r2_score(y, y_pred) + metrics['max_error'] = sk_metrics.max_error(y, y_pred) + metrics['mean_absolute_percentage_error'] = \ + sk_metrics.mean_absolute_percentage_error(y, y_pred) + self._log_model_explainability( artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ) - self._log_metrics(run_id, metrics, dataset_name) + self._log_metrics(run_id, metrics, dataset_name, model) return EvaluationResult(metrics, artifacts) def evaluate( @@ -316,6 +328,7 @@ def evaluate( evaluator_config, **kwargs, ): + shap.initjs() with TempDir() as temp_dir: X, y = dataset._extract_features_and_labels() if model_type == "classifier": From cbc0c32272092c459cb944f17214472c18ea3c0a Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 16:29:57 +0800 Subject: [PATCH 061/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 51 ++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index a12ea628c1325..ab86b7b93da15 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -511,27 +511,40 @@ def evaluate( from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry from mlflow.pyfunc import PyFuncModel - if not evaluators: - evaluators = list(_model_evaluation_registry._registry.keys()) - - if isinstance(evaluators, str): - evaluators = [evaluators] - if not 
(evaluator_config is None or isinstance(evaluator_config, dict)): + if evaluators is None: + evaluator_name_list = list(_model_evaluation_registry._registry.keys()) + if evaluator_config is not None: raise ValueError( - "If `evaluators` argument is a str, evaluator_config must be None or a dict." + 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' ) - evaluator_config = {evaluators[0]: evaluator_config} - elif isinstance(evaluators, list): - evaluators = set(evaluators) - if not ( - isinstance(evaluator_config, dict) - and all(k in evaluators and isinstance(v, dict) for k, v in evaluator_config.items()) - ): + evaluator_name_to_conf_map = {} + elif isinstance(evaluators, str): + if not (evaluator_config is None or isinstance(evaluator_config, dict)): raise ValueError( - "If `evaluators` argument is a evaluator name list, evaluator_config" - "must be a dict contains mapping from evaluator name to individual " - "evaluator config dict." + "If `evaluators` argument is the name of an evaluator, evaluator_config must be " + "None or a dict containing config items for the evaluator." ) + evaluator_name_list = [evaluators] + evaluator_name_to_conf_map = {evaluators[0]: evaluator_config} + elif isinstance(evaluators, list): + if evaluator_config is not None: + if not ( + isinstance(evaluator_config, dict) + and all(k in evaluators and isinstance(v, dict) + for k, v in evaluator_config.items()) + ): + raise ValueError( + "If `evaluators` argument is a evaluator name list, evaluator_config " + "must be a dict contains mapping from evaluator name to individual " + "evaluator config dict." + ) + evaluator_name_list = list(set(evaluators)) + evaluator_name_to_conf_map = evaluator_config or {} + else: + raise ValueError( + '`evaluators` argument must be None, a evaluator name string, or a list of ' + 'evalautor names.' + ) if isinstance(model, str): model = mlflow.pyfunc.load_model(model) @@ -548,8 +561,8 @@ def evaluate( dataset._log_dataset_tag(client, actual_run_id) eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config.get(evaluator_name) or {} + for evaluator_name in evaluator_name_list: + config = evaluator_name_to_conf_map.get(evaluator_name) or {} try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) except MlflowException: From 503d26edd3e6fac7c63a5e51c56dd5ee99d5b671 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 17:25:50 +0800 Subject: [PATCH 062/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 119 ++++++++++++------ mlflow/models/evaluation/default_evaluator.py | 45 ++++--- 2 files changed, 108 insertions(+), 56 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index ab86b7b93da15..05446a6f4d609 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -162,7 +162,7 @@ class EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__(self, data, labels, name=None, path=None): + def __init__(self, data, labels, name=None, path=None, feature_names=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. @@ -179,6 +179,8 @@ def __init__(self, data, labels, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) + + :param feature_names: (Optional) A list of the feature names. 
""" import numpy as np import pandas as pd @@ -220,6 +222,24 @@ def __init__(self, data, labels, name=None, path=None): self.path = path self._hash = None + if isinstance(self.data, np.ndarray): + num_features = self.data.shape[1] + if feature_names is not None: + feature_names = list(feature_names) + if num_features != len(feature_names): + raise ValueError('feature name list must be the same length with feature data.') + self._feature_names = feature_names + else: + self._feature_names = [f'f{i}' for i in range(num_features)] + else: + pd_column_names = [c for c in self.data.columns if c != self.labels] + if feature_names is not None: + feature_names = list(feature_names) + if pd_column_names != list(feature_names): + raise ValueError('feature names must match feature column names in the pandas ' + 'dataframe') + self._feature_names = pd_column_names + @property def data(self): """ @@ -323,11 +343,7 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): @property def feature_names(self): - import numpy as np - if isinstance(self.data, np.ndarray): - return [f'f{i}' for i in range(self.data.shape[1])] - else: - return [c for c in self.data.columns if c != self.labels] + return self._feature_names @property def name(self): @@ -470,6 +486,50 @@ def _start_run_or_reuse_active_run(run_id): yield active_run.info.run_id +def _normalize_evaluators_and_evaluator_config_args( + evaluators, + evaluator_config, +): + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + if evaluators is None: + evaluator_name_list = list(_model_evaluation_registry._registry.keys()) + if evaluator_config is not None: + raise ValueError( + 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' + ) + evaluator_name_to_conf_map = {} + elif isinstance(evaluators, str): + if not (evaluator_config is None or isinstance(evaluator_config, dict)): + raise ValueError( + "If `evaluators` argument is the name of an evaluator, evaluator_config must be " + "None or a dict containing config items for the evaluator." + ) + evaluator_name_list = [evaluators] + evaluator_name_to_conf_map = {evaluators[0]: evaluator_config} + elif isinstance(evaluators, list): + if evaluator_config is not None: + if not ( + isinstance(evaluator_config, dict) + and all(k in evaluators and isinstance(v, dict) + for k, v in evaluator_config.items()) + ): + raise ValueError( + "If `evaluators` argument is a evaluator name list, evaluator_config " + "must be a dict contains mapping from evaluator name to individual " + "evaluator config dict." + ) + evaluator_name_list = list(set(evaluators)) + evaluator_name_to_conf_map = evaluator_config or {} + else: + raise ValueError( + '`evaluators` argument must be None, a evaluator name string, or a list of ' + 'evalautor names.' + ) + + return evaluator_name_list, evaluator_name_to_conf_map + + @experimental def evaluate( model: Union[str, "mlflow.pyfunc.PyFuncModel"], @@ -506,46 +566,20 @@ def evaluate( a nested dictionary whose key is the evaluator name. :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing evaluation results. + + The default evaluator support 'regressor' and 'classifer' type model, the config item for + default evaluator includes: + - log_model_explainability: A boolean value representing whether to log model explainability. + Default value is True. + - explainality_algorithm: A string to specify the shap explainer algorithm. If not set, it will + choose the best fit explainer according to the model. 
+ - explainability_nsamples: The sample rows for calculating model explainability. + Default value is 2000. """ # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry from mlflow.pyfunc import PyFuncModel - if evaluators is None: - evaluator_name_list = list(_model_evaluation_registry._registry.keys()) - if evaluator_config is not None: - raise ValueError( - 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' - ) - evaluator_name_to_conf_map = {} - elif isinstance(evaluators, str): - if not (evaluator_config is None or isinstance(evaluator_config, dict)): - raise ValueError( - "If `evaluators` argument is the name of an evaluator, evaluator_config must be " - "None or a dict containing config items for the evaluator." - ) - evaluator_name_list = [evaluators] - evaluator_name_to_conf_map = {evaluators[0]: evaluator_config} - elif isinstance(evaluators, list): - if evaluator_config is not None: - if not ( - isinstance(evaluator_config, dict) - and all(k in evaluators and isinstance(v, dict) - for k, v in evaluator_config.items()) - ): - raise ValueError( - "If `evaluators` argument is a evaluator name list, evaluator_config " - "must be a dict contains mapping from evaluator name to individual " - "evaluator config dict." - ) - evaluator_name_list = list(set(evaluators)) - evaluator_name_to_conf_map = evaluator_config or {} - else: - raise ValueError( - '`evaluators` argument must be None, a evaluator name string, or a list of ' - 'evalautor names.' - ) - if isinstance(model, str): model = mlflow.pyfunc.load_model(model) elif isinstance(model, PyFuncModel): @@ -556,6 +590,9 @@ def evaluate( "an instance of `mlflow.pyfunc.PyFuncModel`." ) + evaluator_name_list, evaluator_name_to_conf_map = \ + _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) + with _start_run_or_reuse_active_run(run_id) as actual_run_id: client = mlflow.tracking.MlflowClient() dataset._log_dataset_tag(client, actual_run_id) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 5bb1468169648..620a3de8e24fc 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -9,14 +9,11 @@ from mlflow.utils.file_utils import TempDir from mlflow.tracking.artifact_utils import get_artifact_uri from sklearn import metrics as sk_metrics -import matplotlib.pyplot as pyplot -import scikitplot -import shap -import shap.maskers import math import pandas as pd import time from functools import partial +import logging """ [P0] Accuracy: Calculates how often predictions equal labels. 
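A minimal sketch of that accuracy definition, assuming equal-length numpy arrays
`y` and `y_pred` of integer class labels:

    >>> accuracy = (y_pred == y).mean()  # same value as sk_metrics.accuracy_score(y, y_pred)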
@@ -56,6 +53,9 @@ from PIL.Image import Image, open as open_image +_logger = logging.getLogger(__name__) + + class ImageEvaluationArtifact(EvaluationArtifact): def save(self, output_artifact_path): self._content.save(output_artifact_path) @@ -106,6 +106,7 @@ def _log_metrics(self, run_id, metrics, dataset_name, model): def _log_image_artifact( self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name ): + import matplotlib.pyplot as pyplot client = mlflow.tracking.MlflowClient() pyplot.clf() do_plot() @@ -135,6 +136,9 @@ def _log_pandas_df_artifact( def _log_model_explainability( self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ): + import shap + import shap.maskers + sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) algorithm = evaluator_config.get('explainality_algorithm', None) @@ -142,14 +146,19 @@ def _log_model_explainability( predict_fn = model.predict - if model_loader_module == 'mlflow.sklearn': - raw_model = model._model_impl - elif model_loader_module == 'mlflow.lightgbm': - raw_model = model._model_impl.lgb_model - elif model_loader_module == 'mlflow.xgboost': - raw_model = model._model_impl.xgb_model - else: + try: + if model_loader_module == 'mlflow.sklearn': + raw_model = model._model_impl + elif model_loader_module == 'mlflow.lightgbm': + raw_model = model._model_impl.lgb_model + elif model_loader_module == 'mlflow.xgboost': + raw_model = model._model_impl.xgb_model + else: + raw_model = None + except Exception as e: raw_model = None + _logger.warning(f'Raw model resolution fails unexpectedly on PyFuncModel {model!r}, ' + f'error message is {e}') if raw_model: predict_fn = raw_model.predict @@ -169,24 +178,24 @@ def _log_model_explainability( ) shap_values = explainer(sampled_X) else: - maskers = shap.maskers.Independent(X) + maskers = shap.maskers.Independent(sampled_X) if raw_model and shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Linear( raw_model, maskers, feature_names=feature_names ) - shap_values = explainer(X) + shap_values = explainer(sampled_X) elif raw_model and shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Tree( raw_model, maskers, feature_names=feature_names ) - shap_values = explainer(X) + shap_values = explainer(sampled_X) elif raw_model and shap.explainers.Additive.supports_model_with_masker( raw_model, maskers ): explainer = shap.explainers.Additive( raw_model, maskers, feature_names=feature_names ) - shap_values = explainer(X) + shap_values = explainer(sampled_X) else: # fallback to default sampling explainer explainer = shap.explainers.Sampling( @@ -194,6 +203,8 @@ def _log_model_explainability( ) shap_values = explainer(X, sample_rows) + _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') + def plot_summary(): shap.plots.beeswarm(shap_values, show=False) @@ -214,6 +225,8 @@ def plot_feature_importance(): ) def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): + import scikitplot + # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 label_list = sorted(list(set(y))) assert label_list[0] == 0, "Label values must being at '0'." 
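A short sketch of the label encoding that check expects; `np.unique` is just one way to
produce consecutive integer labels starting at 0 (the string labels are hypothetical):

    >>> import numpy as np
    >>> _, y = np.unique(np.array(["cat", "dog", "cat"]), return_inverse=True)
    >>> y
    array([0, 1, 0])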
@@ -328,6 +341,8 @@ def evaluate( evaluator_config, **kwargs, ): + import shap + shap.initjs() with TempDir() as temp_dir: X, y = dataset._extract_features_and_labels() From eae32d415991d9d46804e801f0f5248a74bd14c4 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 20:04:07 +0800 Subject: [PATCH 063/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 620a3de8e24fc..d9c5ca2711e70 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -285,9 +285,10 @@ def plot_roc_curve(): fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator" ).plot() - self._log_image_artifact( - artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name - ) + if hasattr(sk_metrics, 'RocCurveDisplay'): + self._log_image_artifact( + artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name + ) def plot_lift_curve(): scikitplot.metrics.plot_lift_curve(y, y_probs) @@ -297,11 +298,12 @@ def plot_lift_curve(): ) def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay.from_predictions(y, y_pred) + sk_metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix).plot() - self._log_image_artifact( - artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name - ) + if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): + self._log_image_artifact( + artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name + ) self._log_metrics(run_id, metrics, dataset_name, model) From a4b8b60f1d377ad4243b654721e97c3e7538899f Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 20:11:17 +0800 Subject: [PATCH 064/120] remove scikitplot dep Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 9 +- mlflow/models/evaluation/lift_curve.py | 153 ++++++++++++++++++ 2 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 mlflow/models/evaluation/lift_curve.py diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d9c5ca2711e70..8cedcfd13ee83 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -225,7 +225,7 @@ def plot_feature_importance(): ) def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): - import scikitplot + from mlflow.models.evaluation.lift_curve import plot_lift_curve # Note: require labels to be number of 0, 1, 2, .. 
num_classes - 1 label_list = sorted(list(set(y))) @@ -290,11 +290,10 @@ def plot_roc_curve(): artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name ) - def plot_lift_curve(): - scikitplot.metrics.plot_lift_curve(y, y_probs) - self._log_image_artifact( - artifacts, temp_dir, plot_lift_curve, run_id, "lift_curve", dataset_name + artifacts, temp_dir, + lambda: plot_lift_curve(y, y_probs), + run_id, "lift_curve", dataset_name ) def plot_confusion_matrix(): diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py new file mode 100644 index 0000000000000..71e11f9bbe885 --- /dev/null +++ b/mlflow/models/evaluation/lift_curve.py @@ -0,0 +1,153 @@ +import matplotlib.pyplot as plt + +import numpy as np + + +def cumulative_gain_curve(y_true, y_score, pos_label=None): + """This function generates the points necessary to plot the Cumulative Gain + + Note: This implementation is restricted to the binary classification task. + + Args: + y_true (array-like, shape (n_samples)): True labels of the data. + + y_score (array-like, shape (n_samples)): Target scores, can either be + probability estimates of the positive class, confidence values, or + non-thresholded measure of decisions (as returned by + decision_function on some classifiers). + + pos_label (int or str, default=None): Label considered as positive and + others are considered negative + + Returns: + percentages (numpy.ndarray): An array containing the X-axis values for + plotting the Cumulative Gains chart. + + gains (numpy.ndarray): An array containing the Y-axis values for one + curve of the Cumulative Gains chart. + + Raises: + ValueError: If `y_true` is not composed of 2 classes. The Cumulative + Gain Chart is only relevant in binary classification. + """ + y_true, y_score = np.asarray(y_true), np.asarray(y_score) + + # ensure binary classification if pos_label is not specified + classes = np.unique(y_true) + if (pos_label is None and + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1]))): + raise ValueError("Data is not binary and pos_label is not specified") + elif pos_label is None: + pos_label = 1. + + # make y_true a boolean vector + y_true = (y_true == pos_label) + + sorted_indices = np.argsort(y_score)[::-1] + y_true = y_true[sorted_indices] + gains = np.cumsum(y_true) + + percentages = np.arange(start=1, stop=len(y_true) + 1) + + gains = gains / float(np.sum(y_true)) + percentages = percentages / float(len(y_true)) + + gains = np.insert(gains, 0, [0]) + percentages = np.insert(percentages, 0, [0]) + + return percentages, gains + + +def plot_lift_curve(y_true, y_probas, title='Lift Curve', + ax=None, figsize=None, title_fontsize="large", + text_fontsize="medium"): + """Generates the Lift Curve from labels and scores/probabilities + + The lift curve is used to determine the effectiveness of a + binary classifier. A detailed explanation can be found at + http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html. + The implementation here works only for binary classification. + + Args: + y_true (array-like, shape (n_samples)): + Ground truth (correct) target values. + + y_probas (array-like, shape (n_samples, n_classes)): + Prediction probabilities for each class returned by a classifier. + + title (string, optional): Title of the generated plot. Defaults to + "Lift Curve". 
+ + ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to + plot the learning curve. If None, the plot is drawn on a new set of + axes. + + figsize (2-tuple, optional): Tuple denoting figure size of the plot + e.g. (6, 6). Defaults to ``None``. + + title_fontsize (string or int, optional): Matplotlib-style fontsizes. + Use e.g. "small", "medium", "large" or integer-values. Defaults to + "large". + + text_fontsize (string or int, optional): Matplotlib-style fontsizes. + Use e.g. "small", "medium", "large" or integer-values. Defaults to + "medium". + + Returns: + ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was + drawn. + + Example: + >>> lr = LogisticRegression() + >>> lr = lr.fit(X_train, y_train) + >>> y_probas = lr.predict_proba(X_test) + >>> plot_lift_curve(y_test, y_probas) + + >>> plt.show() + + .. image:: _static/examples/plot_lift_curve.png + :align: center + :alt: Lift Curve + """ + y_true = np.array(y_true) + y_probas = np.array(y_probas) + + classes = np.unique(y_true) + if len(classes) != 2: + raise ValueError('Cannot calculate Lift Curve for data with ' + '{} category/ies'.format(len(classes))) + + # Compute Cumulative Gain Curves + percentages, gains1 = cumulative_gain_curve(y_true, y_probas[:, 0], + classes[0]) + percentages, gains2 = cumulative_gain_curve(y_true, y_probas[:, 1], + classes[1]) + + percentages = percentages[1:] + gains1 = gains1[1:] + gains2 = gains2[1:] + + gains1 = gains1 / percentages + gains2 = gains2 / percentages + + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=figsize) + + ax.set_title(title, fontsize=title_fontsize) + + ax.plot(percentages, gains1, lw=3, label='Class {}'.format(classes[0])) + ax.plot(percentages, gains2, lw=3, label='Class {}'.format(classes[1])) + + ax.plot([0, 1], [1, 1], 'k--', lw=2, label='Baseline') + + ax.set_xlabel('Percentage of sample', fontsize=text_fontsize) + ax.set_ylabel('Lift', fontsize=text_fontsize) + ax.tick_params(labelsize=text_fontsize) + ax.grid('on') + ax.legend(loc='lower right', fontsize=text_fontsize) + + return ax From 1bc630b9b1cf547605c79520f21026ca09947a41 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 21:01:41 +0800 Subject: [PATCH 065/120] add pr curve Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 53 +++++++++++++------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 8cedcfd13ee83..084969e88d1c3 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -11,6 +11,7 @@ from sklearn import metrics as sk_metrics import math import pandas as pd +import numpy as np import time from functools import partial import logging @@ -104,25 +105,25 @@ def _log_metrics(self, run_id, metrics, dataset_name, model): ) def _log_image_artifact( - self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name + self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name, model ): import matplotlib.pyplot as pyplot client = mlflow.tracking.MlflowClient() pyplot.clf() do_plot() - artifact_file_name = f"{artifact_name}_on_{dataset_name}.png" + artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name, model) + '.png' artifact_file_local_path = temp_dir.path(artifact_file_name) pyplot.savefig(artifact_file_local_path) client.log_artifact(run_id, artifact_file_local_path) artifact = 
ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) artifact.load(artifact_file_local_path) - artifacts[artifact_file_name] = artifact + artifacts[artifact_name] = artifact def _log_pandas_df_artifact( - self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name + self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name, model ): client = mlflow.tracking.MlflowClient() - artifact_file_name = f"{artifact_name}_on_{dataset_name}.csv" + artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name, model) + '.csv' artifact_file_local_path = temp_dir.path(artifact_file_name) pandas_df.to_csv(artifact_file_local_path, index=False) client.log_artifact(run_id, artifact_file_local_path) @@ -131,7 +132,7 @@ def _log_pandas_df_artifact( content=pandas_df, ) artifact.load(artifact_file_local_path) - artifacts[artifact_file_name] = artifact + artifacts[artifact_name] = artifact def _log_model_explainability( self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config @@ -209,7 +210,7 @@ def plot_summary(): shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name + artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, model, ) def plot_feature_importance(): @@ -222,6 +223,7 @@ def plot_feature_importance(): run_id, "shap_feature_importance", dataset_name, + model, ) def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): @@ -266,34 +268,55 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name # TODO: # compute hinge loss, this requires calling decision_function of the model # e.g., see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function - if y_probs is not None: - metrics["roc_auc"] = sk_metrics.roc_auc_score(y, y_prob) fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_prob) roc_curve_pandas_df = pd.DataFrame( {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} ) self._log_pandas_df_artifact( - artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name + artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name, model, ) roc_auc = sk_metrics.auc(fpr, tpr) - metrics["precision_recall_auc"] = roc_auc + metrics["roc_auc"] = roc_auc def plot_roc_curve(): sk_metrics.RocCurveDisplay( - fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name="example estimator" + fpr=fpr, tpr=tpr, roc_auc=roc_auc, ).plot() if hasattr(sk_metrics, 'RocCurveDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name + artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name, model, + ) + + precision, recall, thresholds = sk_metrics.precision_recall_curve(y, y_prob) + thresholds = np.append(thresholds, [1.0], axis=0) + pr_curve_pandas_df = pd.DataFrame( + {"precision": precision, "recall": recall, "thresholds": thresholds} + ) + self._log_pandas_df_artifact( + artifacts, temp_dir, pr_curve_pandas_df, run_id, "precision_recall_curve", + dataset_name, model, + ) + + pr_auc = sk_metrics.auc(recall, precision) + metrics["precision_recall_auc"] = pr_auc + + def plot_pr_curve(): + sk_metrics.PrecisionRecallDisplay( + precision, recall, + ).plot() + + if hasattr(sk_metrics, 'PrecisionRecallDisplay'): + self._log_image_artifact( + artifacts, temp_dir, plot_pr_curve, run_id, 
"precision_recall_curve", dataset_name, model, ) self._log_image_artifact( artifacts, temp_dir, lambda: plot_lift_curve(y, y_probs), - run_id, "lift_curve", dataset_name + run_id, "lift_curve", dataset_name, model, ) def plot_confusion_matrix(): @@ -301,7 +324,7 @@ def plot_confusion_matrix(): if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name + artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name, model, ) self._log_metrics(run_id, metrics, dataset_name, model) From 06ce7e8ea62f79a82491d250f6790bab349be14e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 21:28:38 +0800 Subject: [PATCH 066/120] add shap.summary_plot Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 084969e88d1c3..6e8b4b2d824d5 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -213,6 +213,13 @@ def plot_summary(): artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, model, ) + def plot_summary_in_js(): + shap.summary_plot(shap_values, show=False) + + self._log_image_artifact( + artifacts, temp_dir, plot_summary_in_js, run_id, "shap_summary_in_js", dataset_name, model, + ) + def plot_feature_importance(): shap.plots.bar(shap_values, show=False) From 1016fa02ab79ff6812f43ecb71d78c42279902e2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 13 Dec 2021 22:48:52 +0800 Subject: [PATCH 067/120] log explainer Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 +- mlflow/models/evaluation/default_evaluator.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 05446a6f4d609..49ae184026e4e 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -481,7 +481,7 @@ def _start_run_or_reuse_active_run(run_id): else: if run_id and active_run.info.run_id != run_id: raise ValueError( - "An active run exists, you cannot specify another run_id when " "evaluating." + "An active run exists, you cannot specify another run_id when evaluating." ) yield active_run.info.run_id diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 6e8b4b2d824d5..a987c60c7cac2 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -139,6 +139,7 @@ def _log_model_explainability( ): import shap import shap.maskers + import matplotlib.pyplot as pyplot sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) algorithm = evaluator_config.get('explainality_algorithm', None) @@ -206,7 +207,14 @@ def _log_model_explainability( _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') + # TODO: seems infer pip req fail when log_explainer. 
+ mlflow.shap.log_explainer( + explainer, + artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name, model) + ) + def plot_summary(): + pyplot.subplots_adjust(left=0.4) shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( @@ -214,6 +222,7 @@ def plot_summary(): ) def plot_summary_in_js(): + pyplot.subplots_adjust(left=0.4) shap.summary_plot(shap_values, show=False) self._log_image_artifact( @@ -221,6 +230,7 @@ def plot_summary_in_js(): ) def plot_feature_importance(): + pyplot.subplots_adjust(left=0.4) shap.plots.bar(shap_values, show=False) self._log_image_artifact( From c0550605b841dfb57cba9755be49527a3f792d83 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 14 Dec 2021 22:38:16 +0800 Subject: [PATCH 068/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 71 +++++++++++++++++--------- mlflow/models/evaluation/lift_curve.py | 20 +++++--- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 49ae184026e4e..fc1b373e52e71 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -530,6 +530,48 @@ def _normalize_evaluators_and_evaluator_config_args( return evaluator_name_list, evaluator_name_to_conf_map +def _evaluate( + model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map +): + """ + This method is the patch point for databricks instrumentation. + The public API "evaluate" will verify argument first, and then pass normalized arguments + to the _evaluate method. + """ + # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + client = mlflow.tracking.MlflowClient() + dataset._log_dataset_tag(client, actual_run_id) + + eval_results = [] + for evaluator_name in evaluator_name_list: + config = evaluator_name_to_conf_map.get(evaluator_name) or {} + try: + evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) + except MlflowException: + _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") + continue + + if evaluator.can_evaluate(model_type, config): + _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") + result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) + eval_results.append(result) + + if len(eval_results) == 0: + raise ValueError( + "The model could not be evaluated by any of the registered evaluators, please " + "check the model type and other configs are set correctly." + ) + + merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) + for eval_result in eval_results: + merged_eval_result.metrics.update(eval_result.metrics) + merged_eval_result.artifacts.update(eval_result.artifacts) + + return merged_eval_result + + @experimental def evaluate( model: Union[str, "mlflow.pyfunc.PyFuncModel"], @@ -576,8 +618,6 @@ def evaluate( - explainability_nsamples: The sample rows for calculating model explainability. Default value is 2000. 
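    A minimal call sketch (the model URI, data variables and config values are
    hypothetical; only config keys documented above are used):

    >>> dataset = EvaluationDataset(data=X_eval, labels=y_eval, name="eval_ds")
    >>> result = evaluate(
    ...     "models:/my_model/1", "classifier", dataset, evaluators="default",
    ...     evaluator_config={"log_model_explainability": True,
    ...                       "explainability_nsamples": 1000},
    ... )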
""" - # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing - from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry from mlflow.pyfunc import PyFuncModel if isinstance(model, str): @@ -594,26 +634,7 @@ def evaluate( _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) with _start_run_or_reuse_active_run(run_id) as actual_run_id: - client = mlflow.tracking.MlflowClient() - dataset._log_dataset_tag(client, actual_run_id) - - eval_results = [] - for evaluator_name in evaluator_name_list: - config = evaluator_name_to_conf_map.get(evaluator_name) or {} - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") - continue - - if evaluator.can_evaluate(model_type, config): - _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") - result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result + return _evaluate( + model, model_type, dataset, actual_run_id, + evaluator_name_list, evaluator_name_to_conf_map + ) diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py index 71e11f9bbe885..9fd7485f25450 100644 --- a/mlflow/models/evaluation/lift_curve.py +++ b/mlflow/models/evaluation/lift_curve.py @@ -3,8 +3,11 @@ import numpy as np -def cumulative_gain_curve(y_true, y_score, pos_label=None): - """This function generates the points necessary to plot the Cumulative Gain +def _cumulative_gain_curve(y_true, y_score, pos_label=None): + """ + This method is copied from scikit-plot package. + + This function generates the points necessary to plot the Cumulative Gain Note: This implementation is restricted to the binary classification task. @@ -65,7 +68,10 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None): def plot_lift_curve(y_true, y_probas, title='Lift Curve', ax=None, figsize=None, title_fontsize="large", text_fontsize="medium"): - """Generates the Lift Curve from labels and scores/probabilities + """ + This method is copied from scikit-plot package. + + Generates the Lift Curve from labels and scores/probabilities The lift curve is used to determine the effectiveness of a binary classifier. 
A detailed explanation can be found at @@ -122,10 +128,10 @@ def plot_lift_curve(y_true, y_probas, title='Lift Curve', '{} category/ies'.format(len(classes))) # Compute Cumulative Gain Curves - percentages, gains1 = cumulative_gain_curve(y_true, y_probas[:, 0], - classes[0]) - percentages, gains2 = cumulative_gain_curve(y_true, y_probas[:, 1], - classes[1]) + percentages, gains1 = _cumulative_gain_curve(y_true, y_probas[:, 0], + classes[0]) + percentages, gains2 = _cumulative_gain_curve(y_true, y_probas[:, 1], + classes[1]) percentages = percentages[1:] gains1 = gains1[1:] From 849656d3e94c2296eff70063e2ba408f85cab3d2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 14 Dec 2021 23:32:54 +0800 Subject: [PATCH 069/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 13 +++-- mlflow/models/evaluation/default_evaluator.py | 56 ++++++++++--------- mlflow/utils/string_utils.py | 10 ++++ 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index fc1b373e52e71..856a3c8b94a51 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -388,7 +388,7 @@ def _metadata(self): metadata["path"] = self.path return metadata - def _log_dataset_tag(self, client, run_id): + def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will append current dataset metadata into existing tag content. @@ -399,10 +399,14 @@ def _log_dataset_tag(self, client, run_id): dataset_metadata_list = json.loads(existing_dataset_metadata_str) for metadata in dataset_metadata_list: - if metadata["hash"] == self.hash and metadata["name"] == self._user_specified_name: + if metadata["hash"] == self.hash and \ + metadata["name"] == self.name and \ + metadata["model"] == model_uuid: break else: - dataset_metadata_list.append(self._metadata) + new_metadata = self._metadata + new_metadata["model"] = model_uuid + dataset_metadata_list.append(new_metadata) dataset_metadata_str = json.dumps(dataset_metadata_list, separators=(",", ":")) client.log_batch( @@ -542,7 +546,8 @@ def _evaluate( from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry client = mlflow.tracking.MlflowClient() - dataset._log_dataset_tag(client, actual_run_id) + model_uuid = model.metadata.model_uuid + dataset._log_dataset_tag(client, actual_run_id, model_uuid) eval_results = [] for evaluator_name in evaluator_name_list: diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index a987c60c7cac2..3f05a61c832c5 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -8,6 +8,8 @@ from mlflow.entities.metric import Metric from mlflow.utils.file_utils import TempDir from mlflow.tracking.artifact_utils import get_artifact_uri +from mlflow.utils.string_utils import truncate_str_from_middle + from sklearn import metrics as sk_metrics import math import pandas as pd @@ -83,13 +85,10 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @staticmethod - def _gen_log_key(key, dataset_name, model): - if hasattr(model.metadata, 'model_uuid'): - return f'{key}_on_data_{dataset_name}_model_{model.metadata.model_uuid}' - else: - return f'{key}_on_data_{dataset_name}' + def _gen_log_key(key, dataset_name): + return f'{key}_on_data_{dataset_name}' - def 
_log_metrics(self, run_id, metrics, dataset_name, model): + def _log_metrics(self, run_id, metrics, dataset_name): """ Helper method to log metrics into specified run. """ @@ -98,20 +97,20 @@ def _log_metrics(self, run_id, metrics, dataset_name, model): client.log_batch( run_id, metrics=[ - Metric(key=DefaultEvaluator._gen_log_key(key, dataset_name, model), + Metric(key=DefaultEvaluator._gen_log_key(key, dataset_name), value=value, timestamp=timestamp, step=0) for key, value in metrics.items() ], ) def _log_image_artifact( - self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name, model + self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name, ): import matplotlib.pyplot as pyplot client = mlflow.tracking.MlflowClient() pyplot.clf() do_plot() - artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name, model) + '.png' + artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name) + '.png' artifact_file_local_path = temp_dir.path(artifact_file_name) pyplot.savefig(artifact_file_local_path) client.log_artifact(run_id, artifact_file_local_path) @@ -123,7 +122,7 @@ def _log_pandas_df_artifact( self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name, model ): client = mlflow.tracking.MlflowClient() - artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name, model) + '.csv' + artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name) + '.csv' artifact_file_local_path = temp_dir.path(artifact_file_name) pandas_df.to_csv(artifact_file_local_path, index=False) client.log_artifact(run_id, artifact_file_local_path) @@ -173,35 +172,41 @@ def _log_model_explainability( except ImportError: pass + # TODO: alias truncated name if duplicated + truncated_feature_names = [truncate_str_from_middle(f, 20) for f in feature_names] + truncated_feature_name_map = {f: f2 for f, f2 in zip(feature_names, truncated_feature_names)} + if isinstance(X, pd.DataFrame): + X = X.rename(columns=truncated_feature_name_map) + sampled_X = shap.sample(X, sample_rows) if algorithm: explainer = shap.Explainer( - predict_fn, sampled_X, feature_names=feature_names, algorithm=algorithm + predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm ) shap_values = explainer(sampled_X) else: maskers = shap.maskers.Independent(sampled_X) if raw_model and shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Linear( - raw_model, maskers, feature_names=feature_names + raw_model, maskers, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) elif raw_model and shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Tree( - raw_model, maskers, feature_names=feature_names + raw_model, maskers, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) elif raw_model and shap.explainers.Additive.supports_model_with_masker( raw_model, maskers ): explainer = shap.explainers.Additive( - raw_model, maskers, feature_names=feature_names + raw_model, maskers, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) else: # fallback to default sampling explainer explainer = shap.explainers.Sampling( - predict_fn, X, feature_names=feature_names + predict_fn, X, feature_names=truncated_feature_names ) shap_values = explainer(X, sample_rows) @@ -210,7 +215,7 @@ def _log_model_explainability( # TODO: seems infer pip req fail when log_explainer. 
mlflow.shap.log_explainer( explainer, - artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name, model) + artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) ) def plot_summary(): @@ -218,7 +223,7 @@ def plot_summary(): shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, model, + artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, ) def plot_summary_in_js(): @@ -226,7 +231,7 @@ def plot_summary_in_js(): shap.summary_plot(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary_in_js, run_id, "shap_summary_in_js", dataset_name, model, + artifacts, temp_dir, plot_summary_in_js, run_id, "shap_summary_in_js", dataset_name, ) def plot_feature_importance(): @@ -240,7 +245,6 @@ def plot_feature_importance(): run_id, "shap_feature_importance", dataset_name, - model, ) def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): @@ -251,6 +255,7 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name assert label_list[0] == 0, "Label values must being at '0'." num_classes = len(label_list) + # TODO: for xgb disable feature names check y_pred = model.predict(X) is_binomial = num_classes <= 2 @@ -266,6 +271,7 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name if is_binomial: if hasattr(model, "predict_proba"): + # TODO: for xgb disable feature names check y_probs = model.predict_proba(X) y_prob = y_probs[:, 1] else: @@ -304,7 +310,7 @@ def plot_roc_curve(): if hasattr(sk_metrics, 'RocCurveDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name, model, + artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name, ) precision, recall, thresholds = sk_metrics.precision_recall_curve(y, y_prob) @@ -327,13 +333,13 @@ def plot_pr_curve(): if hasattr(sk_metrics, 'PrecisionRecallDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_pr_curve, run_id, "precision_recall_curve", dataset_name, model, + artifacts, temp_dir, plot_pr_curve, run_id, "precision_recall_curve", dataset_name, ) self._log_image_artifact( artifacts, temp_dir, lambda: plot_lift_curve(y, y_probs), - run_id, "lift_curve", dataset_name, model, + run_id, "lift_curve", dataset_name, ) def plot_confusion_matrix(): @@ -341,10 +347,10 @@ def plot_confusion_matrix(): if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name, model, + artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name, ) - self._log_metrics(run_id, metrics, dataset_name, model) + self._log_metrics(run_id, metrics, dataset_name) if evaluator_config.get('log_model_explainability', True): self._log_model_explainability( @@ -370,7 +376,7 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names self._log_model_explainability( artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config ) - self._log_metrics(run_id, metrics, dataset_name, model) + self._log_metrics(run_id, metrics, dataset_name) return EvaluationResult(metrics, artifacts) def evaluate( diff --git a/mlflow/utils/string_utils.py b/mlflow/utils/string_utils.py index e3b1d0b911301..f455edbd331e8 100644 --- a/mlflow/utils/string_utils.py +++ b/mlflow/utils/string_utils.py @@ -12,3 
+12,13 @@ def strip_suffix(original, suffix): def is_string_type(item): return isinstance(item, str) + + +def truncate_str_from_middle(s, max_length): + assert max_length > 3 + if len(s) <= max_length: + return s + else: + left_part_len = (max_length - 3) // 2 + right_part_len = max_length - 3 - left_part_len + return f"{s[:left_part_len]}...{s[-right_part_len:]}" From 1705fbce84dcc107fc6217f95282c190411ad84b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 15 Dec 2021 22:27:55 +0800 Subject: [PATCH 070/120] improve explainer code Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 71 +++++++++++-------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 3f05a61c832c5..24207a3ca2212 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -134,14 +134,15 @@ def _log_pandas_df_artifact( artifacts[artifact_name] = artifact def _log_model_explainability( - self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config + self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + is_binomial_classifier, ): import shap import shap.maskers import matplotlib.pyplot as pyplot sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) - algorithm = evaluator_config.get('explainality_algorithm', None) + algorithm = evaluator_config.get('explainability_algorithm', None) model_loader_module = model.metadata.flavors['python_function']["loader_module"] @@ -176,22 +177,30 @@ def _log_model_explainability( truncated_feature_names = [truncate_str_from_middle(f, 20) for f in feature_names] truncated_feature_name_map = {f: f2 for f, f2 in zip(feature_names, truncated_feature_names)} if isinstance(X, pd.DataFrame): - X = X.rename(columns=truncated_feature_name_map) + X = X.rename(columns=truncated_feature_name_map, copy=False) sampled_X = shap.sample(X, sample_rows) if algorithm: - explainer = shap.Explainer( - predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm - ) - shap_values = explainer(sampled_X) + if algorithm == 'sampling': + explainer = shap.explainers.Sampling( + predict_fn, X, feature_names=truncated_feature_names + ) + shap_values = explainer(X, sample_rows) + else: + explainer = shap.Explainer( + predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm + ) + shap_values = explainer(sampled_X) else: maskers = shap.maskers.Independent(sampled_X) - if raw_model and shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): + if raw_model and is_binomial_classifier and \ + shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Linear( raw_model, maskers, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) - elif raw_model and shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): + elif raw_model and is_binomial_classifier and \ + shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Tree( raw_model, maskers, feature_names=truncated_feature_names ) @@ -204,38 +213,39 @@ def _log_model_explainability( ) shap_values = explainer(sampled_X) else: - # fallback to default sampling explainer - explainer = shap.explainers.Sampling( - predict_fn, X, feature_names=truncated_feature_names + # fallback to default explainer + explainer 
= shap.Explainer( + predict_fn, sampled_X, feature_names=truncated_feature_names ) - shap_values = explainer(X, sample_rows) + shap_values = explainer(sampled_X) _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') # TODO: seems infer pip req fail when log_explainer. - mlflow.shap.log_explainer( - explainer, - artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) - ) - def plot_summary(): - pyplot.subplots_adjust(left=0.4) + #mlflow.shap.log_explainer( + # explainer, + # artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) + #) + + def plot_beeswarm(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, + artifacts, temp_dir, plot_beeswarm, run_id, "shap_beeswarm", dataset_name, ) - def plot_summary_in_js(): - pyplot.subplots_adjust(left=0.4) + def plot_summary(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) shap.summary_plot(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary_in_js, run_id, "shap_summary_in_js", dataset_name, + artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, ) def plot_feature_importance(): - pyplot.subplots_adjust(left=0.4) + pyplot.subplots_adjust(bottom=0.2, left=0.4) shap.plots.bar(shap_values, show=False) self._log_image_artifact( @@ -255,7 +265,6 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name assert label_list[0] == 0, "Label values must being at '0'." num_classes = len(label_list) - # TODO: for xgb disable feature names check y_pred = model.predict(X) is_binomial = num_classes <= 2 @@ -354,7 +363,8 @@ def plot_confusion_matrix(): if evaluator_config.get('log_model_explainability', True): self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + is_binomial_classifier=(num_classes <= 2) ) return EvaluationResult(metrics, artifacts) @@ -373,9 +383,12 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names metrics['max_error'] = sk_metrics.max_error(y, y_pred) metrics['mean_absolute_percentage_error'] = \ sk_metrics.mean_absolute_percentage_error(y, y_pred) - self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config - ) + + if evaluator_config.get('log_model_explainability', True): + self._log_model_explainability( + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + is_binomial_classifier=False, + ) self._log_metrics(run_id, metrics, dataset_name) return EvaluationResult(metrics, artifacts) From 721b824969d7738268909ec4d9353220d34227df Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 15 Dec 2021 23:04:35 +0800 Subject: [PATCH 071/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 7 ++-- mlflow/models/evaluation/default_evaluator.py | 41 +++++++++++-------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 856a3c8b94a51..746b5706e2adb 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -614,10 +614,9 @@ def evaluate( :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing evaluation results. 
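    Given a ``result`` returned by this function, a small consumption sketch (the metric
    and artifact keys below are examples of names the default evaluator logs in this
    change):

    >>> result.metrics["roc_auc"]
    >>> result.artifacts["lift_curve"].content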
- The default evaluator support 'regressor' and 'classifer' type model, the config item for - default evaluator includes: - - log_model_explainability: A boolean value representing whether to log model explainability. - Default value is True. + The default evaluator supports the 'regressor' and 'classifer' `model_type`s. The available + `evaluator_config` options for the default evaluator include: + - log_model_explainability: The number of rows to use for calculating model explainability. - explainality_algorithm: A string to specify the shap explainer algorithm. If not set, it will choose the best fit explainer according to the model. - explainability_nsamples: The sample rows for calculating model explainability. diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 24207a3ca2212..2e307ba10808f 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -135,8 +135,11 @@ def _log_pandas_df_artifact( def _log_model_explainability( self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, - is_binomial_classifier, + is_multinomial_classifier, ): + if not evaluator_config.get('log_model_explainability', True): + return + import shap import shap.maskers import matplotlib.pyplot as pyplot @@ -193,13 +196,13 @@ def _log_model_explainability( shap_values = explainer(sampled_X) else: maskers = shap.maskers.Independent(sampled_X) - if raw_model and is_binomial_classifier and \ + if raw_model and not is_multinomial_classifier and \ shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Linear( raw_model, maskers, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) - elif raw_model and is_binomial_classifier and \ + elif raw_model and not is_multinomial_classifier and \ shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): explainer = shap.explainers.Tree( raw_model, maskers, feature_names=truncated_feature_names @@ -222,11 +225,15 @@ def _log_model_explainability( _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') # TODO: seems infer pip req fail when log_explainer. - - #mlflow.shap.log_explainer( - # explainer, - # artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) - #) + # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", + # then fallback to shap explainer saver, and shap explainer will call `model.save` + # for sklearn model, there is no `.save` method, so error will happen. 
+ """ + mlflow.shap.log_explainer( + explainer, + artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) + ) + """ def plot_beeswarm(): pyplot.subplots_adjust(bottom=0.2, left=0.4) @@ -361,11 +368,10 @@ def plot_confusion_matrix(): self._log_metrics(run_id, metrics, dataset_name) - if evaluator_config.get('log_model_explainability', True): - self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, - is_binomial_classifier=(num_classes <= 2) - ) + self._log_model_explainability( + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + is_multinomial_classifier=(num_classes > 2) + ) return EvaluationResult(metrics, artifacts) @@ -384,11 +390,10 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names metrics['mean_absolute_percentage_error'] = \ sk_metrics.mean_absolute_percentage_error(y, y_pred) - if evaluator_config.get('log_model_explainability', True): - self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, - is_binomial_classifier=False, - ) + self._log_model_explainability( + artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + is_multinomial_classifier=False, + ) self._log_metrics(run_id, metrics, dataset_name) return EvaluationResult(metrics, artifacts) From ebd495f9db46e943efe17b1feb6d7f2961cd473b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 15 Dec 2021 23:31:18 +0800 Subject: [PATCH 072/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 +- mlflow/models/evaluation/default_evaluator.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 746b5706e2adb..cc5c2a7895610 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -566,7 +566,7 @@ def _evaluate( if len(eval_results) == 0: raise ValueError( "The model could not be evaluated by any of the registered evaluators, please " - "check the model type and other configs are set correctly." + "verify that the model type and other configs are set correctly." ) merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 2e307ba10808f..da21433517930 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -228,6 +228,7 @@ def _log_model_explainability( # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", # then fallback to shap explainer saver, and shap explainer will call `model.save` # for sklearn model, there is no `.save` method, so error will happen. 
+ """ mlflow.shap.log_explainer( explainer, From a3c56696fad238f70f0824279d0c0e998ed7541e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 16 Dec 2021 10:12:17 +0800 Subject: [PATCH 073/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +++- mlflow/models/evaluation/default_evaluator.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index cc5c2a7895610..3711cdb57941a 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -230,7 +230,7 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): raise ValueError('feature name list must be the same length with feature data.') self._feature_names = feature_names else: - self._feature_names = [f'f{i}' for i in range(num_features)] + self._feature_names = [f'f_{i}' for i in range(num_features)] else: pd_column_names = [c for c in self.data.columns if c != self.labels] if feature_names is not None: @@ -499,6 +499,8 @@ def _normalize_evaluators_and_evaluator_config_args( if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) if evaluator_config is not None: + if evaluator_name_list == ['default']: + raise ValueError( 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' ) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index da21433517930..d430673833915 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -176,8 +176,12 @@ def _log_model_explainability( except ImportError: pass - # TODO: alias truncated name if duplicated truncated_feature_names = [truncate_str_from_middle(f, 20) for f in feature_names] + for i, truncated_name in enumerate(truncated_feature_names): + if truncated_name != feature_names[i]: + # For truncated name, attach "(f_{feature_index})" at the end + truncated_feature_names[i] = f'{truncated_name}(f_{i})' + truncated_feature_name_map = {f: f2 for f, f2 in zip(feature_names, truncated_feature_names)} if isinstance(X, pd.DataFrame): X = X.rename(columns=truncated_feature_name_map, copy=False) From 09364849c70a08ca8401b5c69527d4ce92a37bd9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 16 Dec 2021 11:02:23 +0800 Subject: [PATCH 074/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 -- mlflow/models/evaluation/default_evaluator.py | 5 ++--- mlflow/pyfunc/__init__.py | 3 +++ mlflow/shap.py | 9 ++++++++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 3711cdb57941a..1c93eca856b75 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -499,8 +499,6 @@ def _normalize_evaluators_and_evaluator_config_args( if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) if evaluator_config is not None: - if evaluator_name_list == ['default']: - raise ValueError( 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' 
) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d430673833915..3da749943ee89 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -233,12 +233,11 @@ def _log_model_explainability( # then fallback to shap explainer saver, and shap explainer will call `model.save` # for sklearn model, there is no `.save` method, so error will happen. - """ mlflow.shap.log_explainer( explainer, artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) ) - """ + def plot_beeswarm(): pyplot.subplots_adjust(bottom=0.2, left=0.4) @@ -291,7 +290,7 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name # [P0] Mean: Computes the (weighted) mean of the given values. if is_binomial: - if hasattr(model, "predict_proba"): + if model.support_predict_proba(): # TODO: for xgb disable feature names check y_probs = model.predict_proba(X) y_prob = y_probs[:, 1] diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index a5d772a9d54fd..f56a2e19a0131 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -604,6 +604,9 @@ def predict(self, data: PyFuncInput) -> PyFuncOutput: data = _enforce_schema(data, input_schema) return self._model_impl.predict(data) + def support_predict_proba(self): + return False + @property def metadata(self): """Model metadata.""" diff --git a/mlflow/shap.py b/mlflow/shap.py index 00515eaf7c6f8..286446bf5d733 100644 --- a/mlflow/shap.py +++ b/mlflow/shap.py @@ -449,7 +449,14 @@ def save_explainer( if serialize_model_using_mlflow and serializable_by_mlflow: explainer.save(explainer_output_file_handle, model_saver=False) else: - explainer.save(explainer_output_file_handle) + try: + explainer.save(explainer_output_file_handle) + except Exception: + # Some model may not support explainer model_saver, + # in this case, fallback to save explainer without saving model. + explainer_output_file_handle.seek(0) + explainer_output_file_handle.truncate(0) + explainer.save(explainer_output_file_handle, model_saver=False) pyfunc.add_to_model( mlflow_model, From 20f5d12889c4f66adcc83e061805c59f24c96b70 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 16 Dec 2021 21:09:18 +0800 Subject: [PATCH 075/120] update shap init Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 3da749943ee89..bf17e4102f7d0 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -80,6 +80,9 @@ def _load_content_from_file(self, local_artifact_path): _DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 +_shap_initialized = False + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -140,8 +143,18 @@ def _log_model_explainability( if not evaluator_config.get('log_model_explainability', True): return - import shap - import shap.maskers + try: + global _shap_initialized + import shap + import shap.maskers + + if not _shap_initialized: + shap.initjs() + _shap_initialized = True + except ImportError: + _logger.warning('Shap package is not installed. 
Skip log model explainability.') + return + import matplotlib.pyplot as pyplot sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) @@ -410,9 +423,6 @@ def evaluate( evaluator_config, **kwargs, ): - import shap - - shap.initjs() with TempDir() as temp_dir: X, y = dataset._extract_features_and_labels() if model_type == "classifier": From a55dcd885638771a7d039b13967eaa1c0f14a6c3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 16 Dec 2021 23:19:25 +0800 Subject: [PATCH 076/120] update explainer creating Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 34 ++++++------------- mlflow/pyfunc/__init__.py | 3 -- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index bf17e4102f7d0..ab079081e3852 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -120,6 +120,7 @@ def _log_image_artifact( artifact = ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) artifact.load(artifact_file_local_path) artifacts[artifact_name] = artifact + pyplot.close(pyplot.gcf()) def _log_pandas_df_artifact( self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name, model @@ -212,25 +213,11 @@ def _log_model_explainability( ) shap_values = explainer(sampled_X) else: - maskers = shap.maskers.Independent(sampled_X) - if raw_model and not is_multinomial_classifier and \ - shap.explainers.Linear.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Linear( - raw_model, maskers, feature_names=truncated_feature_names - ) - shap_values = explainer(sampled_X) - elif raw_model and not is_multinomial_classifier and \ - shap.explainers.Tree.supports_model_with_masker(raw_model, maskers): - explainer = shap.explainers.Tree( - raw_model, maskers, feature_names=truncated_feature_names - ) - shap_values = explainer(sampled_X) - elif raw_model and shap.explainers.Additive.supports_model_with_masker( - raw_model, maskers - ): - explainer = shap.explainers.Additive( - raw_model, maskers, feature_names=truncated_feature_names - ) + if raw_model and not is_multinomial_classifier: + # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for + # raw model, this case shap plot doesn't support it well, so exclude the + # multinomial_classifier case here. + explainer = shap.Explainer(raw_model, sampled_X, feature_names=truncated_feature_names) shap_values = explainer(sampled_X) else: # fallback to default explainer @@ -246,11 +233,10 @@ def _log_model_explainability( # then fallback to shap explainer saver, and shap explainer will call `model.save` # for sklearn model, there is no `.save` method, so error will happen. 
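# Illustrative sketch (not part of the patch): the guarded-logging pattern this series
# eventually settles on -- try to log the SHAP explainer and fall back to a warning
# instead of failing the whole evaluation when explainer serialization breaks
# (e.g. when the underlying model flavor cannot be resolved and the explainer saver
# ends up calling `model.save`, which sklearn models do not implement).
import logging

import mlflow

_logger = logging.getLogger(__name__)


def _log_explainer_best_effort(explainer, artifact_path):
    # Explainer logging is treated as best-effort; evaluation metrics and plots are
    # still produced even if this step fails.
    try:
        mlflow.shap.log_explainer(explainer, artifact_path=artifact_path)
    except Exception as e:
        _logger.warning(f"Log explainer failed. Reason: {str(e)}")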
- mlflow.shap.log_explainer( - explainer, - artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) - ) - + # mlflow.shap.log_explainer( + # explainer, + # artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) + # ) def plot_beeswarm(): pyplot.subplots_adjust(bottom=0.2, left=0.4) diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index f56a2e19a0131..a5d772a9d54fd 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -604,9 +604,6 @@ def predict(self, data: PyFuncInput) -> PyFuncOutput: data = _enforce_schema(data, input_schema) return self._model_impl.predict(data) - def support_predict_proba(self): - return False - @property def metadata(self): """Model metadata.""" From 4b232095937094b80897118ce4ef1c7ddd1009b9 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 16 Dec 2021 23:28:20 +0800 Subject: [PATCH 077/120] update predict_proba Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index ab079081e3852..7ea2a1eb6fa87 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -137,33 +137,10 @@ def _log_pandas_df_artifact( artifact.load(artifact_file_local_path) artifacts[artifact_name] = artifact - def _log_model_explainability( - self, artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, - is_multinomial_classifier, - ): - if not evaluator_config.get('log_model_explainability', True): - return - - try: - global _shap_initialized - import shap - import shap.maskers - - if not _shap_initialized: - shap.initjs() - _shap_initialized = True - except ImportError: - _logger.warning('Shap package is not installed. Skip log model explainability.') - return - - import matplotlib.pyplot as pyplot - - sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) - algorithm = evaluator_config.get('explainability_algorithm', None) - + def _extract_raw_model_and_predict_fn(self, model): model_loader_module = model.metadata.flavors['python_function']["loader_module"] - predict_fn = model.predict + predict_proba_fn = None try: if model_loader_module == 'mlflow.sklearn': @@ -181,15 +158,44 @@ def _log_model_explainability( if raw_model: predict_fn = raw_model.predict + predict_proba_fn = getattr(raw_model, 'predict_proba', None) + try: import xgboost if isinstance(raw_model, xgboost.XGBModel): # Because shap evaluation will pass evaluation data in ndarray format # (without feature names), if set validate_features=True it will raise error. predict_fn = partial(predict_fn, validate_features=False) + predict_proba_fn = partial(predict_proba_fn, validate_features=False) except ImportError: pass + return raw_model, predict_fn, predict_proba_fn + + def _log_model_explainability( + self, artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, + is_multinomial_classifier, + ): + if not evaluator_config.get('log_model_explainability', True): + return + + try: + global _shap_initialized + import shap + import shap.maskers + + if not _shap_initialized: + shap.initjs() + _shap_initialized = True + except ImportError: + _logger.warning('Shap package is not installed. 
Skip log model explainability.') + return + + import matplotlib.pyplot as pyplot + + sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) + algorithm = evaluator_config.get('explainability_algorithm', None) + truncated_feature_names = [truncate_str_from_middle(f, 20) for f in feature_names] for i, truncated_name in enumerate(truncated_feature_names): if truncated_name != feature_names[i]: @@ -284,12 +290,12 @@ def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_name metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) metrics["example_count"] = len(X) - # TODO: sum/mean on what data ? - # [P0] Sum: Computes the (weighted) sum of the given values. - # [P0] Mean: Computes the (weighted) mean of the given values. + raw_model, predict_fn, predict_proba_fn = self._extract_raw_model_and_predict_fn( + model + ) if is_binomial: - if model.support_predict_proba(): + if predict_proba_fn is not None: # TODO: for xgb disable feature names check y_probs = model.predict_proba(X) y_prob = y_probs[:, 1] @@ -372,7 +378,7 @@ def plot_confusion_matrix(): self._log_metrics(run_id, metrics, dataset_name) self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, is_multinomial_classifier=(num_classes > 2) ) @@ -393,8 +399,12 @@ def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names metrics['mean_absolute_percentage_error'] = \ sk_metrics.mean_absolute_percentage_error(y, y_pred) + raw_model, predict_fn, _ = self._extract_raw_model_and_predict_fn( + model + ) + self._log_model_explainability( - artifacts, temp_dir, model, X, dataset_name, feature_names, run_id, evaluator_config, + artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, is_multinomial_classifier=False, ) self._log_metrics(run_id, metrics, dataset_name) From 2471afb4558024d392c0d2ca6d7e059841b02036 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 17 Dec 2021 17:17:42 +0800 Subject: [PATCH 078/120] address comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 40 ++++++++++++++----- mlflow/models/evaluation/default_evaluator.py | 38 +++++++++++++----- mlflow/shap.py | 9 +---- mlflow/utils/string_utils.py | 2 +- 4 files changed, 62 insertions(+), 27 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1c93eca856b75..a368aa94528bf 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -496,13 +496,39 @@ def _normalize_evaluators_and_evaluator_config_args( ): from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map): + return isinstance(_evaluator_name_to_conf_map, dict) and \ + all(k in _evaluator_name_list and isinstance(v, dict) + for k, v in _evaluator_name_to_conf_map.items()) + if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) + if len(evaluator_name_list) > 1: + print(f'Hint: Multiple registered evaluators are found {evaluator_name_list} and ' + 'they will all be used in evaluation. 
If you want to evaluate with one ' + 'evaluator, specify the `evaluator` argument and (optional) specify the ' + '`evaluator_config` argument.') if evaluator_config is not None: - raise ValueError( - 'If `evaluators` argument is None, `evaluator_config` argument must be None too.' + conf_dict_value_error = ValueError( + "If `evaluators` argument is None, all registered evaluators will be used, " + "if only default evaluator available, the `evaluator_config` argument can be " + "config dict for default evaluator, otherwise the `evaluator_config` argument " + "must be a dict contains mapping from evaluator name to individual " + "evaluator config dict." ) - evaluator_name_to_conf_map = {} + if evaluator_name_list == ['default']: + if not isinstance(evaluator_config, dict): + raise conf_dict_value_error + elif 'default' not in evaluator_config: + evaluator_name_to_conf_map = {'default': evaluator_config} + else: + evaluator_name_to_conf_map = evaluator_config + else: + if not check_nesting_config_dict(evaluator_name_list, evaluator_config): + raise conf_dict_value_error + evaluator_name_to_conf_map = evaluator_config + else: + evaluator_name_to_conf_map = {} elif isinstance(evaluators, str): if not (evaluator_config is None or isinstance(evaluator_config, dict)): raise ValueError( @@ -510,14 +536,10 @@ def _normalize_evaluators_and_evaluator_config_args( "None or a dict containing config items for the evaluator." ) evaluator_name_list = [evaluators] - evaluator_name_to_conf_map = {evaluators[0]: evaluator_config} + evaluator_name_to_conf_map = {evaluators: evaluator_config} elif isinstance(evaluators, list): if evaluator_config is not None: - if not ( - isinstance(evaluator_config, dict) - and all(k in evaluators and isinstance(v, dict) - for k, v in evaluator_config.items()) - ): + if not check_nesting_config_dict(evaluators, evaluator_config): raise ValueError( "If `evaluators` argument is a evaluator name list, evaluator_config " "must be a dict contains mapping from evaluator name to individual " diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 7ea2a1eb6fa87..140a8e1324976 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -83,6 +83,16 @@ def _load_content_from_file(self, local_artifact_path): _shap_initialized = False +def _infer_model_type_by_labels(labels): + distinct_labels = set(labels) + for v in distinct_labels: + if v < 0 or not float(v).is_integer(): + return "regressor" + if len(distinct_labels) > 1000 and len(distinct_labels) / len(labels) > 0.7: + return "regressor" + return "classifier" + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -234,15 +244,16 @@ def _log_model_explainability( _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') - # TODO: seems infer pip req fail when log_explainer. - # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", - # then fallback to shap explainer saver, and shap explainer will call `model.save` - # for sklearn model, there is no `.save` method, so error will happen. 
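# Illustrative sketch (not part of the patch): how the nested `evaluator_config` check
# introduced above behaves. A flat config dict such as {"log_model_explainability": False}
# is only accepted as-is when "default" is the sole registered evaluator (it is then
# wrapped as {"default": <config>}); otherwise the config must be a dict mapping each
# evaluator name to that evaluator's own config dict.
def check_nesting_config_dict(evaluator_name_list, evaluator_name_to_conf_map):
    return isinstance(evaluator_name_to_conf_map, dict) and all(
        k in evaluator_name_list and isinstance(v, dict)
        for k, v in evaluator_name_to_conf_map.items()
    )


# Nested form passes the check and is used directly:
assert check_nesting_config_dict(["default"], {"default": {"log_model_explainability": False}})
# Flat form fails the nesting check; it is only special-cased for the default-only registry:
assert not check_nesting_config_dict(["default"], {"log_model_explainability": False})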
- - # mlflow.shap.log_explainer( - # explainer, - # artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) - # ) + try: + mlflow.shap.log_explainer( + explainer, + artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) + ) + except Exception as e: + # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", + # then fallback to shap explainer saver, and shap explainer will call `model.save` + # for sklearn model, there is no `.save` method, so error will happen. + _logger.warning(f'Log explainer failed. Reason: {str(e)}') def plot_beeswarm(): pyplot.subplots_adjust(bottom=0.2, left=0.4) @@ -421,6 +432,15 @@ def evaluate( ): with TempDir() as temp_dir: X, y = dataset._extract_features_and_labels() + infered_model_type = _infer_model_type_by_labels(y) + + if model_type != infered_model_type: + _logger.warning( + f"According to the evaluation dataset label values, the model type looks like " + f"{infered_model_type}, but you specified model type {model_type}. Please " + f"check you set model type or evaluation dataset correctly." + ) + if model_type == "classifier": return self._evaluate_classifier( temp_dir, model, X, y, dataset.name, dataset.feature_names, run_id, evaluator_config diff --git a/mlflow/shap.py b/mlflow/shap.py index 286446bf5d733..00515eaf7c6f8 100644 --- a/mlflow/shap.py +++ b/mlflow/shap.py @@ -449,14 +449,7 @@ def save_explainer( if serialize_model_using_mlflow and serializable_by_mlflow: explainer.save(explainer_output_file_handle, model_saver=False) else: - try: - explainer.save(explainer_output_file_handle) - except Exception: - # Some model may not support explainer model_saver, - # in this case, fallback to save explainer without saving model. - explainer_output_file_handle.seek(0) - explainer_output_file_handle.truncate(0) - explainer.save(explainer_output_file_handle, model_saver=False) + explainer.save(explainer_output_file_handle) pyfunc.add_to_model( mlflow_model, diff --git a/mlflow/utils/string_utils.py b/mlflow/utils/string_utils.py index f455edbd331e8..4cce65499b7cb 100644 --- a/mlflow/utils/string_utils.py +++ b/mlflow/utils/string_utils.py @@ -15,7 +15,7 @@ def is_string_type(item): def truncate_str_from_middle(s, max_length): - assert max_length > 3 + assert max_length > 5 if len(s) <= max_length: return s else: From 3abbe04e3243feb3ab6ef60f91064373f3660d78 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 Dec 2021 16:12:36 +0800 Subject: [PATCH 079/120] refactor Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 340 +++++++++--------- 1 file changed, 169 insertions(+), 171 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 140a8e1324976..b8916384ac007 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -93,100 +93,99 @@ def _infer_model_type_by_labels(labels): return "classifier" +def _extract_raw_model_and_predict_fn(model): + model_loader_module = model.metadata.flavors['python_function']["loader_module"] + predict_fn = model.predict + predict_proba_fn = None + + try: + if model_loader_module == 'mlflow.sklearn': + raw_model = model._model_impl + elif model_loader_module == 'mlflow.lightgbm': + raw_model = model._model_impl.lgb_model + elif model_loader_module == 'mlflow.xgboost': + raw_model = model._model_impl.xgb_model + else: + raw_model = None + except Exception as e: + raw_model = None + _logger.warning(f'Raw model 
resolution fails unexpectedly on PyFuncModel {model!r}, ' + f'error message is {e}') + + if raw_model: + predict_fn = raw_model.predict + predict_proba_fn = getattr(raw_model, 'predict_proba', None) + + try: + import xgboost + if isinstance(raw_model, xgboost.XGBModel): + # Because shap evaluation will pass evaluation data in ndarray format + # (without feature names), if set validate_features=True it will raise error. + predict_fn = partial(predict_fn, validate_features=False) + predict_proba_fn = partial(predict_proba_fn, validate_features=False) + except ImportError: + pass + + return raw_model, predict_fn, predict_proba_fn + + +def _gen_log_key(key, dataset_name): + return f'{key}_on_data_{dataset_name}' + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] - @staticmethod - def _gen_log_key(key, dataset_name): - return f'{key}_on_data_{dataset_name}' - - def _log_metrics(self, run_id, metrics, dataset_name): + def _log_metrics(self): """ Helper method to log metrics into specified run. """ - client = mlflow.tracking.MlflowClient() timestamp = int(time.time() * 1000) - client.log_batch( - run_id, + self.client.log_batch( + self.run_id, metrics=[ - Metric(key=DefaultEvaluator._gen_log_key(key, dataset_name), + Metric(key=_gen_log_key(key, self.dataset_name), value=value, timestamp=timestamp, step=0) - for key, value in metrics.items() + for key, value in self.metrics.items() ], ) def _log_image_artifact( - self, artifacts, temp_dir, do_plot, run_id, artifact_name, dataset_name, + self, do_plot, artifact_name, ): import matplotlib.pyplot as pyplot - client = mlflow.tracking.MlflowClient() - pyplot.clf() - do_plot() - artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name) + '.png' - artifact_file_local_path = temp_dir.path(artifact_file_name) - pyplot.savefig(artifact_file_local_path) - client.log_artifact(run_id, artifact_file_local_path) - artifact = ImageEvaluationArtifact(uri=get_artifact_uri(run_id, artifact_file_name)) + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + '.png' + artifact_file_local_path = self.temp_dir.path(artifact_file_name) + + try: + pyplot.clf() + do_plot() + pyplot.savefig(artifact_file_local_path) + finally: + pyplot.close(pyplot.gcf()) + + mlflow.log_artifact(artifact_file_local_path) + artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) artifact.load(artifact_file_local_path) - artifacts[artifact_name] = artifact - pyplot.close(pyplot.gcf()) + self.artifacts[artifact_name] = artifact def _log_pandas_df_artifact( - self, artifacts, temp_dir, pandas_df, run_id, artifact_name, dataset_name, model + self, pandas_df, artifact_name ): - client = mlflow.tracking.MlflowClient() - artifact_file_name = DefaultEvaluator._gen_log_key(artifact_name, dataset_name) + '.csv' - artifact_file_local_path = temp_dir.path(artifact_file_name) + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + '.csv' + artifact_file_local_path = self.temp_dir.path(artifact_file_name) pandas_df.to_csv(artifact_file_local_path, index=False) - client.log_artifact(run_id, artifact_file_local_path) + mlflow.log_artifact(artifact_file_local_path) artifact = CsvEvaluationArtifact( - uri=get_artifact_uri(run_id, artifact_file_name), + uri=mlflow.get_artifact_uri(artifact_file_name), content=pandas_df, ) artifact.load(artifact_file_local_path) - artifacts[artifact_name] = artifact - - def 
_extract_raw_model_and_predict_fn(self, model): - model_loader_module = model.metadata.flavors['python_function']["loader_module"] - predict_fn = model.predict - predict_proba_fn = None + self.artifacts[artifact_name] = artifact - try: - if model_loader_module == 'mlflow.sklearn': - raw_model = model._model_impl - elif model_loader_module == 'mlflow.lightgbm': - raw_model = model._model_impl.lgb_model - elif model_loader_module == 'mlflow.xgboost': - raw_model = model._model_impl.xgb_model - else: - raw_model = None - except Exception as e: - raw_model = None - _logger.warning(f'Raw model resolution fails unexpectedly on PyFuncModel {model!r}, ' - f'error message is {e}') - - if raw_model: - predict_fn = raw_model.predict - predict_proba_fn = getattr(raw_model, 'predict_proba', None) - - try: - import xgboost - if isinstance(raw_model, xgboost.XGBModel): - # Because shap evaluation will pass evaluation data in ndarray format - # (without feature names), if set validate_features=True it will raise error. - predict_fn = partial(predict_fn, validate_features=False) - predict_proba_fn = partial(predict_proba_fn, validate_features=False) - except ImportError: - pass - - return raw_model, predict_fn, predict_proba_fn - - def _log_model_explainability( - self, artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, - is_multinomial_classifier, - ): - if not evaluator_config.get('log_model_explainability', True): + def _log_model_explainability(self): + if not self.evaluator_config.get('log_model_explainability', True): return try: @@ -203,42 +202,49 @@ def _log_model_explainability( import matplotlib.pyplot as pyplot - sample_rows = evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) - algorithm = evaluator_config.get('explainability_algorithm', None) + is_multinomial_classifier = self.model_type == 'classifier' and self.num_classes > 2 + + sample_rows = self.evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) + algorithm = self.evaluator_config.get('explainability_algorithm', None) - truncated_feature_names = [truncate_str_from_middle(f, 20) for f in feature_names] + truncated_feature_names = [truncate_str_from_middle(f, 20) for f in self.feature_names] for i, truncated_name in enumerate(truncated_feature_names): - if truncated_name != feature_names[i]: + if truncated_name != self.feature_names[i]: # For truncated name, attach "(f_{feature_index})" at the end truncated_feature_names[i] = f'{truncated_name}(f_{i})' - truncated_feature_name_map = {f: f2 for f, f2 in zip(feature_names, truncated_feature_names)} - if isinstance(X, pd.DataFrame): - X = X.rename(columns=truncated_feature_name_map, copy=False) + truncated_feature_name_map = {f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names)} - sampled_X = shap.sample(X, sample_rows) + if isinstance(self.X, pd.DataFrame): + # For some shap explainer, the plot will use the DataFrame column names instead of + # using feature_names argument value. So rename the dataframe column names. 
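# Illustrative sketch (not part of the patch): how long feature names are shortened for
# SHAP plots. `truncate_str_from_middle` below is a simplified stand-in for
# mlflow.utils.string_utils.truncate_str_from_middle (assumed to elide the middle of the
# string with "..."); any name that was truncated gets a "(f_{index})" suffix so display
# names stay unique, and the resulting map is used to rename DataFrame columns.
def truncate_str_from_middle(s, max_length):
    assert max_length > 5
    if len(s) <= max_length:
        return s
    head = (max_length - 3) // 2
    tail = max_length - 3 - head
    return s[:head] + "..." + s[-tail:]


feature_names = ["age", "a_very_long_feature_name_that_needs_truncation"]
truncated = [truncate_str_from_middle(f, 20) for f in feature_names]
for i, name in enumerate(truncated):
    if name != feature_names[i]:
        # For truncated names, attach "(f_{feature_index})" at the end.
        truncated[i] = f"{name}(f_{i})"

# Map original -> display name, applied to the DataFrame columns before plotting.
truncated_feature_name_map = dict(zip(feature_names, truncated))
print(truncated_feature_name_map)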
+ renamed_X = self.X.rename(columns=truncated_feature_name_map, copy=False) + else: + renamed_X = self.X + + sampled_X = shap.sample(renamed_X, sample_rows) if algorithm: if algorithm == 'sampling': explainer = shap.explainers.Sampling( - predict_fn, X, feature_names=truncated_feature_names + self.predict_fn, renamed_X, feature_names=truncated_feature_names ) - shap_values = explainer(X, sample_rows) + shap_values = explainer(renamed_X, sample_rows) else: explainer = shap.Explainer( - predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm + self.predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm ) shap_values = explainer(sampled_X) else: - if raw_model and not is_multinomial_classifier: + if self.raw_model and not is_multinomial_classifier: # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for # raw model, this case shap plot doesn't support it well, so exclude the # multinomial_classifier case here. - explainer = shap.Explainer(raw_model, sampled_X, feature_names=truncated_feature_names) + explainer = shap.Explainer(self.raw_model, sampled_X, feature_names=truncated_feature_names) shap_values = explainer(sampled_X) else: # fallback to default explainer explainer = shap.Explainer( - predict_fn, sampled_X, feature_names=truncated_feature_names + self.predict_fn, sampled_X, feature_names=truncated_feature_names ) shap_values = explainer(sampled_X) @@ -247,7 +253,7 @@ def _log_model_explainability( try: mlflow.shap.log_explainer( explainer, - artifact_path=DefaultEvaluator._gen_log_key('explainer', dataset_name) + artifact_path=_gen_log_key('explainer', self.dataset_name) ) except Exception as e: # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", @@ -260,7 +266,7 @@ def plot_beeswarm(): shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_beeswarm, run_id, "shap_beeswarm", dataset_name, + plot_beeswarm, "shap_beeswarm", ) def plot_summary(): @@ -268,7 +274,7 @@ def plot_summary(): shap.summary_plot(shap_values, show=False) self._log_image_artifact( - artifacts, temp_dir, plot_summary, run_id, "shap_summary", dataset_name, + plot_summary, "shap_summary", ) def plot_feature_importance(): @@ -276,68 +282,60 @@ def plot_feature_importance(): shap.plots.bar(shap_values, show=False) self._log_image_artifact( - artifacts, - temp_dir, plot_feature_importance, - run_id, "shap_feature_importance", - dataset_name, ) - def _evaluate_classifier(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): + def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve + raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(self.model) + self.raw_model = raw_model + self.predict_fn = predict_fn + self.predict_proba_fn = predict_proba_fn # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 - label_list = sorted(list(set(y))) + label_list = sorted(list(set(self.y))) assert label_list[0] == 0, "Label values must being at '0'." 
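# Illustrative sketch (not part of the patch): the classifier path asserts that labels are
# consecutive integers starting at 0. If the raw evaluation labels are strings or arbitrary
# integers, they can be mapped before building the EvaluationDataset -- shown here with
# sklearn's LabelEncoder as an assumed preprocessing step on the caller's side.
import numpy as np
from sklearn.preprocessing import LabelEncoder

raw_labels = np.array(["cat", "dog", "cat", "bird"])
encoded = LabelEncoder().fit_transform(raw_labels)  # values in 0..num_classes-1

label_list = sorted(set(encoded))
assert label_list[0] == 0 and label_list == list(range(len(label_list)))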
- num_classes = len(label_list) - - y_pred = model.predict(X) + self.num_classes = len(label_list) - is_binomial = num_classes <= 2 + self.y_pred = predict_fn(self.X) + self.is_binomial = self.num_classes <= 2 - metrics = EvaluationMetrics() - artifacts = {} - metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) - metrics["example_count"] = len(X) + self.metrics["accuracy"] = sk_metrics.accuracy_score(self.y, self.y_pred) + self.metrics["example_count"] = len(self.X) - raw_model, predict_fn, predict_proba_fn = self._extract_raw_model_and_predict_fn( - model - ) - - if is_binomial: + if self.is_binomial: if predict_proba_fn is not None: - # TODO: for xgb disable feature names check - y_probs = model.predict_proba(X) - y_prob = y_probs[:, 1] + self.y_probs = predict_proba_fn(self.X) + self.y_prob = self.y_probs[:, 1] else: - y_probs = None - y_prob = None + self.y_probs = None + self.y_prob = None - confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) tn, fp, fn, tp = confusion_matrix.ravel() - metrics["true_negatives"] = tn - metrics["false_positives"] = fp - metrics["false_negatives"] = fn - metrics["true_positives"] = tp - metrics["recall"] = sk_metrics.recall_score(y, y_pred) - metrics["precision"] = sk_metrics.precision_score(y, y_pred) - metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + self.metrics["true_negatives"] = tn + self.metrics["false_positives"] = fp + self.metrics["false_negatives"] = fn + self.metrics["true_positives"] = tp + self.metrics["recall"] = sk_metrics.recall_score(self.y, self.y_pred) + self.metrics["precision"] = sk_metrics.precision_score(self.y, self.y_pred) + self.metrics["f1_score"] = sk_metrics.f1_score(self.y, self.y_pred) # TODO: # compute hinge loss, this requires calling decision_function of the model # e.g., see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function - if y_probs is not None: - fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_prob) + if self.y_probs is not None: + fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) roc_curve_pandas_df = pd.DataFrame( {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} ) self._log_pandas_df_artifact( - artifacts, temp_dir, roc_curve_pandas_df, run_id, "roc_curve", dataset_name, model, + roc_curve_pandas_df, "roc_curve", ) roc_auc = sk_metrics.auc(fpr, tpr) - metrics["roc_auc"] = roc_auc + self.metrics["roc_auc"] = roc_auc def plot_roc_curve(): sk_metrics.RocCurveDisplay( @@ -346,21 +344,21 @@ def plot_roc_curve(): if hasattr(sk_metrics, 'RocCurveDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_roc_curve, run_id, "roc_curve", dataset_name, + plot_roc_curve, "roc_curve", ) - precision, recall, thresholds = sk_metrics.precision_recall_curve(y, y_prob) + precision, recall, thresholds = \ + sk_metrics.precision_recall_curve(self.y, self.y_prob) thresholds = np.append(thresholds, [1.0], axis=0) pr_curve_pandas_df = pd.DataFrame( {"precision": precision, "recall": recall, "thresholds": thresholds} ) self._log_pandas_df_artifact( - artifacts, temp_dir, pr_curve_pandas_df, run_id, "precision_recall_curve", - dataset_name, model, + pr_curve_pandas_df, "precision_recall_curve", ) pr_auc = sk_metrics.auc(recall, precision) - metrics["precision_recall_auc"] = pr_auc + self.metrics["precision_recall_auc"] = pr_auc def plot_pr_curve(): sk_metrics.PrecisionRecallDisplay( @@ -369,13 +367,11 @@ def plot_pr_curve(): if hasattr(sk_metrics, 
'PrecisionRecallDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_pr_curve, run_id, "precision_recall_curve", dataset_name, + plot_pr_curve, "precision_recall_curve", ) self._log_image_artifact( - artifacts, temp_dir, - lambda: plot_lift_curve(y, y_probs), - run_id, "lift_curve", dataset_name, + lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve", ) def plot_confusion_matrix(): @@ -383,43 +379,33 @@ def plot_confusion_matrix(): if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): self._log_image_artifact( - artifacts, temp_dir, plot_confusion_matrix, run_id, "confusion_matrix", dataset_name, + plot_confusion_matrix, "confusion_matrix", ) - self._log_metrics(run_id, metrics, dataset_name) - - self._log_model_explainability( - artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, - is_multinomial_classifier=(num_classes > 2) - ) - - return EvaluationResult(metrics, artifacts) - - def _evaluate_regressor(self, temp_dir, model, X, y, dataset_name, feature_names, run_id, evaluator_config): - metrics = EvaluationMetrics() - artifacts = {} - y_pred = model.predict(X) - metrics["example_count"] = len(X) - metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(y, y_pred) - metrics["mean_squared_error"] = sk_metrics.mean_squared_error(y, y_pred) - metrics["root_mean_squared_error"] = math.sqrt(metrics["mean_squared_error"]) - metrics['sum_on_label'] = sum(y) - metrics['mean_on_label'] = metrics['sum_on_label'] / metrics["example_count"] - metrics['r2_score'] = sk_metrics.r2_score(y, y_pred) - metrics['max_error'] = sk_metrics.max_error(y, y_pred) - metrics['mean_absolute_percentage_error'] = \ - sk_metrics.mean_absolute_percentage_error(y, y_pred) - - raw_model, predict_fn, _ = self._extract_raw_model_and_predict_fn( - model - ) - - self._log_model_explainability( - artifacts, temp_dir, predict_fn, raw_model, X, dataset_name, feature_names, run_id, evaluator_config, - is_multinomial_classifier=False, - ) - self._log_metrics(run_id, metrics, dataset_name) - return EvaluationResult(metrics, artifacts) + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) + + def _evaluate_regressor(self): + self.y_pred = self.model.predict(self.X) + self.metrics["example_count"] = len(self.X) + self.metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(self.y, self.y_pred) + self.metrics["mean_squared_error"] = sk_metrics.mean_squared_error(self.y, self.y_pred) + self.metrics["root_mean_squared_error"] = math.sqrt(self.metrics["mean_squared_error"]) + self.metrics['sum_on_label'] = sum(self.y) + self.metrics['mean_on_label'] = self.metrics['sum_on_label'] / self.metrics["example_count"] + self.metrics['r2_score'] = sk_metrics.r2_score(self.y, self.y_pred) + self.metrics['max_error'] = sk_metrics.max_error(self.y, self.y_pred) + self.metrics['mean_absolute_percentage_error'] = \ + sk_metrics.mean_absolute_percentage_error(self.y, self.y_pred) + + raw_model, predict_fn, _ = _extract_raw_model_and_predict_fn(self.model) + self.raw_model = raw_model + self.predict_fn = predict_fn + + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) def evaluate( self, @@ -431,7 +417,23 @@ def evaluate( **kwargs, ): with TempDir() as temp_dir: + self.client = mlflow.tracking.MlflowClient() + + self.temp_dir = temp_dir + self.model = model + self.model_type = model_type + self.dataset = dataset + self.run_id = run_id + 
self.evaluator_config = evaluator_config + self.dataset_name = dataset.name + self.feature_names = dataset.feature_names + X, y = dataset._extract_features_and_labels() + self.X = X + self.y = y + self.metrics = EvaluationMetrics() + self.artifacts = {} + infered_model_type = _infer_model_type_by_labels(y) if model_type != infered_model_type: @@ -442,12 +444,8 @@ def evaluate( ) if model_type == "classifier": - return self._evaluate_classifier( - temp_dir, model, X, y, dataset.name, dataset.feature_names, run_id, evaluator_config - ) + return self._evaluate_classifier() elif model_type == "regressor": - return self._evaluate_regressor( - temp_dir, model, X, y, dataset.name, dataset.feature_names, run_id, evaluator_config - ) + return self._evaluate_regressor() else: raise ValueError(f"Unsupported model type {model_type}") From 74775e10f9b51bd5da29f5620f9e7a23e29df5ac Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 Dec 2021 21:41:26 +0800 Subject: [PATCH 080/120] add multi-class metrics artifacts Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 163 ++++++++++-------- 1 file changed, 95 insertions(+), 68 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index b8916384ac007..36b56e1be9de0 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -14,6 +14,7 @@ import math import pandas as pd import numpy as np +import json import time from functools import partial import logging @@ -286,6 +287,70 @@ def plot_feature_importance(): "shap_feature_importance", ) + def _evaluate_per_class(self, positive_class, y, y_pred, y_proba): + """ + if positive_class is an interger, generate metrics and artifacts on this class vs. 
rest, + and the y/y_pred/y_proba must be sum up to a binary "is class" and "is not class" + if positive_class is None, generate metrics and artifacts on binary y/y_pred/y_proba + """ + + def _gen_metric_name(name): + if positive_class is not None: + return f"class_{positive_class}_{name}" + else: + return name + + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + self.metrics[_gen_metric_name("true_negatives")] = tn + self.metrics[_gen_metric_name("false_positives")] = fp + self.metrics[_gen_metric_name("false_negatives")] = fn + self.metrics[_gen_metric_name("true_positives")] = tp + self.metrics[_gen_metric_name("recall")] = sk_metrics.recall_score(y, y_pred) + self.metrics[_gen_metric_name("precision")] = sk_metrics.precision_score(y, y_pred) + self.metrics[_gen_metric_name("f1_score")] = sk_metrics.f1_score(y, y_pred) + + if y_proba is not None: + fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_proba) + roc_curve_pandas_df = pd.DataFrame( + {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} + ) + self._log_pandas_df_artifact( + roc_curve_pandas_df, _gen_metric_name("roc_curve"), + ) + + roc_auc = sk_metrics.auc(fpr, tpr) + self.metrics[_gen_metric_name("roc_auc")] = roc_auc + + def plot_roc_curve(): + sk_metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot() + + if hasattr(sk_metrics, 'RocCurveDisplay'): + self._log_image_artifact(plot_roc_curve, _gen_metric_name("roc_curve")) + + precision, recall, thresholds = \ + sk_metrics.precision_recall_curve(y, y_proba) + thresholds = np.append(thresholds, [1.0], axis=0) + pr_curve_pandas_df = pd.DataFrame( + {"precision": precision, "recall": recall, "thresholds": thresholds} + ) + self._log_pandas_df_artifact( + pr_curve_pandas_df, + _gen_metric_name("precision_recall_curve"), + ) + + pr_auc = sk_metrics.auc(recall, precision) + self.metrics[_gen_metric_name("precision_recall_auc")] = pr_auc + + def plot_pr_curve(): + sk_metrics.PrecisionRecallDisplay(precision, recall).plot() + + if hasattr(sk_metrics, 'PrecisionRecallDisplay'): + self._log_image_artifact( + plot_pr_curve, + _gen_metric_name("precision_recall_curve"), + ) + def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(self.model) @@ -304,83 +369,45 @@ def _evaluate_classifier(self): self.metrics["accuracy"] = sk_metrics.accuracy_score(self.y, self.y_pred) self.metrics["example_count"] = len(self.X) - if self.is_binomial: - if predict_proba_fn is not None: - self.y_probs = predict_proba_fn(self.X) + if predict_proba_fn is not None: + self.y_probs = predict_proba_fn(self.X) + if self.is_binomial: self.y_prob = self.y_probs[:, 1] else: - self.y_probs = None self.y_prob = None + else: + self.y_probs = None + self.y_prob = None - confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) - tn, fp, fn, tp = confusion_matrix.ravel() - self.metrics["true_negatives"] = tn - self.metrics["false_positives"] = fp - self.metrics["false_negatives"] = fn - self.metrics["true_positives"] = tp - self.metrics["recall"] = sk_metrics.recall_score(self.y, self.y_pred) - self.metrics["precision"] = sk_metrics.precision_score(self.y, self.y_pred) - self.metrics["f1_score"] = sk_metrics.f1_score(self.y, self.y_pred) - - # TODO: - # compute hinge loss, this requires calling decision_function of the model - # e.g., see 
https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC.decision_function - if self.y_probs is not None: - fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) - roc_curve_pandas_df = pd.DataFrame( - {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} - ) - self._log_pandas_df_artifact( - roc_curve_pandas_df, "roc_curve", - ) - - roc_auc = sk_metrics.auc(fpr, tpr) - self.metrics["roc_auc"] = roc_auc - - def plot_roc_curve(): - sk_metrics.RocCurveDisplay( - fpr=fpr, tpr=tpr, roc_auc=roc_auc, - ).plot() - - if hasattr(sk_metrics, 'RocCurveDisplay'): - self._log_image_artifact( - plot_roc_curve, "roc_curve", - ) - - precision, recall, thresholds = \ - sk_metrics.precision_recall_curve(self.y, self.y_prob) - thresholds = np.append(thresholds, [1.0], axis=0) - pr_curve_pandas_df = pd.DataFrame( - {"precision": precision, "recall": recall, "thresholds": thresholds} - ) - self._log_pandas_df_artifact( - pr_curve_pandas_df, "precision_recall_curve", - ) - - pr_auc = sk_metrics.auc(recall, precision) - self.metrics["precision_recall_auc"] = pr_auc - - def plot_pr_curve(): - sk_metrics.PrecisionRecallDisplay( - precision, recall, - ).plot() - - if hasattr(sk_metrics, 'PrecisionRecallDisplay'): - self._log_image_artifact( - plot_pr_curve, "precision_recall_curve", - ) - + if predict_proba_fn is not None: + if self.is_binomial: + self._evaluate_per_class(None, self.y, self.y_pred, self.y_prob) self._log_image_artifact( lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve", ) + else: + self.metrics['f1_score_micro'] = \ + sk_metrics.f1_score(self.y, self.y_pred, average='micro') + self.metrics['f1_score_macro'] = \ + sk_metrics.f1_score(self.y, self.y_pred, average='macro') + for postive_class in range(self.num_classes): + y_per_class = np.where(self.y == postive_class, 1, 0) + y_pred_per_class = np.where(self.y_pred == postive_class, 1, 0) + pos_class_prob = self.y_probs[:, postive_class] + self._evaluate_per_class( + postive_class, y_per_class, y_pred_per_class, pos_class_prob + ) - def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix).plot() + # TODO: Shall we also log confusion_matrix data as a json artifact ? 
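# Illustrative sketch (not part of the patch): the one-vs-rest reduction used for the
# per-class metrics above. Each class k is scored against "rest" with binarized labels and
# the k-th column of predict_proba, mirroring what `_evaluate_per_class` computes.
import numpy as np
from sklearn import metrics as sk_metrics

y = np.array([0, 1, 2, 1, 0, 2])
y_pred = np.array([0, 2, 2, 1, 0, 1])
y_probs = np.array([
    [0.8, 0.1, 0.1],
    [0.2, 0.3, 0.5],
    [0.1, 0.2, 0.7],
    [0.2, 0.6, 0.2],
    [0.7, 0.2, 0.1],
    [0.1, 0.5, 0.4],
])

for positive_class in range(3):
    y_bin = np.where(y == positive_class, 1, 0)
    y_pred_bin = np.where(y_pred == positive_class, 1, 0)
    pos_class_prob = y_probs[:, positive_class]
    f1 = sk_metrics.f1_score(y_bin, y_pred_bin)
    roc_auc = sk_metrics.roc_auc_score(y_bin, pos_class_prob)
    print(f"class_{positive_class}_f1_score={f1:.3f}, class_{positive_class}_roc_auc={roc_auc:.3f}")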
+ confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) - if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): - self._log_image_artifact( - plot_confusion_matrix, "confusion_matrix", - ) + def plot_confusion_matrix(): + sk_metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix).plot() + + if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): + self._log_image_artifact( + plot_confusion_matrix, "confusion_matrix", + ) self._log_metrics() self._log_model_explainability() From db63d7929d8269ebbdd8c0495f71de2585fc3fe3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 Dec 2021 21:51:52 +0800 Subject: [PATCH 081/120] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index a368aa94528bf..624ab8dfd5c45 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -504,17 +504,17 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) if len(evaluator_name_list) > 1: - print(f'Hint: Multiple registered evaluators are found {evaluator_name_list} and ' - 'they will all be used in evaluation. If you want to evaluate with one ' - 'evaluator, specify the `evaluator` argument and (optional) specify the ' - '`evaluator_config` argument.') + print(f'Multiple registered evaluators are found {evaluator_name_list} and ' + 'they will all be used in evaluation if they support the specified model type. ' + 'If you want to evaluate with one evaluator, specify the `evaluator` argument ' + 'and (optional) specify the `evaluator_config` argument.') if evaluator_config is not None: conf_dict_value_error = ValueError( - "If `evaluators` argument is None, all registered evaluators will be used, " - "if only default evaluator available, the `evaluator_config` argument can be " - "config dict for default evaluator, otherwise the `evaluator_config` argument " - "must be a dict contains mapping from evaluator name to individual " - "evaluator config dict." + "If `evaluators` argument is None, all available evaluators will be used. " + "If only the default evaluator is available, the `evaluator_config` argument is " + "interpreted as the config dictionary for the default evaluator. Otherwise, the " + "`evaluator_config` argument must be a dictionary mapping each evaluator's name " + "to its own evaluator config dictionary." ) if evaluator_name_list == ['default']: if not isinstance(evaluator_config, dict): @@ -550,7 +550,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) else: raise ValueError( '`evaluators` argument must be None, a evaluator name string, or a list of ' - 'evalautor names.' + 'evaluator names.' ) return evaluator_name_list, evaluator_name_to_conf_map @@ -560,7 +560,6 @@ def _evaluate( model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map ): """ - This method is the patch point for databricks instrumentation. The public API "evaluate" will verify argument first, and then pass normalized arguments to the _evaluate method. """ @@ -641,8 +640,8 @@ def evaluate( - log_model_explainability: The number of rows to use for calculating model explainability. - explainality_algorithm: A string to specify the shap explainer algorithm. 
If not set, it will choose the best fit explainer according to the model. - - explainability_nsamples: The sample rows for calculating model explainability. - Default value is 2000. + - explainability_nsamples: The number of sample rows to use for calculating model + explainability. Default value is 2000. """ from mlflow.pyfunc import PyFuncModel From 1618b8ae53560ec723d40a832d91da77aca9c831 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 Dec 2021 23:10:31 +0800 Subject: [PATCH 082/120] add log_loss metric Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 36b56e1be9de0..d787238551e49 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -380,6 +380,8 @@ def _evaluate_classifier(self): self.y_prob = None if predict_proba_fn is not None: + self.metrics['log_loss'] = sk_metrics.log_loss(self.y, self.y_probs) + if self.is_binomial: self._evaluate_per_class(None, self.y, self.y_pred, self.y_prob) self._log_image_artifact( From 1b49dc900d11bb1143385af6374050245bc37045 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 20 Dec 2021 23:37:55 +0800 Subject: [PATCH 083/120] lazy load pyspark Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 624ab8dfd5c45..37fffad808752 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -13,6 +13,7 @@ from mlflow.utils.annotations import experimental import logging import struct +import sys _logger = logging.getLogger(__name__) @@ -186,9 +187,15 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): import pandas as pd try: - from pyspark.sql import DataFrame as SparkDataFrame - - supported_dataframe_types = (pd.DataFrame, SparkDataFrame) + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. 
+ if 'pyspark' in sys.modules: + from pyspark.sql import DataFrame as SparkDataFrame + supported_dataframe_types = (pd.DataFrame, SparkDataFrame) + self._spark_df_type = SparkDataFrame + else: + supported_dataframe_types = (pd.DataFrame,) + self._spark_df_type = None except ImportError: supported_dataframe_types = (pd.DataFrame,) @@ -250,14 +257,7 @@ def data(self): if self._data is not None: return self._data - try: - from pyspark.sql import DataFrame as SparkDataFrame - - spark_df_type = SparkDataFrame - except ImportError: - spark_df_type = None - - if spark_df_type and isinstance(self._original_data, spark_df_type): + if self._spark_df_type and isinstance(self._original_data, self._spark_df_type): self._data = self._original_data.limit( EvaluationDataset.SPARK_DATAFRAME_LIMIT ).toPandas() From df7150551bf115c5a55acc4f8587c1873a052b05 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 21 Dec 2021 17:06:38 +0800 Subject: [PATCH 084/120] address ben comments Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 37fffad808752..060f8013e17bc 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -186,6 +186,10 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): import numpy as np import pandas as pd + # TODO: + # for pandas.DataFrame input, check column type and raise error if unsupported column + # found + # For spark DataFrame input, support feature column with `pyspark.ml.Vector` column type. try: # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. @@ -504,10 +508,11 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) if len(evaluator_name_list) > 1: - print(f'Multiple registered evaluators are found {evaluator_name_list} and ' - 'they will all be used in evaluation if they support the specified model type. ' - 'If you want to evaluate with one evaluator, specify the `evaluator` argument ' - 'and (optional) specify the `evaluator_config` argument.') + _logger.warning( + f'Multiple registered evaluators are found {evaluator_name_list} and ' + 'they will all be used in evaluation if they support the specified model type. ' + 'If you want to evaluate with one evaluator, specify the `evaluator` argument ' + 'and (optional) specify the `evaluator_config` argument.') if evaluator_config is not None: conf_dict_value_error = ValueError( "If `evaluators` argument is None, all available evaluators will be used. " @@ -637,9 +642,11 @@ def evaluate( The default evaluator supports the 'regressor' and 'classifer' `model_type`s. The available `evaluator_config` options for the default evaluator include: - - log_model_explainability: The number of rows to use for calculating model explainability. - - explainality_algorithm: A string to specify the shap explainer algorithm. If not set, it will - choose the best fit explainer according to the model. + - log_model_explainability: A boolean value to specify whether log model explainability, + default value is True. + - explainality_algorithm: A string to specify the shap explainer algorithm. If not set, + it will run `shap.Explainer` with "auto" algorithm, which choose the best fit explainer + according to the model. 
- explainability_nsamples: The number of sample rows to use for calculating model explainability. Default value is 2000. """ From d3862332fef310fea15a1355247d00ff48bcda9e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 21 Dec 2021 20:44:40 +0800 Subject: [PATCH 085/120] fix Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +- tests/models/test_evaluation.py | 87 ++++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 23 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 060f8013e17bc..1887758a1a929 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -408,9 +408,7 @@ def _log_dataset_tag(self, client, run_id, model_uuid): metadata["model"] == model_uuid: break else: - new_metadata = self._metadata - new_metadata["model"] = model_uuid - dataset_metadata_list.append(new_metadata) + dataset_metadata_list.append({**self._metadata, "model": model_uuid}) dataset_metadata_str = json.dumps(dataset_metadata_list, separators=(",", ":")) client.log_batch( diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 6892e6d8227b7..a7f47af2c41d5 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -33,6 +33,7 @@ from mlflow.tracking.artifact_utils import get_artifact_uri import json +import uuid def get_iris(): @@ -45,6 +46,11 @@ def get_diabetes_dataset(): return data.data[:, :2], data.target +def get_breast_cancer_dataset(): + data = sklearn.datasets.load_breast_cancer() + return data.data[:, :2], data.target + + def get_run_data(run_id): client = mlflow.tracking.MlflowClient() data = client.get_run(run_id).data @@ -70,6 +76,27 @@ def spark_session(): session.stop() +@pytest.fixture(scope="module") +def iris_dataset(): + X, y = get_iris() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") + + +@pytest.fixture(scope="module") +def diabetes_dataset(): + X, y = get_diabetes_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="diabetes_dataset") + + +@pytest.fixture(scope="module") +def breast_cancer_dataset(): + X, y = get_breast_cancer_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="breast_cancer_dataset") + + @pytest.fixture(scope="module") def regressor_model_uri(): X, y = get_diabetes_dataset() @@ -97,10 +124,16 @@ def classifier_model_uri(): @pytest.fixture(scope="module") -def iris_dataset(): - X, y = get_iris() - eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") +def binary_classifier_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.linear_model.LogisticRegression() + clf.fit(X, y) + + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "bin_clf_model") + binary_classifier_model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model") + + return binary_classifier_model_uri @pytest.fixture(scope="module") @@ -264,29 +297,35 @@ def test_spark_df_dataset(spark_session): def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): + model_uuid = uuid.uuid4().hex with mlflow.start_run() as run: client = mlflow.tracking.MlflowClient() - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags["mlflow.datasets"]) == 
[iris_dataset._metadata] + + logged_meta1 = {**iris_dataset._metadata, 'model': model_uuid} + logged_meta2 = {**iris_pandas_df_dataset._metadata, 'model': model_uuid} + + assert json.loads(tags["mlflow.datasets"]) == \ + [logged_meta1] raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets") assert " " not in raw_tag # assert the tag string remove all whitespace chars. # Test appending dataset tag - iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id) + iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] # Test log repetitive dataset - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] @@ -337,14 +376,18 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: with mlflow.start_run(): - evaluate( - classifier_model_uri, - "classifier", - iris_dataset, - run_id=None, - evaluators="test_evaluator1", - evaluator_config=evaluator1_config, - ) + with pytest.raises( + ValueError, + match='The model could not be evaluated by any of the registered evaluators', + ): + evaluate( + classifier_model_uri, + "classifier", + iris_dataset, + run_id=None, + evaluators="test_evaluator1", + evaluator_config=evaluator1_config, + ) mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) mock_evaluate.assert_not_called() with mock.patch.object( @@ -462,3 +505,7 @@ def test_start_run_or_reuse_active_run(): with pytest.raises(ValueError, match="An active run exists"): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass + + + + From 17e6b0c0a50124e9cf9654da052e0b9cc33ddac5 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 22 Dec 2021 17:43:25 +0800 Subject: [PATCH 086/120] prevent show shap logo, add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 5 +- tests/models/test_default_evaluator.py | 72 +++++++++++++++++++ tests/models/test_evaluation.py | 15 ++-- 3 files changed, 84 insertions(+), 8 deletions(-) create mode 100644 tests/models/test_default_evaluator.py diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d787238551e49..7a7890795179b 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -193,9 +193,12 @@ def _log_model_explainability(self): global _shap_initialized import shap import shap.maskers + from IPython.core.display import display, HTML if not _shap_initialized: - shap.initjs() + # Call `shap.getjs` instead of call `shap.initjs` to prevent + # display a logo picture in IPython notebook. + display(HTML(shap.getjs())) _shap_initialized = True except ImportError: _logger.warning('Shap package is not installed. 
Skip log model explainability.') diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py new file mode 100644 index 0000000000000..ccb26b79ebe31 --- /dev/null +++ b/tests/models/test_default_evaluator.py @@ -0,0 +1,72 @@ +import numpy as np +import json + +from mlflow.models.evaluation import evaluate, EvaluationDataset +import mlflow +from sklearn.datasets import load_boston +from sklearn.linear_model import LinearRegression + +from tests.models.test_evaluation import get_run_data, \ + regressor_model_uri, diabetes_dataset, \ + classifier_model_uri, iris_dataset, \ + binary_classifier_model_uri, breast_cancer_dataset + + +def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): + with mlflow.start_run() as run: + result = evaluate( + regressor_model_uri, + model_type='regressor', + dataset=diabetes_dataset, + evaluators='default', + ) + print(f'regressor evaluation run: {run.info.run_id}') + + params, metrics, tags, artifacts = \ + get_run_data(run.info.run_id) + + expected_metrics = { + 'example_count_on_data_diabetes_dataset': 148.0, + 'mean_absolute_error_on_data_diabetes_dataset': 42.927, + 'mean_squared_error_on_data_diabetes_dataset': 2747.513, + 'root_mean_squared_error_on_data_diabetes_dataset': 52.416, + 'sum_on_label_on_data_diabetes_dataset': 23099.0, + 'mean_on_label_on_data_diabetes_dataset': 156.074, + 'r2_score_on_data_diabetes_dataset': 0.565, + 'max_error_on_data_diabetes_dataset': 151.354, + 'mean_absolute_percentage_error_on_data_diabetes_dataset': 0.413 + } + for metric_key in metrics: + assert np.isclose(metrics[metric_key], expected_metrics[metric_key], rtol=1e-3) + + model = mlflow.pyfunc.load_model(regressor_model_uri) + + assert json.loads(tags['mlflow.datasets']) == \ + [{**diabetes_dataset._metadata, 'model': model.metadata.model_uuid}] + + assert set(artifacts) == { + 'shap_beeswarm_on_data_diabetes_dataset.png', + 'shap_feature_importance_on_data_diabetes_dataset.png', + 'shap_summary_on_data_diabetes_dataset.png', + } + + +def test_multi_classifier_evaluation(classifier_model_uri, iris_dataset): + with mlflow.start_run() as run: + result = evaluate( + classifier_model_uri, + model_type='classifier', + dataset=iris_dataset, + evaluators='default', + ) + print(f'multi-classifier evaluation run: {run.info.run_id}') + + +def test_bin_classifier_evaluation(binary_classifier_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + binary_classifier_model_uri, + model_type='classifier', + dataset=breast_cancer_dataset, + evaluators='default', + ) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index a7f47af2c41d5..260dc07f4275e 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,4 +1,5 @@ import mlflow +from collections import namedtuple from mlflow.models.evaluation import ( evaluate, @@ -38,17 +39,20 @@ def get_iris(): iris = sklearn.datasets.load_iris() - return iris.data[:, :2], iris.target + return iris.data, iris.target def get_diabetes_dataset(): data = sklearn.datasets.load_diabetes() - return data.data[:, :2], data.target + return data.data, data.target def get_breast_cancer_dataset(): data = sklearn.datasets.load_breast_cancer() - return data.data[:, :2], data.target + return data.data, data.target + + +RunData = namedtuple('RunData', ['params', 'metrics', 'tags', 'artifacts']) def get_run_data(run_id): @@ -56,7 +60,7 @@ def get_run_data(run_id): data = client.get_run(run_id).data tags = {k: v for k, v in 
data.tags.items()} artifacts = [f.path for f in client.list_artifacts(run_id)] - return data.params, data.metrics, tags, artifacts + return RunData(params=data.params, metrics=data.metrics, tags=tags, artifacts=artifacts) def get_raw_tag(run_id, tag_name): @@ -506,6 +510,3 @@ def test_start_run_or_reuse_active_run(): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass - - - From 836f7487ddc20b3423a364aac34c350cfecff11e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 22 Dec 2021 22:55:07 +0800 Subject: [PATCH 087/120] support spark model Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 23 +++++++- mlflow/models/evaluation/default_evaluator.py | 1 - tests/models/test_default_evaluator.py | 58 ++++++++++++++----- tests/models/test_evaluation.py | 30 ++++++++++ 4 files changed, 97 insertions(+), 15 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1887758a1a929..37a7116a7840e 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -195,8 +195,10 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): # run code not related to pyspark. if 'pyspark' in sys.modules: from pyspark.sql import DataFrame as SparkDataFrame + from pyspark.ml.linalg import VectorUDT as SparkVectorUDT supported_dataframe_types = (pd.DataFrame, SparkDataFrame) self._spark_df_type = SparkDataFrame + self._spark_vector_type = SparkVectorUDT else: supported_dataframe_types = (pd.DataFrame,) self._spark_df_type = None @@ -375,7 +377,26 @@ def hash(self): column_names = ",".join(self.data.columns) meta_str = f"columns={column_names}\nlabels={self.labels}" md5_gen.update(meta_str.encode("UTF-8")) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) + + data_for_hash = self.data + + if self._spark_df_type and isinstance(self._original_data, self._spark_df_type): + # For spark dataframe, the Vector type column we need expand it + # into multiple columns, otherwise pandas hash function cannot compute hash + # over it. + transform_func = {} + for field in self._original_data.schema: + if isinstance(field.dataType, self._spark_vector_type): + transform_func[field.name] = lambda x: pd.Series(x.toArray()) + else: + transform_func[field.name] = lambda x: x + + data_for_hash = self.data.transform(transform_func) + + # TODO: + # For array/list type column values in pandas DataFrame, pandas hash function + # also cannot support it, expand them if we need support them. 
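# Illustrative, self-contained sketch of the Vector-expansion idea described above
# (not part of this diff; it uses Series.apply instead of DataFrame.transform purely
# for clarity, and the demo_pdf/expanded names are hypothetical):
#
#   import pandas as pd
#   from pyspark.ml.linalg import Vectors
#
#   demo_pdf = pd.DataFrame({
#       "features": [Vectors.dense([1.0, 2.0]), Vectors.dense([3.0, 4.0])],
#       "label": [0.0, 1.0],
#   })
#   # pandas cannot hash the Vector objects directly, but it can hash the scalar
#   # columns obtained by expanding each vector element-wise.
#   expanded = pd.concat(
#       [demo_pdf["features"].apply(lambda v: pd.Series(v.toArray())), demo_pdf[["label"]]],
#       axis=1,
#   )
#   print(pd.util.hash_pandas_object(expanded).sum())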
+ EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data_for_hash) self._hash = md5_gen.hexdigest() return self._hash diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 7a7890795179b..edc7f9278699f 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -7,7 +7,6 @@ ) from mlflow.entities.metric import Metric from mlflow.utils.file_utils import TempDir -from mlflow.tracking.artifact_utils import get_artifact_uri from mlflow.utils.string_utils import truncate_str_from_middle from sklearn import metrics as sk_metrics diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index ccb26b79ebe31..03f3283dd2da6 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -9,7 +9,8 @@ from tests.models.test_evaluation import get_run_data, \ regressor_model_uri, diabetes_dataset, \ classifier_model_uri, iris_dataset, \ - binary_classifier_model_uri, breast_cancer_dataset + binary_classifier_model_uri, breast_cancer_dataset, \ + spark_regressor_model_uri, diabetes_spark_dataset def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): @@ -26,30 +27,43 @@ def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): get_run_data(run.info.run_id) expected_metrics = { - 'example_count_on_data_diabetes_dataset': 148.0, - 'mean_absolute_error_on_data_diabetes_dataset': 42.927, - 'mean_squared_error_on_data_diabetes_dataset': 2747.513, - 'root_mean_squared_error_on_data_diabetes_dataset': 52.416, - 'sum_on_label_on_data_diabetes_dataset': 23099.0, - 'mean_on_label_on_data_diabetes_dataset': 156.074, - 'r2_score_on_data_diabetes_dataset': 0.565, - 'max_error_on_data_diabetes_dataset': 151.354, - 'mean_absolute_percentage_error_on_data_diabetes_dataset': 0.413 + 'example_count': 148.0, + 'mean_absolute_error': 42.927, + 'mean_squared_error': 2747.513, + 'root_mean_squared_error': 52.416, + 'sum_on_label': 23099.0, + 'mean_on_label': 156.074, + 'r2_score': 0.565, + 'max_error': 151.354, + 'mean_absolute_percentage_error': 0.413 } - for metric_key in metrics: - assert np.isclose(metrics[metric_key], expected_metrics[metric_key], rtol=1e-3) + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + '_on_data_diabetes_dataset'], + rtol=1e-3 + ) + assert np.isclose( + expected_metrics[metric_key], + result.metrics[metric_key], + rtol=1e-3 + ) model = mlflow.pyfunc.load_model(regressor_model_uri) assert json.loads(tags['mlflow.datasets']) == \ [{**diabetes_dataset._metadata, 'model': model.metadata.model_uuid}] - assert set(artifacts) == { + expected_artifacts = { 'shap_beeswarm_on_data_diabetes_dataset.png', 'shap_feature_importance_on_data_diabetes_dataset.png', 'shap_summary_on_data_diabetes_dataset.png', } + assert set(artifacts) == expected_artifacts + + assert result.artifacts.keys() == expected_artifacts + def test_multi_classifier_evaluation(classifier_model_uri, iris_dataset): with mlflow.start_run() as run: @@ -70,3 +84,21 @@ def test_bin_classifier_evaluation(binary_classifier_model_uri, breast_cancer_da dataset=breast_cancer_dataset, evaluators='default', ) + print(f'bin-classifier evaluation run: {run.info.run_id}') + + +def test_spark_model_evaluation(spark_regressor_model_uri, diabetes_spark_dataset): + with mlflow.start_run() as run: + result = evaluate( + spark_regressor_model_uri, + model_type='regressor', + 
dataset=diabetes_spark_dataset, + evaluators='default', + evaluator_config={ + 'log_model_explainability': False + } + ) + print(f'spark model evaluation run: {run.info.run_id}') + + params, metrics, tags, artifacts = get_run_data(run.info.run_id) + print(f'spark model metrics={metrics}\n') diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 260dc07f4275e..762cbe694d345 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -31,6 +31,8 @@ ) from pyspark.sql import SparkSession +from pyspark.ml.linalg import Vectors +from pyspark.ml.regression import LinearRegression as SparkLinearRegression from mlflow.tracking.artifact_utils import get_artifact_uri import json @@ -47,6 +49,15 @@ def get_diabetes_dataset(): return data.data, data.target +def get_diabetes_spark_dataset(): + data = sklearn.datasets.load_diabetes() + spark = SparkSession.builder.master("local[*]").getOrCreate() + rows = [(Vectors.dense(features), float(label)) + for features, label in zip(data.data, data.target)] + + return spark.createDataFrame(rows, ['features', 'label']) + + def get_breast_cancer_dataset(): data = sklearn.datasets.load_breast_cancer() return data.data, data.target @@ -94,6 +105,12 @@ def diabetes_dataset(): return EvaluationDataset(data=eval_X, labels=eval_y, name="diabetes_dataset") +@pytest.fixture(scope="module") +def diabetes_spark_dataset(): + spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) + return EvaluationDataset(data=spark_df, labels='label', name="diabetes_spark_dataset") + + @pytest.fixture(scope="module") def breast_cancer_dataset(): X, y = get_breast_cancer_dataset() @@ -114,6 +131,19 @@ def regressor_model_uri(): return regressor_model_uri +@pytest.fixture(scope="module") +def spark_regressor_model_uri(): + spark_df = get_diabetes_spark_dataset() + reg = SparkLinearRegression() + spark_reg_model = reg.fit(spark_df) + + with mlflow.start_run() as run: + mlflow.spark.log_model(spark_reg_model, "spark_reg_model") + spark_regressor_model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model") + + return spark_regressor_model_uri + + @pytest.fixture(scope="module") def classifier_model_uri(): X, y = get_iris() From 19fb0105c80aa10012d3bf7b0790fd30c92f2c18 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 22 Dec 2021 23:50:32 +0800 Subject: [PATCH 088/120] add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 52 +++--- tests/models/test_default_evaluator.py | 164 ++++++++++++++++-- tests/models/test_evaluation.py | 2 +- 3 files changed, 186 insertions(+), 32 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index edc7f9278699f..448822815e63d 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -126,7 +126,7 @@ def _extract_raw_model_and_predict_fn(model): except ImportError: pass - return raw_model, predict_fn, predict_proba_fn + return model_loader_module, raw_model, predict_fn, predict_proba_fn def _gen_log_key(key, dataset_name): @@ -188,6 +188,17 @@ def _log_model_explainability(self): if not self.evaluator_config.get('log_model_explainability', True): return + if self.model_loader_module == 'mlflow.spark': + # TODO: Shap explainer need to manipulate on each feature values, + # but spark model input dataframe contains Vector type feature column + # which shap explainer does not support. 
+ # To support this, we need expand the Vector type feature column into + # multiple scaler feature columns and pass it to shap explainer. + _logger.warning( + 'Log model explainability currently does not support spark model.' + ) + return + try: global _shap_initialized import shap @@ -269,7 +280,7 @@ def plot_beeswarm(): shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - plot_beeswarm, "shap_beeswarm", + plot_beeswarm, "shap_beeswarm_plot", ) def plot_summary(): @@ -277,7 +288,7 @@ def plot_summary(): shap.summary_plot(shap_values, show=False) self._log_image_artifact( - plot_summary, "shap_summary", + plot_summary, "shap_summary_plot", ) def plot_feature_importance(): @@ -286,7 +297,7 @@ def plot_feature_importance(): self._log_image_artifact( plot_feature_importance, - "shap_feature_importance", + "shap_feature_importance_plot", ) def _evaluate_per_class(self, positive_class, y, y_pred, y_proba): @@ -318,7 +329,7 @@ def _gen_metric_name(name): {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} ) self._log_pandas_df_artifact( - roc_curve_pandas_df, _gen_metric_name("roc_curve"), + roc_curve_pandas_df, _gen_metric_name("roc_curve_data"), ) roc_auc = sk_metrics.auc(fpr, tpr) @@ -328,7 +339,7 @@ def plot_roc_curve(): sk_metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot() if hasattr(sk_metrics, 'RocCurveDisplay'): - self._log_image_artifact(plot_roc_curve, _gen_metric_name("roc_curve")) + self._log_image_artifact(plot_roc_curve, _gen_metric_name("roc_curve_plot")) precision, recall, thresholds = \ sk_metrics.precision_recall_curve(y, y_proba) @@ -338,7 +349,7 @@ def plot_roc_curve(): ) self._log_pandas_df_artifact( pr_curve_pandas_df, - _gen_metric_name("precision_recall_curve"), + _gen_metric_name("precision_recall_curve_data"), ) pr_auc = sk_metrics.auc(recall, precision) @@ -350,29 +361,25 @@ def plot_pr_curve(): if hasattr(sk_metrics, 'PrecisionRecallDisplay'): self._log_image_artifact( plot_pr_curve, - _gen_metric_name("precision_recall_curve"), + _gen_metric_name("precision_recall_curve_plot"), ) def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve - raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(self.model) - self.raw_model = raw_model - self.predict_fn = predict_fn - self.predict_proba_fn = predict_proba_fn # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 label_list = sorted(list(set(self.y))) assert label_list[0] == 0, "Label values must being at '0'." 
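# Illustrative note (not part of this diff): the assertion above means evaluation
# labels are expected to already be integers 0, 1, ..., num_classes - 1. A dataset
# with other label values could be remapped beforehand, for example with sklearn's
# LabelEncoder (the model under evaluation would then also have to predict the
# encoded labels):
#
#   from sklearn.preprocessing import LabelEncoder
#   encoded = LabelEncoder().fit_transform(["cat", "dog", "dog", "bird"])
#   # encoded == array([1, 2, 2, 0]); classes_ == ['bird', 'cat', 'dog']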
self.num_classes = len(label_list) - self.y_pred = predict_fn(self.X) + self.y_pred = self.predict_fn(self.X) self.is_binomial = self.num_classes <= 2 self.metrics["accuracy"] = sk_metrics.accuracy_score(self.y, self.y_pred) self.metrics["example_count"] = len(self.X) - if predict_proba_fn is not None: - self.y_probs = predict_proba_fn(self.X) + if self.predict_proba_fn is not None: + self.y_probs = self.predict_proba_fn(self.X) if self.is_binomial: self.y_prob = self.y_probs[:, 1] else: @@ -381,13 +388,13 @@ def _evaluate_classifier(self): self.y_probs = None self.y_prob = None - if predict_proba_fn is not None: + if self.predict_proba_fn is not None: self.metrics['log_loss'] = sk_metrics.log_loss(self.y, self.y_probs) if self.is_binomial: self._evaluate_per_class(None, self.y, self.y_pred, self.y_prob) self._log_image_artifact( - lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve", + lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", ) else: self.metrics['f1_score_micro'] = \ @@ -430,10 +437,6 @@ def _evaluate_regressor(self): self.metrics['mean_absolute_percentage_error'] = \ sk_metrics.mean_absolute_percentage_error(self.y, self.y_pred) - raw_model, predict_fn, _ = _extract_raw_model_and_predict_fn(self.model) - self.raw_model = raw_model - self.predict_fn = predict_fn - self._log_metrics() self._log_model_explainability() return EvaluationResult(self.metrics, self.artifacts) @@ -459,6 +462,13 @@ def evaluate( self.dataset_name = dataset.name self.feature_names = dataset.feature_names + model_loader_module, raw_model, predict_fn, predict_proba_fn = \ + _extract_raw_model_and_predict_fn(model) + self.model_loader_module = model_loader_module + self.raw_model = raw_model + self.predict_fn = predict_fn + self.predict_proba_fn = predict_proba_fn + X, y = dataset._extract_features_and_labels() self.X = X self.y = y diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 03f3283dd2da6..c8b40e5fd7916 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -54,15 +54,16 @@ def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): assert json.loads(tags['mlflow.datasets']) == \ [{**diabetes_dataset._metadata, 'model': model.metadata.model_uuid}] - expected_artifacts = { - 'shap_beeswarm_on_data_diabetes_dataset.png', - 'shap_feature_importance_on_data_diabetes_dataset.png', - 'shap_summary_on_data_diabetes_dataset.png', + assert set(artifacts) == { + 'shap_beeswarm_plot_on_data_diabetes_dataset.png', + 'shap_feature_importance_plot_on_data_diabetes_dataset.png', + 'shap_summary_plot_on_data_diabetes_dataset.png', + } + assert result.artifacts.keys() == { + 'shap_beeswarm_plot', + 'shap_feature_importance_plot', + 'shap_summary_plot', } - - assert set(artifacts) == expected_artifacts - - assert result.artifacts.keys() == expected_artifacts def test_multi_classifier_evaluation(classifier_model_uri, iris_dataset): @@ -75,6 +76,68 @@ def test_multi_classifier_evaluation(classifier_model_uri, iris_dataset): ) print(f'multi-classifier evaluation run: {run.info.run_id}') + params, metrics, tags, artifacts = \ + get_run_data(run.info.run_id) + + expected_metrics = { + 'accuracy': 0.32, 'example_count': 50, 'log_loss': 0.9712, 'f1_score_micro': 0.32, + 'f1_score_macro': 0.1616, 'class_0_true_negatives': 33, + 'class_0_false_positives': 0, 'class_0_false_negatives': 17, 'class_0_true_positives': 0, + 'class_0_recall': 0.0, 'class_0_precision': 0.0, 'class_0_f1_score': 0.0, + 
'class_0_roc_auc': 1.0, 'class_0_precision_recall_auc': 1.0, 'class_1_true_negatives': 33, + 'class_1_false_positives': 0, 'class_1_false_negatives': 17, 'class_1_true_positives': 0, + 'class_1_recall': 0.0, 'class_1_precision': 0.0, 'class_1_f1_score': 0.0, + 'class_1_roc_auc': 0.9411, 'class_1_precision_recall_auc': 0.8989, + 'class_2_true_negatives': 0, 'class_2_false_positives': 34, 'class_2_false_negatives': 0, + 'class_2_true_positives': 16, 'class_2_recall': 1.0, 'class_2_precision': 0.32, + 'class_2_f1_score': 0.4848, 'class_2_roc_auc': 0.9963, + 'class_2_precision_recall_auc': 0.9921} + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + '_on_data_iris_dataset'], + rtol=1e-3 + ) + assert np.isclose( + expected_metrics[metric_key], + result.metrics[metric_key], + rtol=1e-3 + ) + + model = mlflow.pyfunc.load_model(classifier_model_uri) + + assert json.loads(tags['mlflow.datasets']) == \ + [{**iris_dataset._metadata, 'model': model.metadata.model_uuid}] + + assert set(artifacts) == { + 'class_0_precision_recall_curve_data_on_data_iris_dataset.csv', + 'class_0_precision_recall_curve_plot_on_data_iris_dataset.png', + 'class_0_roc_curve_data_on_data_iris_dataset.csv', + 'class_0_roc_curve_plot_on_data_iris_dataset.png', + 'class_1_precision_recall_curve_data_on_data_iris_dataset.csv', + 'class_1_precision_recall_curve_plot_on_data_iris_dataset.png', + 'class_1_roc_curve_data_on_data_iris_dataset.csv', + 'class_1_roc_curve_plot_on_data_iris_dataset.png', + 'class_2_precision_recall_curve_data_on_data_iris_dataset.csv', + 'class_2_precision_recall_curve_plot_on_data_iris_dataset.png', + 'class_2_roc_curve_data_on_data_iris_dataset.csv', + 'class_2_roc_curve_plot_on_data_iris_dataset.png', + 'confusion_matrix_on_data_iris_dataset.png', + 'explainer_on_data_iris_dataset', + 'shap_beeswarm_plot_on_data_iris_dataset.png', + 'shap_feature_importance_plot_on_data_iris_dataset.png', + 'shap_summary_plot_on_data_iris_dataset.png', + } + assert result.artifacts.keys() == { + 'class_0_roc_curve_data', 'class_0_roc_curve_plot', 'class_0_precision_recall_curve_data', + 'class_0_precision_recall_curve_plot', 'class_1_roc_curve_data', 'class_1_roc_curve_plot', + 'class_1_precision_recall_curve_data', 'class_1_precision_recall_curve_plot', + 'class_2_roc_curve_data', 'class_2_roc_curve_plot', 'class_2_precision_recall_curve_data', + 'class_2_precision_recall_curve_plot', 'confusion_matrix', 'shap_beeswarm_plot', + 'shap_summary_plot', 'shap_feature_importance_plot' + } + def test_bin_classifier_evaluation(binary_classifier_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: @@ -86,6 +149,57 @@ def test_bin_classifier_evaluation(binary_classifier_model_uri, breast_cancer_da ) print(f'bin-classifier evaluation run: {run.info.run_id}') + params, metrics, tags, artifacts = \ + get_run_data(run.info.run_id) + + expected_metrics = { + 'accuracy': 0.957, + 'example_count': 190, + 'log_loss': 0.0918, + 'true_negatives': 71, + 'false_positives': 5, + 'false_negatives': 3, + 'true_positives': 111, + 'recall': 0.9736, + 'precision': 0.9568, + 'f1_score': 0.9652, + 'roc_auc': 0.995, + 'precision_recall_auc': 0.997 + } + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + '_on_data_breast_cancer_dataset'], + rtol=1e-3 + ) + assert np.isclose( + expected_metrics[metric_key], + result.metrics[metric_key], + rtol=1e-3 + ) + + model = 
mlflow.pyfunc.load_model(binary_classifier_model_uri) + + assert json.loads(tags['mlflow.datasets']) == \ + [{**breast_cancer_dataset._metadata, 'model': model.metadata.model_uuid}] + + assert set(artifacts) == { + 'confusion_matrix_on_data_breast_cancer_dataset.png', + 'lift_curve_plot_on_data_breast_cancer_dataset.png', + 'precision_recall_curve_data_on_data_breast_cancer_dataset.csv', + 'precision_recall_curve_plot_on_data_breast_cancer_dataset.png', + 'roc_curve_data_on_data_breast_cancer_dataset.csv', + 'roc_curve_plot_on_data_breast_cancer_dataset.png', + 'shap_beeswarm_plot_on_data_breast_cancer_dataset.png', + 'shap_feature_importance_plot_on_data_breast_cancer_dataset.png', + 'shap_summary_plot_on_data_breast_cancer_dataset.png' + } + assert result.artifacts.keys() == { + 'roc_curve_data', 'roc_curve_plot', 'precision_recall_curve_data', + 'precision_recall_curve_plot', 'lift_curve_plot', 'confusion_matrix', + 'shap_beeswarm_plot', 'shap_summary_plot', 'shap_feature_importance_plot' + } + def test_spark_model_evaluation(spark_regressor_model_uri, diabetes_spark_dataset): with mlflow.start_run() as run: @@ -95,10 +209,40 @@ def test_spark_model_evaluation(spark_regressor_model_uri, diabetes_spark_datase dataset=diabetes_spark_dataset, evaluators='default', evaluator_config={ - 'log_model_explainability': False + 'log_model_explainability': True } ) print(f'spark model evaluation run: {run.info.run_id}') params, metrics, tags, artifacts = get_run_data(run.info.run_id) - print(f'spark model metrics={metrics}\n') + + expected_metrics = { + 'example_count': 139.0, + 'mean_absolute_error': 45.672, + 'mean_squared_error': 3009.048, + 'root_mean_squared_error': 54.854, + 'sum_on_label': 21183.0, + 'mean_on_label': 152.395, + 'r2_score': 0.491, + 'max_error': 136.170, + 'mean_absolute_percentage_error': 0.41392110539896615 + } + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + '_on_data_diabetes_spark_dataset'], + rtol=1e-3 + ) + assert np.isclose( + expected_metrics[metric_key], + result.metrics[metric_key], + rtol=1e-3 + ) + + model = mlflow.pyfunc.load_model(spark_regressor_model_uri) + + assert json.loads(tags['mlflow.datasets']) == \ + [{**diabetes_spark_dataset._metadata, 'model': model.metadata.model_uuid}] + + assert set(artifacts) == set() + assert result.artifacts == {} diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 762cbe694d345..1511e8598f8ad 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -147,7 +147,7 @@ def spark_regressor_model_uri(): @pytest.fixture(scope="module") def classifier_model_uri(): X, y = get_iris() - clf = sklearn.linear_model.LogisticRegression() + clf = sklearn.linear_model.LogisticRegression(max_iter=2) clf.fit(X, y) with mlflow.start_run() as run: From 0241e0aedbf77685667bf94213a5b5e35b20d91b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 23 Dec 2021 09:38:10 +0800 Subject: [PATCH 089/120] add shap version check Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 448822815e63d..4932f2b8e9a0e 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -17,6 +17,7 @@ import time from functools import partial import logging +from packaging.version import 
Version """ [P0] Accuracy: Calculates how often predictions equal labels. @@ -80,9 +81,6 @@ def _load_content_from_file(self, local_artifact_path): _DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 -_shap_initialized = False - - def _infer_model_type_by_labels(labels): distinct_labels = set(labels) for v in distinct_labels: @@ -200,21 +198,18 @@ def _log_model_explainability(self): return try: - global _shap_initialized import shap - import shap.maskers - from IPython.core.display import display, HTML - - if not _shap_initialized: - # Call `shap.getjs` instead of call `shap.initjs` to prevent - # display a logo picture in IPython notebook. - display(HTML(shap.getjs())) - _shap_initialized = True + import matplotlib.pyplot as pyplot except ImportError: - _logger.warning('Shap package is not installed. Skip log model explainability.') - return + _logger.warning( + 'Shap or matplotlib package is not installed, Skip log model explainability.' + ) - import matplotlib.pyplot as pyplot + if Version(shap.__version__) < Version('0.40'): + _logger.warning( + 'Shap package version is lower than 0.40, Skip log model explainability.' + ) + return is_multinomial_classifier = self.model_type == 'classifier' and self.num_classes > 2 From c94612e7b79f1459abf4efa43ac954c7a5c225fc Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 23 Dec 2021 18:35:27 +0800 Subject: [PATCH 090/120] update docs, loose classifier label limit Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 32 ++++++--- mlflow/models/evaluation/default_evaluator.py | 72 +++++++------------ 2 files changed, 47 insertions(+), 57 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 37a7116a7840e..9081451cc7a1a 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -531,7 +531,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) f'Multiple registered evaluators are found {evaluator_name_list} and ' 'they will all be used in evaluation if they support the specified model type. ' 'If you want to evaluate with one evaluator, specify the `evaluator` argument ' - 'and (optional) specify the `evaluator_config` argument.') + 'and optionally specify the `evaluator_config` argument.') if evaluator_config is not None: conf_dict_value_error = ValueError( "If `evaluators` argument is None, all available evaluators will be used. " @@ -565,7 +565,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) if evaluator_config is not None: if not check_nesting_config_dict(evaluators, evaluator_config): raise ValueError( - "If `evaluators` argument is a evaluator name list, evaluator_config " + "If `evaluators` argument is an evaluator name list, evaluator_config " "must be a dict contains mapping from evaluator name to individual " "evaluator config dict." ) @@ -573,7 +573,7 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) evaluator_name_to_conf_map = evaluator_config or {} else: raise ValueError( - '`evaluators` argument must be None, a evaluator name string, or a list of ' + '`evaluators` argument must be None, an evaluator name string, or a list of ' 'evaluator names.' ) @@ -661,13 +661,25 @@ def evaluate( The default evaluator supports the 'regressor' and 'classifer' `model_type`s. The available `evaluator_config` options for the default evaluator include: - - log_model_explainability: A boolean value to specify whether log model explainability, - default value is True. 
- - explainality_algorithm: A string to specify the shap explainer algorithm. If not set, - it will run `shap.Explainer` with "auto" algorithm, which choose the best fit explainer - according to the model. - - explainability_nsamples: The number of sample rows to use for calculating model - explainability. Default value is 2000. + - log_model_explainability: A boolean value specifying whether or not to log model + explainability insights, default value is True. + - explainability_algorithm: A string to specify the SHAP Explainer algorithm for model + explainability. If not set, `shap.Explainer` is used with the "auto" algorithm, which + chooses the best Explainer based on the model. + - explainability_nsamples: The number of sample rows to use for computing model + explainability insights. Default value is 2000. + + Limitations of evaluation dataset: + - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be + scalar value columns, or object type columns with values of "pyspark.ml.Vector" type. + Other object types (nd.array/list/etc.) are not supported yet. + + Limitations of default evaluator logging model explainability insights: + - The `shap.Explainer` "auto" algorithm will choose Linear explainer for linear model, + and choose Tree explainer for tree model. But the shap Linear/Tree explainer does not + support multi-class classifier, in this case, default evaluator will fallback to use + shap Exact or Permutation explainer. + - Logging model explainability insights is not currently supported for PySpark models. """ from mlflow.pyfunc import PyFuncModel diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 4932f2b8e9a0e..939f0561aa6ae 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -19,42 +19,8 @@ import logging from packaging.version import Version -""" -[P0] Accuracy: Calculates how often predictions equal labels. -[P0] BinaryCrossentropy: Computes the crossentropy metric between the labels and predictions. -[P0] Hinge: Computes the hinge metric between y_true and y_pred. -[P0] Sum: Computes the (weighted) sum of the given values. -[P0] Mean: Computes the (weighted) mean of the given values. -[P0] ExampleCount: Computes the total number of evaluation examples. -[P0] MeanAbsoluteError: Computes the mean absolute error between the labels and predictions. -[P0] MeanSquaredError: Computes the mean squared error between y_true and y_pred. -[P0] RootMeanSquaredError: Computes root mean squared error metric between y_true and y_pred. - -[P0] TrueNegatives: Calculates the number of true negatives. -[P0] TruePositives: Calculates the number of true positives. -[P0] FalseNegatives: Calculates the number of false negatives. -[P0] FalsePositives: Calculates the number of false positives. -https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix - -[P0] Precision: Computes the precision of the predictions with respect to the labels. -[P0] Recall: Computes the recall of the predictions with respect to the labels. -[P0] AUC: Approximates the AUC (Area under the curve) of the ROC or PR curves. 
-[P0] F1 Score: 2*precision*recall / (precision+recall) - -[P0] BinaryClassConfusionMatrix - -Plots -[P0] Confusion matrix -[P0] Interactive ROC curve with metrics (TP/TN/FP/FN/Acc/F1/AUC), binary classification -[P0] Lift chart - -Global explainability -[P0] Model built-in feature importance (supported models) -[P0] SHAP explainers - [P0] Summary plot -""" - -from PIL.Image import Image, open as open_image + +from PIL.Image import open as open_image _logger = logging.getLogger(__name__) @@ -193,7 +159,8 @@ def _log_model_explainability(self): # To support this, we need expand the Vector type feature column into # multiple scaler feature columns and pass it to shap explainer. _logger.warning( - 'Log model explainability currently does not support spark model.' + 'Logging model explainability insights is not currently supported for PySpark' + ' models.' ) return @@ -202,7 +169,8 @@ def _log_model_explainability(self): import matplotlib.pyplot as pyplot except ImportError: _logger.warning( - 'Shap or matplotlib package is not installed, Skip log model explainability.' + 'SHAP or matplotlib package is not installed, so model explainability insights ' + 'will not be logged.' ) if Version(shap.__version__) < Version('0.40'): @@ -295,11 +263,12 @@ def plot_feature_importance(): "shap_feature_importance_plot", ) - def _evaluate_per_class(self, positive_class, y, y_pred, y_proba): + def _log_per_class_evaluations(self, positive_class, y, y_pred, y_proba): """ - if positive_class is an interger, generate metrics and artifacts on this class vs. rest, - and the y/y_pred/y_proba must be sum up to a binary "is class" and "is not class" - if positive_class is None, generate metrics and artifacts on binary y/y_pred/y_proba + if positive_class is an integer, generate metrics and artifacts on this class vs. rest. + The y/y_pred/y_proba must sum up to a binary "is class" and "is not class". + + If positive_class is None, generate metrics and artifacts on binary y/y_pred/y_proba. """ def _gen_metric_name(name): @@ -362,14 +331,20 @@ def plot_pr_curve(): def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve - # Note: require labels to be number of 0, 1, 2, .. num_classes - 1 label_list = sorted(list(set(self.y))) - assert label_list[0] == 0, "Label values must being at '0'." self.num_classes = len(label_list) self.y_pred = self.predict_fn(self.X) self.is_binomial = self.num_classes <= 2 + if self.is_binomial: + for label in label_list: + if int(label) not in [-1, 0, 1]: + raise ValueError( + 'Binomial classification require evaluation dataset label values to be ' + '-1, 0, or 1.' 
+ ) + self.metrics["accuracy"] = sk_metrics.accuracy_score(self.y, self.y_pred) self.metrics["example_count"] = len(self.X) @@ -387,7 +362,10 @@ def _evaluate_classifier(self): self.metrics['log_loss'] = sk_metrics.log_loss(self.y, self.y_probs) if self.is_binomial: - self._evaluate_per_class(None, self.y, self.y_pred, self.y_prob) + self._log_per_class_evaluations( + positive_class=None, + y=self.y, y_pred=self.y_pred, y_proba=self.y_prob + ) self._log_image_artifact( lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", ) @@ -400,7 +378,7 @@ def _evaluate_classifier(self): y_per_class = np.where(self.y == postive_class, 1, 0) y_pred_per_class = np.where(self.y_pred == postive_class, 1, 0) pos_class_prob = self.y_probs[:, postive_class] - self._evaluate_per_class( + self._log_per_class_evaluations( postive_class, y_per_class, y_pred_per_class, pos_class_prob ) @@ -476,7 +454,7 @@ def evaluate( _logger.warning( f"According to the evaluation dataset label values, the model type looks like " f"{infered_model_type}, but you specified model type {model_type}. Please " - f"check you set model type or evaluation dataset correctly." + f"verify that you set the `model_type` and `dataset` arguments correctly." ) if model_type == "classifier": From 47bde7b068f12042141812ac156f29464e743b74 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 23 Dec 2021 21:08:24 +0800 Subject: [PATCH 091/120] add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +- mlflow/models/evaluation/default_evaluator.py | 2 +- tests/models/test_evaluation.py | 42 +++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 9081451cc7a1a..a688b39f5c2b3 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -14,6 +14,7 @@ import logging import struct import sys +from collections import OrderedDict _logger = logging.getLogger(__name__) @@ -569,7 +570,8 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) "must be a dict contains mapping from evaluator name to individual " "evaluator config dict." ) - evaluator_name_list = list(set(evaluators)) + # Use `OrderedDict.fromkeys` to deduplicate elements but keep elements order. 
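# Quick illustration of the comment above (not part of this diff): set() loses the
# caller-specified evaluator order, while OrderedDict (or a plain dict on Python 3.7+)
# preserves insertion order during deduplication:
#
#   assert list(OrderedDict.fromkeys(["b", "a", "b"])) == ["b", "a"]
#   # list(set(["b", "a", "b"])) gives no such ordering guarantee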
+ evaluator_name_list = list(OrderedDict.fromkeys(evaluators)) evaluator_name_to_conf_map = evaluator_config or {} else: raise ValueError( diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 939f0561aa6ae..7f938cc18fd61 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -50,7 +50,7 @@ def _load_content_from_file(self, local_artifact_path): def _infer_model_type_by_labels(labels): distinct_labels = set(labels) for v in distinct_labels: - if v < 0 or not float(v).is_integer(): + if not float(v).is_integer(): return "regressor" if len(distinct_labels) > 1000 and len(distinct_labels) / len(labels) > 0.7: return "regressor" diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 1511e8598f8ad..b6f59d0afd30e 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -9,6 +9,8 @@ EvaluationArtifact, EvaluationMetrics, ) +from mlflow.models.evaluation.base import \ + _normalize_evaluators_and_evaluator_config_args as _normalize_config import hashlib from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn @@ -22,6 +24,7 @@ from mlflow.utils.file_utils import TempDir from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry +from mlflow.models.evaluation.base import _logger as _base_logger from sklearn.metrics import ( accuracy_score, @@ -540,3 +543,42 @@ def test_start_run_or_reuse_active_run(): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass + +def test_normalize_evaluators_and_evaluator_config_args(): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + with mock.patch.object( + _model_evaluation_registry, + "_registry", + {"default": DefaultEvaluator}, + ): + assert _normalize_config(None, None) == (['default'], {}) + assert _normalize_config(None, {'a': 3}) == (['default'], {'default': {'a': 3}}) + assert _normalize_config(None, {'default': {'a': 3}}) == (['default'], {'default': {'a': 3}}) + + assert _normalize_config(None, None) == (['default', 'dummy_evaluator'], {}) + with pytest.raises( + ValueError, + match='`evaluator_config` argument must be a dictionary mapping each evaluator' + ): + assert _normalize_config(None, {'a': 3}) == (['default', 'dummy_evaluator'], {}) + + assert _normalize_config(None, {'default': {'a': 3}}) == ( + ['default', 'dummy_evaluator'], {'default': {'a': 3}} + ) + + with mock.patch.object(_base_logger, 'warning') as patched_warning_fn: + _normalize_config(None, None) + patched_warning_fn.assert_called_once() + assert 'Multiple registered evaluators are found' in patched_warning_fn.call_args[0][0] + + assert _normalize_config('dummy_evaluator', {'a': 3}) == \ + (['dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) + + assert _normalize_config(['default', 'dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) == \ + (['default', 'dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) + + with pytest.raises( + ValueError, + match='evaluator_config must be a dict contains mapping from evaluator name to' + ): + _normalize_config(['default', 'dummy_evaluator'], {'abc': {'a': 3}}) From 653b1f357f368cd39e2500de1ce49220cf094e13 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 28 Dec 2021 13:11:20 +0800 Subject: [PATCH 092/120] multiclass classifier merge metrics/plots Signed-off-by: Weichen Xu --- 
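For context on the diff below: instead of logging dozens of per-class scalar metrics
(class_0_precision, class_1_precision, ...), the reworked multiclass path logs a single
per-class metrics table plus merged ROC and precision-recall curve data. A rough sketch
of consuming that table after an evaluation run (the local file name is hypothetical; it
follows the "<key>_on_data_<dataset name>" convention used by the evaluator):

    import pandas as pd

    per_class = pd.read_csv("per_class_metrics_data_on_data_iris_dataset.csv")
    print(per_class[["positive_class", "precision", "recall", "f1_score"]])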
mlflow/models/evaluation/default_evaluator.py | 185 +++++++++++++----- mlflow/models/utils.py | 15 ++ 2 files changed, 149 insertions(+), 51 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 7f938cc18fd61..52456566fa688 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -8,9 +8,11 @@ from mlflow.entities.metric import Metric from mlflow.utils.file_utils import TempDir from mlflow.utils.string_utils import truncate_str_from_middle +from mlflow.models.utils import plot_lines from sklearn import metrics as sk_metrics import math +from collections import namedtuple import pandas as pd import numpy as np import json @@ -263,75 +265,163 @@ def plot_feature_importance(): "shap_feature_importance_plot", ) - def _log_per_class_evaluations(self, positive_class, y, y_pred, y_proba): + def _get_per_class_metrics(self, y, y_pred): """ - if positive_class is an integer, generate metrics and artifacts on this class vs. rest. - The y/y_pred/y_proba must sum up to a binary "is class" and "is not class". - - If positive_class is None, generate metrics and artifacts on binary y/y_pred/y_proba. + For binary classifier, y/y_pred is for the positive class. + For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". """ - - def _gen_metric_name(name): - if positive_class is not None: - return f"class_{positive_class}_{name}" - else: - return name - + metrics = {} confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) tn, fp, fn, tp = confusion_matrix.ravel() - self.metrics[_gen_metric_name("true_negatives")] = tn - self.metrics[_gen_metric_name("false_positives")] = fp - self.metrics[_gen_metric_name("false_negatives")] = fn - self.metrics[_gen_metric_name("true_positives")] = tp - self.metrics[_gen_metric_name("recall")] = sk_metrics.recall_score(y, y_pred) - self.metrics[_gen_metric_name("precision")] = sk_metrics.precision_score(y, y_pred) - self.metrics[_gen_metric_name("f1_score")] = sk_metrics.f1_score(y, y_pred) - - if y_proba is not None: - fpr, tpr, thresholds = sk_metrics.roc_curve(y, y_proba) + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + return metrics + + def _log_binary_classifier(self): + self.metrics.update(self._get_per_class_metrics(self.y, self.y_pred)) + + if self.y_prob is not None: + fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) roc_curve_pandas_df = pd.DataFrame( {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} ) self._log_pandas_df_artifact( - roc_curve_pandas_df, _gen_metric_name("roc_curve_data"), + roc_curve_pandas_df, "roc_curve_data", ) roc_auc = sk_metrics.auc(fpr, tpr) - self.metrics[_gen_metric_name("roc_auc")] = roc_auc + self.metrics["roc_auc"] = roc_auc def plot_roc_curve(): - sk_metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot() + # Do not use sklearn.metrics roc plot API because + # older sklearn verison < 0.24 does not support. 
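# plot_lines is a small matplotlib helper added to mlflow/models/utils.py later in this
# patch; for reference, a standalone call looks roughly like this (values illustrative):
#
#   fig = plot_lines(
#       {"roc": ([0.0, 0.5, 1.0], [0.0, 0.8, 1.0])},
#       xlabel="False Positive Rate",
#       ylabel="True Positive Rate",
#       line_kwargs={"drawstyle": "steps-post"},
#   )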
+ plot_lines( + {'roc': (fpr, tpr)}, + xlabel='False Positive Rate', ylabel='True Positive Rate', + line_kwargs={"drawstyle": "steps-post"} + ) - if hasattr(sk_metrics, 'RocCurveDisplay'): - self._log_image_artifact(plot_roc_curve, _gen_metric_name("roc_curve_plot")) + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") precision, recall, thresholds = \ - sk_metrics.precision_recall_curve(y, y_proba) + sk_metrics.precision_recall_curve(self.y, self.y_prob) thresholds = np.append(thresholds, [1.0], axis=0) pr_curve_pandas_df = pd.DataFrame( {"precision": precision, "recall": recall, "thresholds": thresholds} ) - self._log_pandas_df_artifact( - pr_curve_pandas_df, - _gen_metric_name("precision_recall_curve_data"), - ) + self._log_pandas_df_artifact(pr_curve_pandas_df, "precision_recall_curve_data") pr_auc = sk_metrics.auc(recall, precision) - self.metrics[_gen_metric_name("precision_recall_auc")] = pr_auc + self.metrics["precision_recall_auc"] = pr_auc + + def plot_precision_recall_curve(): + # Do not use sklearn.metrics precision-recall plot API because + # older sklearn verison < 0.24 does not support. + plot_lines( + {'pr_curve': (recall, precision)}, xlabel='recall', ylabel='precision', + line_kwargs={"drawstyle": "steps-post"} + ) - def plot_pr_curve(): - sk_metrics.PrecisionRecallDisplay(precision, recall).plot() + self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") - if hasattr(sk_metrics, 'PrecisionRecallDisplay'): - self._log_image_artifact( - plot_pr_curve, - _gen_metric_name("precision_recall_curve_plot"), + def _log_multiclass_classifier(self): + + per_class_metrics_list = [] + per_class_roc_curve_data_list = [] + per_class_precision_recall_curve_data_list = [] + + PerClassRocCurveData = namedtuple('PerClassRocCurveData', ['postive_class', 'fpr', 'tpr', 'thresholds']) + PerClassPrecisionRecallCurveData = namedtuple( + 'PerClassPrecisionRecallCurveData', ['postive_class', 'precision', 'recall', 'thresholds'] + ) + + for postive_class in self.label_list: + y_is_positive = np.where(self.y == postive_class, 1, 0) + y_pred_is_positive = np.where(self.y_pred == postive_class, 1, 0) + + if self.y_probs is not None: + prob_of_positive = self.y_probs[:, postive_class] + + per_class_metrics = {'positive_class': postive_class} + per_class_metrics_list.append(per_class_metrics) + + per_class_metrics.update(self._get_per_class_metrics(y_is_positive, y_pred_is_positive)) + + if self.y_probs is not None: + fpr, tpr, thresholds = sk_metrics.roc_curve(y_is_positive, prob_of_positive) + per_class_roc_curve_data_list.append( + PerClassRocCurveData(postive_class, fpr, tpr, thresholds) + ) + roc_auc = sk_metrics.auc(fpr, tpr) + per_class_metrics["roc_auc"] = roc_auc + + precision, recall, thresholds = \ + sk_metrics.precision_recall_curve(y_is_positive, prob_of_positive) + thresholds = np.append(thresholds, [1.0], axis=0) + per_class_precision_recall_curve_data_list.append( + PerClassPrecisionRecallCurveData(postive_class, precision, recall, thresholds) + ) + pr_auc = sk_metrics.auc(recall, precision) + per_class_metrics["precision_recall_auc"] = pr_auc + + per_class_metrics_pandas_df = pd.DataFrame(per_class_metrics_list) + self._log_pandas_df_artifact(per_class_metrics_pandas_df, "per_class_metrics_data") + + if self.y_probs is not None: + per_class_roc_curve_pandas_df = pd.concat( + [pd.DataFrame(item._asdict()) for item in per_class_roc_curve_data_list], + ignore_index=True + ) + self._log_pandas_df_artifact(per_class_roc_curve_pandas_df, 
"per_class_roc_curve_data") + + per_class_precision_recall_curve_pandas_df = pd.concat( + [pd.DataFrame(item._asdict()) for item in per_class_precision_recall_curve_data_list], + ignore_index=True + ) + self._log_pandas_df_artifact( + per_class_precision_recall_curve_pandas_df, + "per_class_precision_recall_curve_data" + ) + + def plot_roc_curve(): + data_series = { + f'Positive Class = {postive_class}': (fpr, tpr) + for postive_class, fpr, tpr, _ in per_class_roc_curve_data_list + } + plot_lines( + data_series, xlabel='False Positive Rate', ylabel='True Positive Rate', + legend_loc='lower right', + line_kwargs={"drawstyle": "steps-post"} ) + def plot_precision_recall_curve(): + data_series = { + f'Positive Class = {postive_class}': (recall, precision) + for postive_class, precision, recall, _ in per_class_precision_recall_curve_data_list + } + plot_lines( + data_series, xlabel='recall', ylabel='precision', + legend_loc='lower left', + line_kwargs={"drawstyle": "steps-post"} + ) + + if self.num_classes <= 10: + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") + else: + _logger.warning('The classifier num_classes > 10, skip logging plots for ROC curve and ' + 'Precision-Recall curve.') + def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve label_list = sorted(list(set(self.y))) + self.label_list = label_list self.num_classes = len(label_list) self.y_pred = self.predict_fn(self.X) @@ -358,14 +448,14 @@ def _evaluate_classifier(self): self.y_probs = None self.y_prob = None + if self.is_binomial: + self._log_binary_classifier() + else: + self._log_multiclass_classifier() + if self.predict_proba_fn is not None: self.metrics['log_loss'] = sk_metrics.log_loss(self.y, self.y_probs) - if self.is_binomial: - self._log_per_class_evaluations( - positive_class=None, - y=self.y, y_pred=self.y_pred, y_proba=self.y_prob - ) self._log_image_artifact( lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", ) @@ -374,13 +464,6 @@ def _evaluate_classifier(self): sk_metrics.f1_score(self.y, self.y_pred, average='micro') self.metrics['f1_score_macro'] = \ sk_metrics.f1_score(self.y, self.y_pred, average='macro') - for postive_class in range(self.num_classes): - y_per_class = np.where(self.y == postive_class, 1, 0) - y_pred_per_class = np.where(self.y_pred == postive_class, 1, 0) - pos_class_prob = self.y_probs[:, postive_class] - self._log_per_class_evaluations( - postive_class, y_per_class, y_pred_per_class, pos_class_prob - ) # TODO: Shall we also log confusion_matrix data as a json artifact ? 
confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index 137a1eac0f0ab..a8813aa136a84 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -234,3 +234,18 @@ def _read_sparse_matrix_from_json(path, example_type): return csc_matrix((data, indices, indptr), shape=shape) else: return csr_matrix((data, indices, indptr), shape=shape) + + +def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs={}): + import matplotlib.pyplot as plt + fig, ax = plt.subplots() + + for label, (data_x, data_y) in data_series.items(): + ax.plot(data_x, data_y, label=label, **line_kwargs) + + if legend_loc: + ax.legend(loc=legend_loc) + + ax.set(xlabel=xlabel, ylabel=ylabel) + + return fig From c6b7ed6c5069266df1b05e89d066261b32cba3a7 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 29 Dec 2021 17:37:26 +0800 Subject: [PATCH 093/120] zfill feature name Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index a688b39f5c2b3..f3cdea4875197 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -14,6 +14,7 @@ import logging import struct import sys +import math from collections import OrderedDict @@ -244,7 +245,10 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): raise ValueError('feature name list must be the same length with feature data.') self._feature_names = feature_names else: - self._feature_names = [f'f_{i}' for i in range(num_features)] + self._feature_names = [ + f'f_{str(i).zfill(math.ceil((math.log10(num_features))))}' + for i in range(num_features) + ] else: pd_column_names = [c for c in self.data.columns if c != self.labels] if feature_names is not None: From f414cc0464c2f313dcd533b2915b4d2445feb4f2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 29 Dec 2021 18:22:27 +0800 Subject: [PATCH 094/120] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index f3cdea4875197..b09a01692307a 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -172,6 +172,8 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): - A Pandas DataFrame, or a spark DataFrame, containing evaluation features and labels. All columns will be regarded as feature columns except the "labels" column. + Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must + be a spark DataFrame contains a feature column of "Vector" type, and a label column. :param labels: One of the following: - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. @@ -183,7 +185,10 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) - :param feature_names: (Optional) A list of the feature names. + :param feature_names: (Optional) A list of the feature names attached to the numpy array + input data. The argument is only useful in the case the input data is numpy array. + For pandas DataFrame input case, the pandas column name will be used as feature name. + The feature names will be shown in model explainability plots. 
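        For example (hypothetical feature names, shown for illustration only)::

            import numpy as np

            dataset = EvaluationDataset(
                data=np.array([[5.1, 3.5], [6.2, 2.9]]),
                labels=np.array([0, 1]),
                feature_names=["sepal_length", "sepal_width"],
            )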
""" import numpy as np import pandas as pd @@ -677,8 +682,9 @@ def evaluate( Limitations of evaluation dataset: - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be - scalar value columns, or object type columns with values of "pyspark.ml.Vector" type. - Other object types (nd.array/list/etc.) are not supported yet. + scalar value columns, other object types (nd.array/list/etc.) are not supported yet. + - If the mlflow model to be evaluated is a pyspark ML model, then the input data must + be a spark DataFrame contains a feature column of "Vector" type, and a label column. Limitations of default evaluator logging model explainability insights: - The `shap.Explainer` "auto" algorithm will choose Linear explainer for linear model, From ea921d701c6d852a87e524cd3edc7890db14822e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 29 Dec 2021 20:01:10 +0800 Subject: [PATCH 095/120] add config max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 3 ++ mlflow/models/evaluation/default_evaluator.py | 37 ++++++++++++------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b09a01692307a..4eb7d852ef2d2 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -679,6 +679,9 @@ def evaluate( chooses the best Explainer based on the model. - explainability_nsamples: The number of sample rows to use for computing model explainability insights. Default value is 2000. + - max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier: + For multiclass classifier, specify the max number of classes which allow logging per-class + ROC curve and Precision-Recall curve. Limitations of evaluation dataset: - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 52456566fa688..fbe5331270662 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -339,6 +339,19 @@ def _log_multiclass_classifier(self): PerClassPrecisionRecallCurveData = namedtuple( 'PerClassPrecisionRecallCurveData', ['postive_class', 'precision', 'recall', 'thresholds'] ) + log_roc_pr_curve = False + if self.y_probs is not None: + max_num_classes_for_logging_curve = \ + self.evaluator_config.get( + 'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier', 10 + ) + if self.num_classes <= max_num_classes_for_logging_curve: + log_roc_pr_curve = True + else: + _logger.warning(f'The classifier num_classes > {max_num_classes_for_logging_curve}, skip logging ' + f'ROC curve and Precision-Recall curve. 
You can add evaluator config ' + f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' to " + f"increase the threshold.") for postive_class in self.label_list: y_is_positive = np.where(self.y == postive_class, 1, 0) @@ -354,25 +367,27 @@ def _log_multiclass_classifier(self): if self.y_probs is not None: fpr, tpr, thresholds = sk_metrics.roc_curve(y_is_positive, prob_of_positive) - per_class_roc_curve_data_list.append( - PerClassRocCurveData(postive_class, fpr, tpr, thresholds) - ) + if log_roc_pr_curve: + per_class_roc_curve_data_list.append( + PerClassRocCurveData(postive_class, fpr, tpr, thresholds) + ) roc_auc = sk_metrics.auc(fpr, tpr) per_class_metrics["roc_auc"] = roc_auc precision, recall, thresholds = \ sk_metrics.precision_recall_curve(y_is_positive, prob_of_positive) thresholds = np.append(thresholds, [1.0], axis=0) - per_class_precision_recall_curve_data_list.append( - PerClassPrecisionRecallCurveData(postive_class, precision, recall, thresholds) - ) + if log_roc_pr_curve: + per_class_precision_recall_curve_data_list.append( + PerClassPrecisionRecallCurveData(postive_class, precision, recall, thresholds) + ) pr_auc = sk_metrics.auc(recall, precision) per_class_metrics["precision_recall_auc"] = pr_auc per_class_metrics_pandas_df = pd.DataFrame(per_class_metrics_list) self._log_pandas_df_artifact(per_class_metrics_pandas_df, "per_class_metrics_data") - if self.y_probs is not None: + if self.y_probs is not None and log_roc_pr_curve: per_class_roc_curve_pandas_df = pd.concat( [pd.DataFrame(item._asdict()) for item in per_class_roc_curve_data_list], ignore_index=True @@ -410,12 +425,8 @@ def plot_precision_recall_curve(): line_kwargs={"drawstyle": "steps-post"} ) - if self.num_classes <= 10: - self._log_image_artifact(plot_roc_curve, "roc_curve_plot") - self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") - else: - _logger.warning('The classifier num_classes > 10, skip logging plots for ROC curve and ' - 'Precision-Recall curve.') + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve From e63e19720d5972d0c156d50af855f520a6b79bb5 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 29 Dec 2021 21:20:10 +0800 Subject: [PATCH 096/120] refactor Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 +- mlflow/models/evaluation/default_evaluator.py | 127 +++++++++++------- tests/models/test_default_evaluator.py | 22 ++- 3 files changed, 87 insertions(+), 64 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 4eb7d852ef2d2..023ecdd7504d7 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -251,7 +251,7 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): self._feature_names = feature_names else: self._feature_names = [ - f'f_{str(i).zfill(math.ceil((math.log10(num_features))))}' + f'feature_{str(i).zfill(math.ceil((math.log10(num_features))))}' for i in range(num_features) ] else: diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index fbe5331270662..c7a78436fc28c 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -99,6 +99,67 @@ def _gen_log_key(key, dataset_name): return f'{key}_on_data_{dataset_name}' +def 
_get_regressor_metrics(y, y_pred): + return { + 'example_count': len(y), + 'mean_absolute_error': sk_metrics.mean_absolute_error(y, y_pred), + 'mean_squared_error': sk_metrics.mean_squared_error(y, y_pred), + 'root_mean_squared_error': math.sqrt(sk_metrics.mean_squared_error(y, y_pred)), + 'sum_on_label': sum(y), + 'mean_on_label': sum(y) / len(y), + 'r2_score': sk_metrics.r2_score(y, y_pred), + 'max_error': sk_metrics.max_error(y, y_pred), + 'mean_absolute_percentage_error': sk_metrics.mean_absolute_percentage_error(y, y_pred) + } + + +def _get_binary_sum_up_label_pred_prob(postive_class, y, y_pred, y_probs): + y_is_positive = np.where(y == postive_class, 1, 0) + y_pred_is_positive = np.where(y_pred == postive_class, 1, 0) + + if y_probs is not None: + prob_of_positive = y_probs[:, postive_class] + else: + prob_of_positive = None + + return y_is_positive, y_pred_is_positive, prob_of_positive + + +def _get_classifier_per_class_metrics(y, y_pred): + """ + For binary classifier, y/y_pred is for the positive class. + For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". + """ + metrics = {} + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + return metrics + + +def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs): + metrics = {} + metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) + metrics["example_count"] = len(X) + + if not is_binomial: + metrics['f1_score_micro'] = \ + sk_metrics.f1_score(y, y_pred, average='micro') + metrics['f1_score_macro'] = \ + sk_metrics.f1_score(y, y_pred, average='macro') + + if y_probs is not None: + metrics['log_loss'] = sk_metrics.log_loss(y, y_probs) + + return metrics + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -189,7 +250,7 @@ def _log_model_explainability(self): truncated_feature_names = [truncate_str_from_middle(f, 20) for f in self.feature_names] for i, truncated_name in enumerate(truncated_feature_names): if truncated_name != self.feature_names[i]: - # For truncated name, attach "(f_{feature_index})" at the end + # For duplicated truncated name, attach "(f_{feature_index})" at the end truncated_feature_names[i] = f'{truncated_name}(f_{i})' truncated_feature_name_map = {f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names)} @@ -265,25 +326,8 @@ def plot_feature_importance(): "shap_feature_importance_plot", ) - def _get_per_class_metrics(self, y, y_pred): - """ - For binary classifier, y/y_pred is for the positive class. - For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". 
- """ - metrics = {} - confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) - tn, fp, fn, tp = confusion_matrix.ravel() - metrics["true_negatives"] = tn - metrics["false_positives"] = fp - metrics["false_negatives"] = fn - metrics["true_positives"] = tp - metrics["recall"] = sk_metrics.recall_score(y, y_pred) - metrics["precision"] = sk_metrics.precision_score(y, y_pred) - metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) - return metrics - def _log_binary_classifier(self): - self.metrics.update(self._get_per_class_metrics(self.y, self.y_pred)) + self.metrics.update(_get_classifier_per_class_metrics(self.y, self.y_pred)) if self.y_prob is not None: fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) @@ -330,7 +374,6 @@ def plot_precision_recall_curve(): self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") def _log_multiclass_classifier(self): - per_class_metrics_list = [] per_class_roc_curve_data_list = [] per_class_precision_recall_curve_data_list = [] @@ -354,16 +397,15 @@ def _log_multiclass_classifier(self): f"increase the threshold.") for postive_class in self.label_list: - y_is_positive = np.where(self.y == postive_class, 1, 0) - y_pred_is_positive = np.where(self.y_pred == postive_class, 1, 0) - - if self.y_probs is not None: - prob_of_positive = self.y_probs[:, postive_class] + y_is_positive, y_pred_is_positive, prob_of_positive = \ + _get_binary_sum_up_label_pred_prob(self.y, self.y_pred, self.y_probs) per_class_metrics = {'positive_class': postive_class} per_class_metrics_list.append(per_class_metrics) - per_class_metrics.update(self._get_per_class_metrics(y_is_positive, y_pred_is_positive)) + per_class_metrics.update( + _get_classifier_per_class_metrics(y_is_positive, y_pred_is_positive) + ) if self.y_probs is not None: fpr, tpr, thresholds = sk_metrics.roc_curve(y_is_positive, prob_of_positive) @@ -446,9 +488,6 @@ def _evaluate_classifier(self): '-1, 0, or 1.' ) - self.metrics["accuracy"] = sk_metrics.accuracy_score(self.y, self.y_pred) - self.metrics["example_count"] = len(self.X) - if self.predict_proba_fn is not None: self.y_probs = self.predict_proba_fn(self.X) if self.is_binomial: @@ -459,22 +498,19 @@ def _evaluate_classifier(self): self.y_probs = None self.y_prob = None + self.metrics.update( + _get_classifier_global_metrics(self.is_binomial, self.y, self.y_pred, self.y_probs) + ) + if self.is_binomial: self._log_binary_classifier() else: self._log_multiclass_classifier() - if self.predict_proba_fn is not None: - self.metrics['log_loss'] = sk_metrics.log_loss(self.y, self.y_probs) - if self.is_binomial: - self._log_image_artifact( - lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", - ) - else: - self.metrics['f1_score_micro'] = \ - sk_metrics.f1_score(self.y, self.y_pred, average='micro') - self.metrics['f1_score_macro'] = \ - sk_metrics.f1_score(self.y, self.y_pred, average='macro') + if self.is_binomial and self.y_probs is not None: + self._log_image_artifact( + lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", + ) # TODO: Shall we also log confusion_matrix data as a json artifact ? 
confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) @@ -493,16 +529,7 @@ def plot_confusion_matrix(): def _evaluate_regressor(self): self.y_pred = self.model.predict(self.X) - self.metrics["example_count"] = len(self.X) - self.metrics["mean_absolute_error"] = sk_metrics.mean_absolute_error(self.y, self.y_pred) - self.metrics["mean_squared_error"] = sk_metrics.mean_squared_error(self.y, self.y_pred) - self.metrics["root_mean_squared_error"] = math.sqrt(self.metrics["mean_squared_error"]) - self.metrics['sum_on_label'] = sum(self.y) - self.metrics['mean_on_label'] = self.metrics['sum_on_label'] / self.metrics["example_count"] - self.metrics['r2_score'] = sk_metrics.r2_score(self.y, self.y_pred) - self.metrics['max_error'] = sk_metrics.max_error(self.y, self.y_pred) - self.metrics['mean_absolute_percentage_error'] = \ - sk_metrics.mean_absolute_percentage_error(self.y, self.y_pred) + self.metrics.update(_get_regressor_metrics(self.y, self.y_pred)) self._log_metrics() self._log_model_explainability() diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index c8b40e5fd7916..63f97efea1868 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -1,7 +1,10 @@ import numpy as np import json +import math +import sklearn.metrics from mlflow.models.evaluation import evaluate, EvaluationDataset +from mlflow.models.evaluation.default_evaluator import _get_regressor_metrics import mlflow from sklearn.datasets import load_boston from sklearn.linear_model import LinearRegression @@ -26,17 +29,12 @@ def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): params, metrics, tags, artifacts = \ get_run_data(run.info.run_id) - expected_metrics = { - 'example_count': 148.0, - 'mean_absolute_error': 42.927, - 'mean_squared_error': 2747.513, - 'root_mean_squared_error': 52.416, - 'sum_on_label': 23099.0, - 'mean_on_label': 156.074, - 'r2_score': 0.565, - 'max_error': 151.354, - 'mean_absolute_percentage_error': 0.413 - } + model = mlflow.pyfunc.load_model(regressor_model_uri) + + y = diabetes_dataset.labels + y_pred = model.predict(diabetes_dataset.data) + + expected_metrics = _get_regressor_metrics(y, y_pred) for metric_key in expected_metrics: assert np.isclose( expected_metrics[metric_key], @@ -49,8 +47,6 @@ def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): rtol=1e-3 ) - model = mlflow.pyfunc.load_model(regressor_model_uri) - assert json.loads(tags['mlflow.datasets']) == \ [{**diabetes_dataset._metadata, 'model': model.metadata.model_uuid}] From 946a7ccb088c775e1150914abd361c7bd2aebbe8 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 29 Dec 2021 22:29:30 +0800 Subject: [PATCH 097/120] update tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 66 ++-- mlflow/models/evaluation/default_evaluator.py | 239 +++++++----- .../models/evaluation/evaluator_registry.py | 3 +- mlflow/models/evaluation/lift_curve.py | 54 +-- mlflow/models/utils.py | 1 + tests/models/test_default_evaluator.py | 350 ++++++++++-------- tests/models/test_evaluation.py | 140 ++++--- 7 files changed, 479 insertions(+), 374 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 023ecdd7504d7..511ad9fd3ea88 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -200,9 +200,10 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): try: # add checking `'pyspark' in sys.modules` to avoid importing 
pyspark when user # run code not related to pyspark. - if 'pyspark' in sys.modules: + if "pyspark" in sys.modules: from pyspark.sql import DataFrame as SparkDataFrame from pyspark.ml.linalg import VectorUDT as SparkVectorUDT + supported_dataframe_types = (pd.DataFrame, SparkDataFrame) self._spark_df_type = SparkDataFrame self._spark_vector_type = SparkVectorUDT @@ -247,11 +248,11 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): if feature_names is not None: feature_names = list(feature_names) if num_features != len(feature_names): - raise ValueError('feature name list must be the same length with feature data.') + raise ValueError("feature name list must be the same length with feature data.") self._feature_names = feature_names else: self._feature_names = [ - f'feature_{str(i).zfill(math.ceil((math.log10(num_features))))}' + f"feature_{str(i).zfill(math.ceil((math.log10(num_features))))}" for i in range(num_features) ] else: @@ -259,8 +260,9 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): if feature_names is not None: feature_names = list(feature_names) if pd_column_names != list(feature_names): - raise ValueError('feature names must match feature column names in the pandas ' - 'dataframe') + raise ValueError( + "feature names must match feature column names in the pandas " "dataframe" + ) self._feature_names = pd_column_names @property @@ -434,9 +436,11 @@ def _log_dataset_tag(self, client, run_id, model_uuid): dataset_metadata_list = json.loads(existing_dataset_metadata_str) for metadata in dataset_metadata_list: - if metadata["hash"] == self.hash and \ - metadata["name"] == self.name and \ - metadata["model"] == model_uuid: + if ( + metadata["hash"] == self.hash + and metadata["name"] == self.name + and metadata["model"] == model_uuid + ): break else: dataset_metadata_list.append({**self._metadata, "model": model_uuid}) @@ -524,24 +528,26 @@ def _start_run_or_reuse_active_run(run_id): def _normalize_evaluators_and_evaluator_config_args( - evaluators, - evaluator_config, + evaluators, + evaluator_config, ): from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map): - return isinstance(_evaluator_name_to_conf_map, dict) and \ - all(k in _evaluator_name_list and isinstance(v, dict) - for k, v in _evaluator_name_to_conf_map.items()) + return isinstance(_evaluator_name_to_conf_map, dict) and all( + k in _evaluator_name_list and isinstance(v, dict) + for k, v in _evaluator_name_to_conf_map.items() + ) if evaluators is None: evaluator_name_list = list(_model_evaluation_registry._registry.keys()) if len(evaluator_name_list) > 1: _logger.warning( - f'Multiple registered evaluators are found {evaluator_name_list} and ' - 'they will all be used in evaluation if they support the specified model type. ' - 'If you want to evaluate with one evaluator, specify the `evaluator` argument ' - 'and optionally specify the `evaluator_config` argument.') + f"Multiple registered evaluators are found {evaluator_name_list} and " + "they will all be used in evaluation if they support the specified model type. " + "If you want to evaluate with one evaluator, specify the `evaluator` argument " + "and optionally specify the `evaluator_config` argument." + ) if evaluator_config is not None: conf_dict_value_error = ValueError( "If `evaluators` argument is None, all available evaluators will be used. 
" @@ -550,11 +556,11 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) "`evaluator_config` argument must be a dictionary mapping each evaluator's name " "to its own evaluator config dictionary." ) - if evaluator_name_list == ['default']: + if evaluator_name_list == ["default"]: if not isinstance(evaluator_config, dict): raise conf_dict_value_error - elif 'default' not in evaluator_config: - evaluator_name_to_conf_map = {'default': evaluator_config} + elif "default" not in evaluator_config: + evaluator_name_to_conf_map = {"default": evaluator_config} else: evaluator_name_to_conf_map = evaluator_config else: @@ -584,15 +590,15 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) evaluator_name_to_conf_map = evaluator_config or {} else: raise ValueError( - '`evaluators` argument must be None, an evaluator name string, or a list of ' - 'evaluator names.' + "`evaluators` argument must be None, an evaluator name string, or a list of " + "evaluator names." ) return evaluator_name_list, evaluator_name_to_conf_map def _evaluate( - model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map + model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map ): """ The public API "evaluate" will verify argument first, and then pass normalized arguments @@ -708,11 +714,17 @@ def evaluate( "an instance of `mlflow.pyfunc.PyFuncModel`." ) - evaluator_name_list, evaluator_name_to_conf_map = \ - _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) + ( + evaluator_name_list, + evaluator_name_to_conf_map, + ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) with _start_run_or_reuse_active_run(run_id) as actual_run_id: return _evaluate( - model, model_type, dataset, actual_run_id, - evaluator_name_list, evaluator_name_to_conf_map + model, + model_type, + dataset, + actual_run_id, + evaluator_name_list, + evaluator_name_to_conf_map, ) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index c7a78436fc28c..7d0a863c3c504 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -60,30 +60,33 @@ def _infer_model_type_by_labels(labels): def _extract_raw_model_and_predict_fn(model): - model_loader_module = model.metadata.flavors['python_function']["loader_module"] + model_loader_module = model.metadata.flavors["python_function"]["loader_module"] predict_fn = model.predict predict_proba_fn = None try: - if model_loader_module == 'mlflow.sklearn': + if model_loader_module == "mlflow.sklearn": raw_model = model._model_impl - elif model_loader_module == 'mlflow.lightgbm': + elif model_loader_module == "mlflow.lightgbm": raw_model = model._model_impl.lgb_model - elif model_loader_module == 'mlflow.xgboost': + elif model_loader_module == "mlflow.xgboost": raw_model = model._model_impl.xgb_model else: raw_model = None except Exception as e: raw_model = None - _logger.warning(f'Raw model resolution fails unexpectedly on PyFuncModel {model!r}, ' - f'error message is {e}') + _logger.warning( + f"Raw model resolution fails unexpectedly on PyFuncModel {model!r}, " + f"error message is {e}" + ) if raw_model: predict_fn = raw_model.predict - predict_proba_fn = getattr(raw_model, 'predict_proba', None) + predict_proba_fn = getattr(raw_model, "predict_proba", None) try: import xgboost + if isinstance(raw_model, xgboost.XGBModel): # Because shap 
evaluation will pass evaluation data in ndarray format # (without feature names), if set validate_features=True it will raise error. @@ -96,20 +99,20 @@ def _extract_raw_model_and_predict_fn(model): def _gen_log_key(key, dataset_name): - return f'{key}_on_data_{dataset_name}' + return f"{key}_on_data_{dataset_name}" def _get_regressor_metrics(y, y_pred): return { - 'example_count': len(y), - 'mean_absolute_error': sk_metrics.mean_absolute_error(y, y_pred), - 'mean_squared_error': sk_metrics.mean_squared_error(y, y_pred), - 'root_mean_squared_error': math.sqrt(sk_metrics.mean_squared_error(y, y_pred)), - 'sum_on_label': sum(y), - 'mean_on_label': sum(y) / len(y), - 'r2_score': sk_metrics.r2_score(y, y_pred), - 'max_error': sk_metrics.max_error(y, y_pred), - 'mean_absolute_percentage_error': sk_metrics.mean_absolute_percentage_error(y, y_pred) + "example_count": len(y), + "mean_absolute_error": sk_metrics.mean_absolute_error(y, y_pred), + "mean_squared_error": sk_metrics.mean_squared_error(y, y_pred), + "root_mean_squared_error": math.sqrt(sk_metrics.mean_squared_error(y, y_pred)), + "sum_on_label": sum(y), + "mean_on_label": sum(y) / len(y), + "r2_score": sk_metrics.r2_score(y, y_pred), + "max_error": sk_metrics.max_error(y, y_pred), + "mean_absolute_percentage_error": sk_metrics.mean_absolute_percentage_error(y, y_pred), } @@ -146,16 +149,14 @@ def _get_classifier_per_class_metrics(y, y_pred): def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs): metrics = {} metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) - metrics["example_count"] = len(X) + metrics["example_count"] = len(y) if not is_binomial: - metrics['f1_score_micro'] = \ - sk_metrics.f1_score(y, y_pred, average='micro') - metrics['f1_score_macro'] = \ - sk_metrics.f1_score(y, y_pred, average='macro') + metrics["f1_score_micro"] = sk_metrics.f1_score(y, y_pred, average="micro") + metrics["f1_score_macro"] = sk_metrics.f1_score(y, y_pred, average="macro") if y_probs is not None: - metrics['log_loss'] = sk_metrics.log_loss(y, y_probs) + metrics["log_loss"] = sk_metrics.log_loss(y, y_probs) return metrics @@ -172,17 +173,24 @@ def _log_metrics(self): self.client.log_batch( self.run_id, metrics=[ - Metric(key=_gen_log_key(key, self.dataset_name), - value=value, timestamp=timestamp, step=0) + Metric( + key=_gen_log_key(key, self.dataset_name), + value=value, + timestamp=timestamp, + step=0, + ) for key, value in self.metrics.items() ], ) def _log_image_artifact( - self, do_plot, artifact_name, + self, + do_plot, + artifact_name, ): import matplotlib.pyplot as pyplot - artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + '.png' + + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png" artifact_file_local_path = self.temp_dir.path(artifact_file_name) try: @@ -197,10 +205,8 @@ def _log_image_artifact( artifact.load(artifact_file_local_path) self.artifacts[artifact_name] = artifact - def _log_pandas_df_artifact( - self, pandas_df, artifact_name - ): - artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + '.csv' + def _log_pandas_df_artifact(self, pandas_df, artifact_name): + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".csv" artifact_file_local_path = self.temp_dir.path(artifact_file_name) pandas_df.to_csv(artifact_file_local_path, index=False) mlflow.log_artifact(artifact_file_local_path) @@ -212,18 +218,18 @@ def _log_pandas_df_artifact( self.artifacts[artifact_name] = artifact def _log_model_explainability(self): - if not 
self.evaluator_config.get('log_model_explainability', True): + if not self.evaluator_config.get("log_model_explainability", True): return - if self.model_loader_module == 'mlflow.spark': + if self.model_loader_module == "mlflow.spark": # TODO: Shap explainer need to manipulate on each feature values, # but spark model input dataframe contains Vector type feature column # which shap explainer does not support. # To support this, we need expand the Vector type feature column into # multiple scaler feature columns and pass it to shap explainer. _logger.warning( - 'Logging model explainability insights is not currently supported for PySpark' - ' models.' + "Logging model explainability insights is not currently supported for PySpark" + " models." ) return @@ -232,28 +238,32 @@ def _log_model_explainability(self): import matplotlib.pyplot as pyplot except ImportError: _logger.warning( - 'SHAP or matplotlib package is not installed, so model explainability insights ' - 'will not be logged.' + "SHAP or matplotlib package is not installed, so model explainability insights " + "will not be logged." ) - if Version(shap.__version__) < Version('0.40'): + if Version(shap.__version__) < Version("0.40"): _logger.warning( - 'Shap package version is lower than 0.40, Skip log model explainability.' + "Shap package version is lower than 0.40, Skip log model explainability." ) return - is_multinomial_classifier = self.model_type == 'classifier' and self.num_classes > 2 + is_multinomial_classifier = self.model_type == "classifier" and self.num_classes > 2 - sample_rows = self.evaluator_config.get('explainability_nsamples', _DEFAULT_SAMPLE_ROWS_FOR_SHAP) - algorithm = self.evaluator_config.get('explainability_algorithm', None) + sample_rows = self.evaluator_config.get( + "explainability_nsamples", _DEFAULT_SAMPLE_ROWS_FOR_SHAP + ) + algorithm = self.evaluator_config.get("explainability_algorithm", None) truncated_feature_names = [truncate_str_from_middle(f, 20) for f in self.feature_names] for i, truncated_name in enumerate(truncated_feature_names): if truncated_name != self.feature_names[i]: # For duplicated truncated name, attach "(f_{feature_index})" at the end - truncated_feature_names[i] = f'{truncated_name}(f_{i})' + truncated_feature_names[i] = f"{truncated_name}(f_{i})" - truncated_feature_name_map = {f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names)} + truncated_feature_name_map = { + f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names) + } if isinstance(self.X, pd.DataFrame): # For some shap explainer, the plot will use the DataFrame column names instead of @@ -264,14 +274,17 @@ def _log_model_explainability(self): sampled_X = shap.sample(renamed_X, sample_rows) if algorithm: - if algorithm == 'sampling': + if algorithm == "sampling": explainer = shap.explainers.Sampling( self.predict_fn, renamed_X, feature_names=truncated_feature_names ) shap_values = explainer(renamed_X, sample_rows) else: explainer = shap.Explainer( - self.predict_fn, sampled_X, feature_names=truncated_feature_names, algorithm=algorithm + self.predict_fn, + sampled_X, + feature_names=truncated_feature_names, + algorithm=algorithm, ) shap_values = explainer(sampled_X) else: @@ -279,7 +292,9 @@ def _log_model_explainability(self): # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for # raw model, this case shap plot doesn't support it well, so exclude the # multinomial_classifier case here. 
- explainer = shap.Explainer(self.raw_model, sampled_X, feature_names=truncated_feature_names) + explainer = shap.Explainer( + self.raw_model, sampled_X, feature_names=truncated_feature_names + ) shap_values = explainer(sampled_X) else: # fallback to default explainer @@ -288,25 +303,25 @@ def _log_model_explainability(self): ) shap_values = explainer(sampled_X) - _logger.info(f'Shap explainer {explainer.__class__.__name__} is used.') + _logger.info(f"Shap explainer {explainer.__class__.__name__} is used.") try: mlflow.shap.log_explainer( - explainer, - artifact_path=_gen_log_key('explainer', self.dataset_name) + explainer, artifact_path=_gen_log_key("explainer", self.dataset_name) ) except Exception as e: # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", # then fallback to shap explainer saver, and shap explainer will call `model.save` # for sklearn model, there is no `.save` method, so error will happen. - _logger.warning(f'Log explainer failed. Reason: {str(e)}') + _logger.warning(f"Log explainer failed. Reason: {str(e)}") def plot_beeswarm(): pyplot.subplots_adjust(bottom=0.2, left=0.4) shap.plots.beeswarm(shap_values, show=False) self._log_image_artifact( - plot_beeswarm, "shap_beeswarm_plot", + plot_beeswarm, + "shap_beeswarm_plot", ) def plot_summary(): @@ -314,7 +329,8 @@ def plot_summary(): shap.summary_plot(shap_values, show=False) self._log_image_artifact( - plot_summary, "shap_summary_plot", + plot_summary, + "shap_summary_plot", ) def plot_feature_importance(): @@ -331,11 +347,10 @@ def _log_binary_classifier(self): if self.y_prob is not None: fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) - roc_curve_pandas_df = pd.DataFrame( - {"fpr": fpr, "tpr": tpr, "thresholds": thresholds} - ) + roc_curve_pandas_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds}) self._log_pandas_df_artifact( - roc_curve_pandas_df, "roc_curve_data", + roc_curve_pandas_df, + "roc_curve_data", ) roc_auc = sk_metrics.auc(fpr, tpr) @@ -345,15 +360,15 @@ def plot_roc_curve(): # Do not use sklearn.metrics roc plot API because # older sklearn verison < 0.24 does not support. plot_lines( - {'roc': (fpr, tpr)}, - xlabel='False Positive Rate', ylabel='True Positive Rate', - line_kwargs={"drawstyle": "steps-post"} + {"roc": (fpr, tpr)}, + xlabel="False Positive Rate", + ylabel="True Positive Rate", + line_kwargs={"drawstyle": "steps-post"}, ) self._log_image_artifact(plot_roc_curve, "roc_curve_plot") - precision, recall, thresholds = \ - sk_metrics.precision_recall_curve(self.y, self.y_prob) + precision, recall, thresholds = sk_metrics.precision_recall_curve(self.y, self.y_prob) thresholds = np.append(thresholds, [1.0], axis=0) pr_curve_pandas_df = pd.DataFrame( {"precision": precision, "recall": recall, "thresholds": thresholds} @@ -367,8 +382,10 @@ def plot_precision_recall_curve(): # Do not use sklearn.metrics precision-recall plot API because # older sklearn verison < 0.24 does not support. 
plot_lines( - {'pr_curve': (recall, precision)}, xlabel='recall', ylabel='precision', - line_kwargs={"drawstyle": "steps-post"} + {"pr_curve": (recall, precision)}, + xlabel="recall", + ylabel="precision", + line_kwargs={"drawstyle": "steps-post"}, ) self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") @@ -378,29 +395,36 @@ def _log_multiclass_classifier(self): per_class_roc_curve_data_list = [] per_class_precision_recall_curve_data_list = [] - PerClassRocCurveData = namedtuple('PerClassRocCurveData', ['postive_class', 'fpr', 'tpr', 'thresholds']) + PerClassRocCurveData = namedtuple( + "PerClassRocCurveData", ["postive_class", "fpr", "tpr", "thresholds"] + ) PerClassPrecisionRecallCurveData = namedtuple( - 'PerClassPrecisionRecallCurveData', ['postive_class', 'precision', 'recall', 'thresholds'] + "PerClassPrecisionRecallCurveData", + ["postive_class", "precision", "recall", "thresholds"], ) log_roc_pr_curve = False if self.y_probs is not None: - max_num_classes_for_logging_curve = \ - self.evaluator_config.get( - 'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier', 10 - ) + max_num_classes_for_logging_curve = self.evaluator_config.get( + "max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier", 10 + ) if self.num_classes <= max_num_classes_for_logging_curve: log_roc_pr_curve = True else: - _logger.warning(f'The classifier num_classes > {max_num_classes_for_logging_curve}, skip logging ' - f'ROC curve and Precision-Recall curve. You can add evaluator config ' - f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' to " - f"increase the threshold.") + _logger.warning( + f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip logging " + f"ROC curve and Precision-Recall curve. You can add evaluator config " + f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' to " + f"increase the threshold." 
+ ) for postive_class in self.label_list: - y_is_positive, y_pred_is_positive, prob_of_positive = \ - _get_binary_sum_up_label_pred_prob(self.y, self.y_pred, self.y_probs) + ( + y_is_positive, + y_pred_is_positive, + prob_of_positive, + ) = _get_binary_sum_up_label_pred_prob(postive_class, self.y, self.y_pred, self.y_probs) - per_class_metrics = {'positive_class': postive_class} + per_class_metrics = {"positive_class": postive_class} per_class_metrics_list.append(per_class_metrics) per_class_metrics.update( @@ -416,12 +440,15 @@ def _log_multiclass_classifier(self): roc_auc = sk_metrics.auc(fpr, tpr) per_class_metrics["roc_auc"] = roc_auc - precision, recall, thresholds = \ - sk_metrics.precision_recall_curve(y_is_positive, prob_of_positive) + precision, recall, thresholds = sk_metrics.precision_recall_curve( + y_is_positive, prob_of_positive + ) thresholds = np.append(thresholds, [1.0], axis=0) if log_roc_pr_curve: per_class_precision_recall_curve_data_list.append( - PerClassPrecisionRecallCurveData(postive_class, precision, recall, thresholds) + PerClassPrecisionRecallCurveData( + postive_class, precision, recall, thresholds + ) ) pr_auc = sk_metrics.auc(recall, precision) per_class_metrics["precision_recall_auc"] = pr_auc @@ -432,39 +459,45 @@ def _log_multiclass_classifier(self): if self.y_probs is not None and log_roc_pr_curve: per_class_roc_curve_pandas_df = pd.concat( [pd.DataFrame(item._asdict()) for item in per_class_roc_curve_data_list], - ignore_index=True + ignore_index=True, ) self._log_pandas_df_artifact(per_class_roc_curve_pandas_df, "per_class_roc_curve_data") per_class_precision_recall_curve_pandas_df = pd.concat( - [pd.DataFrame(item._asdict()) for item in per_class_precision_recall_curve_data_list], - ignore_index=True + [ + pd.DataFrame(item._asdict()) + for item in per_class_precision_recall_curve_data_list + ], + ignore_index=True, ) self._log_pandas_df_artifact( - per_class_precision_recall_curve_pandas_df, - "per_class_precision_recall_curve_data" + per_class_precision_recall_curve_pandas_df, "per_class_precision_recall_curve_data" ) def plot_roc_curve(): data_series = { - f'Positive Class = {postive_class}': (fpr, tpr) + f"Positive Class = {postive_class}": (fpr, tpr) for postive_class, fpr, tpr, _ in per_class_roc_curve_data_list } plot_lines( - data_series, xlabel='False Positive Rate', ylabel='True Positive Rate', - legend_loc='lower right', - line_kwargs={"drawstyle": "steps-post"} + data_series, + xlabel="False Positive Rate", + ylabel="True Positive Rate", + legend_loc="lower right", + line_kwargs={"drawstyle": "steps-post"}, ) def plot_precision_recall_curve(): data_series = { - f'Positive Class = {postive_class}': (recall, precision) + f"Positive Class = {postive_class}": (recall, precision) for postive_class, precision, recall, _ in per_class_precision_recall_curve_data_list } plot_lines( - data_series, xlabel='recall', ylabel='precision', - legend_loc='lower left', - line_kwargs={"drawstyle": "steps-post"} + data_series, + xlabel="recall", + ylabel="precision", + legend_loc="lower left", + line_kwargs={"drawstyle": "steps-post"}, ) self._log_image_artifact(plot_roc_curve, "roc_curve_plot") @@ -484,8 +517,8 @@ def _evaluate_classifier(self): for label in label_list: if int(label) not in [-1, 0, 1]: raise ValueError( - 'Binomial classification require evaluation dataset label values to be ' - '-1, 0, or 1.' + "Binomial classification require evaluation dataset label values to be " + "-1, 0, or 1." 
) if self.predict_proba_fn is not None: @@ -509,7 +542,8 @@ def _evaluate_classifier(self): if self.is_binomial and self.y_probs is not None: self._log_image_artifact( - lambda: plot_lift_curve(self.y, self.y_probs), "lift_curve_plot", + lambda: plot_lift_curve(self.y, self.y_probs), + "lift_curve_plot", ) # TODO: Shall we also log confusion_matrix data as a json artifact ? @@ -518,9 +552,10 @@ def _evaluate_classifier(self): def plot_confusion_matrix(): sk_metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix).plot() - if hasattr(sk_metrics, 'ConfusionMatrixDisplay'): + if hasattr(sk_metrics, "ConfusionMatrixDisplay"): self._log_image_artifact( - plot_confusion_matrix, "confusion_matrix", + plot_confusion_matrix, + "confusion_matrix", ) self._log_metrics() @@ -556,8 +591,12 @@ def evaluate( self.dataset_name = dataset.name self.feature_names = dataset.feature_names - model_loader_module, raw_model, predict_fn, predict_proba_fn = \ - _extract_raw_model_and_predict_fn(model) + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) self.model_loader_module = model_loader_module self.raw_model = raw_model self.predict_fn = predict_fn diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index cb10fc128226a..1822e313460ea 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -49,7 +49,8 @@ def get_evaluator(self, evaluator_name): def register_evaluators(module): from mlflow.models.evaluation.default_evaluator import DefaultEvaluator - module._model_evaluation_registry.register('default', DefaultEvaluator) + + module._model_evaluation_registry.register("default", DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py index 9fd7485f25450..6961adb791576 100644 --- a/mlflow/models/evaluation/lift_curve.py +++ b/mlflow/models/evaluation/lift_curve.py @@ -37,18 +37,19 @@ def _cumulative_gain_curve(y_true, y_score, pos_label=None): # ensure binary classification if pos_label is not specified classes = np.unique(y_true) - if (pos_label is None and - not (np.array_equal(classes, [0, 1]) or - np.array_equal(classes, [-1, 1]) or - np.array_equal(classes, [0]) or - np.array_equal(classes, [-1]) or - np.array_equal(classes, [1]))): + if pos_label is None and not ( + np.array_equal(classes, [0, 1]) + or np.array_equal(classes, [-1, 1]) + or np.array_equal(classes, [0]) + or np.array_equal(classes, [-1]) + or np.array_equal(classes, [1]) + ): raise ValueError("Data is not binary and pos_label is not specified") elif pos_label is None: - pos_label = 1. + pos_label = 1.0 # make y_true a boolean vector - y_true = (y_true == pos_label) + y_true = y_true == pos_label sorted_indices = np.argsort(y_score)[::-1] y_true = y_true[sorted_indices] @@ -65,9 +66,15 @@ def _cumulative_gain_curve(y_true, y_score, pos_label=None): return percentages, gains -def plot_lift_curve(y_true, y_probas, title='Lift Curve', - ax=None, figsize=None, title_fontsize="large", - text_fontsize="medium"): +def plot_lift_curve( + y_true, + y_probas, + title="Lift Curve", + ax=None, + figsize=None, + title_fontsize="large", + text_fontsize="medium", +): """ This method is copied from scikit-plot package. 
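# A minimal standalone sketch (not part of the patch) of how the lift-curve helper in this
# module can be exercised outside the evaluator; the sklearn model and dataset below are
# illustrative choices only.
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from mlflow.models.evaluation.lift_curve import plot_lift_curve

X, y = load_breast_cancer(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

y_probas = clf.predict_proba(X)        # shape (n_samples, 2): per-class probabilities
ax = plot_lift_curve(y, y_probas)      # returns the matplotlib Axes with both class curves
ax.figure.savefig("lift_curve_plot.png")
plt.close(ax.figure)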
@@ -124,14 +131,13 @@ def plot_lift_curve(y_true, y_probas, title='Lift Curve', classes = np.unique(y_true) if len(classes) != 2: - raise ValueError('Cannot calculate Lift Curve for data with ' - '{} category/ies'.format(len(classes))) + raise ValueError( + "Cannot calculate Lift Curve for data with " "{} category/ies".format(len(classes)) + ) # Compute Cumulative Gain Curves - percentages, gains1 = _cumulative_gain_curve(y_true, y_probas[:, 0], - classes[0]) - percentages, gains2 = _cumulative_gain_curve(y_true, y_probas[:, 1], - classes[1]) + percentages, gains1 = _cumulative_gain_curve(y_true, y_probas[:, 0], classes[0]) + percentages, gains2 = _cumulative_gain_curve(y_true, y_probas[:, 1], classes[1]) percentages = percentages[1:] gains1 = gains1[1:] @@ -145,15 +151,15 @@ def plot_lift_curve(y_true, y_probas, title='Lift Curve', ax.set_title(title, fontsize=title_fontsize) - ax.plot(percentages, gains1, lw=3, label='Class {}'.format(classes[0])) - ax.plot(percentages, gains2, lw=3, label='Class {}'.format(classes[1])) + ax.plot(percentages, gains1, lw=3, label="Class {}".format(classes[0])) + ax.plot(percentages, gains2, lw=3, label="Class {}".format(classes[1])) - ax.plot([0, 1], [1, 1], 'k--', lw=2, label='Baseline') + ax.plot([0, 1], [1, 1], "k--", lw=2, label="Baseline") - ax.set_xlabel('Percentage of sample', fontsize=text_fontsize) - ax.set_ylabel('Lift', fontsize=text_fontsize) + ax.set_xlabel("Percentage of sample", fontsize=text_fontsize) + ax.set_ylabel("Lift", fontsize=text_fontsize) ax.tick_params(labelsize=text_fontsize) - ax.grid('on') - ax.legend(loc='lower right', fontsize=text_fontsize) + ax.grid("on") + ax.legend(loc="lower right", fontsize=text_fontsize) return ax diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index a8813aa136a84..5c44a03188028 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -238,6 +238,7 @@ def _read_sparse_matrix_from_json(path, example_type): def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs={}): import matplotlib.pyplot as plt + fig, ax = plt.subplots() for label, (data_x, data_y) in data_series.items(): diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 63f97efea1868..17748708e93ca 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -4,32 +4,44 @@ import sklearn.metrics from mlflow.models.evaluation import evaluate, EvaluationDataset -from mlflow.models.evaluation.default_evaluator import _get_regressor_metrics +from mlflow.models.evaluation.default_evaluator import ( + _get_regressor_metrics, + _get_classifier_global_metrics, + _get_classifier_per_class_metrics, + _extract_raw_model_and_predict_fn, +) import mlflow from sklearn.datasets import load_boston from sklearn.linear_model import LinearRegression -from tests.models.test_evaluation import get_run_data, \ - regressor_model_uri, diabetes_dataset, \ - classifier_model_uri, iris_dataset, \ - binary_classifier_model_uri, breast_cancer_dataset, \ - spark_regressor_model_uri, diabetes_spark_dataset - - -def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): +from tests.models.test_evaluation import ( + get_run_data, + linear_regressor_model_uri, + diabetes_dataset, + multiclass_logistic_regressor_model_uri, + iris_dataset, + binary_logistic_regressor_model_uri, + breast_cancer_dataset, + spark_linear_regressor_model_uri, + diabetes_spark_dataset, + svm_model_uri, + breast_cancer_dataset, +) + + +def 
test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): with mlflow.start_run() as run: result = evaluate( - regressor_model_uri, - model_type='regressor', + linear_regressor_model_uri, + model_type="regressor", dataset=diabetes_dataset, - evaluators='default', + evaluators="default", ) - print(f'regressor evaluation run: {run.info.run_id}') + print(f"regressor evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = \ - get_run_data(run.info.run_id) + params, metrics, tags, artifacts = get_run_data(run.info.run_id) - model = mlflow.pyfunc.load_model(regressor_model_uri) + model = mlflow.pyfunc.load_model(linear_regressor_model_uri) y = diabetes_dataset.labels y_pred = model.predict(diabetes_dataset.data) @@ -38,207 +50,219 @@ def test_regressor_evaluation(regressor_model_uri, diabetes_dataset): for metric_key in expected_metrics: assert np.isclose( expected_metrics[metric_key], - metrics[metric_key + '_on_data_diabetes_dataset'], - rtol=1e-3 - ) - assert np.isclose( - expected_metrics[metric_key], - result.metrics[metric_key], - rtol=1e-3 + metrics[metric_key + "_on_data_diabetes_dataset"], + rtol=1e-3, ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) - assert json.loads(tags['mlflow.datasets']) == \ - [{**diabetes_dataset._metadata, 'model': model.metadata.model_uuid}] + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_dataset._metadata, "model": model.metadata.model_uuid} + ] assert set(artifacts) == { - 'shap_beeswarm_plot_on_data_diabetes_dataset.png', - 'shap_feature_importance_plot_on_data_diabetes_dataset.png', - 'shap_summary_plot_on_data_diabetes_dataset.png', + "shap_beeswarm_plot_on_data_diabetes_dataset.png", + "shap_feature_importance_plot_on_data_diabetes_dataset.png", + "shap_summary_plot_on_data_diabetes_dataset.png", } assert result.artifacts.keys() == { - 'shap_beeswarm_plot', - 'shap_feature_importance_plot', - 'shap_summary_plot', + "shap_beeswarm_plot", + "shap_feature_importance_plot", + "shap_summary_plot", } -def test_multi_classifier_evaluation(classifier_model_uri, iris_dataset): +def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset): with mlflow.start_run() as run: result = evaluate( - classifier_model_uri, - model_type='classifier', + multiclass_logistic_regressor_model_uri, + model_type="classifier", dataset=iris_dataset, - evaluators='default', + evaluators="default", ) - print(f'multi-classifier evaluation run: {run.info.run_id}') - - params, metrics, tags, artifacts = \ - get_run_data(run.info.run_id) - - expected_metrics = { - 'accuracy': 0.32, 'example_count': 50, 'log_loss': 0.9712, 'f1_score_micro': 0.32, - 'f1_score_macro': 0.1616, 'class_0_true_negatives': 33, - 'class_0_false_positives': 0, 'class_0_false_negatives': 17, 'class_0_true_positives': 0, - 'class_0_recall': 0.0, 'class_0_precision': 0.0, 'class_0_f1_score': 0.0, - 'class_0_roc_auc': 1.0, 'class_0_precision_recall_auc': 1.0, 'class_1_true_negatives': 33, - 'class_1_false_positives': 0, 'class_1_false_negatives': 17, 'class_1_true_positives': 0, - 'class_1_recall': 0.0, 'class_1_precision': 0.0, 'class_1_f1_score': 0.0, - 'class_1_roc_auc': 0.9411, 'class_1_precision_recall_auc': 0.8989, - 'class_2_true_negatives': 0, 'class_2_false_positives': 34, 'class_2_false_negatives': 0, - 'class_2_true_positives': 16, 'class_2_recall': 1.0, 'class_2_precision': 0.32, - 'class_2_f1_score': 0.4848, 'class_2_roc_auc': 0.9963, - 'class_2_precision_recall_auc': 0.9921} + 
print(f"multi-classifier evaluation run: {run.info.run_id}") + + params, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) + + _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = iris_dataset.labels + y_pred = predict_fn(iris_dataset.data) + y_probs = predict_proba_fn(iris_dataset.data) + + expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs) for metric_key in expected_metrics: assert np.isclose( - expected_metrics[metric_key], - metrics[metric_key + '_on_data_iris_dataset'], - rtol=1e-3 - ) - assert np.isclose( - expected_metrics[metric_key], - result.metrics[metric_key], - rtol=1e-3 + expected_metrics[metric_key], metrics[metric_key + "_on_data_iris_dataset"], rtol=1e-3 ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) - model = mlflow.pyfunc.load_model(classifier_model_uri) - - assert json.loads(tags['mlflow.datasets']) == \ - [{**iris_dataset._metadata, 'model': model.metadata.model_uuid}] + assert json.loads(tags["mlflow.datasets"]) == [ + {**iris_dataset._metadata, "model": model.metadata.model_uuid} + ] assert set(artifacts) == { - 'class_0_precision_recall_curve_data_on_data_iris_dataset.csv', - 'class_0_precision_recall_curve_plot_on_data_iris_dataset.png', - 'class_0_roc_curve_data_on_data_iris_dataset.csv', - 'class_0_roc_curve_plot_on_data_iris_dataset.png', - 'class_1_precision_recall_curve_data_on_data_iris_dataset.csv', - 'class_1_precision_recall_curve_plot_on_data_iris_dataset.png', - 'class_1_roc_curve_data_on_data_iris_dataset.csv', - 'class_1_roc_curve_plot_on_data_iris_dataset.png', - 'class_2_precision_recall_curve_data_on_data_iris_dataset.csv', - 'class_2_precision_recall_curve_plot_on_data_iris_dataset.png', - 'class_2_roc_curve_data_on_data_iris_dataset.csv', - 'class_2_roc_curve_plot_on_data_iris_dataset.png', - 'confusion_matrix_on_data_iris_dataset.png', - 'explainer_on_data_iris_dataset', - 'shap_beeswarm_plot_on_data_iris_dataset.png', - 'shap_feature_importance_plot_on_data_iris_dataset.png', - 'shap_summary_plot_on_data_iris_dataset.png', + "shap_beeswarm_plot_on_data_iris_dataset.png", + "per_class_metrics_data_on_data_iris_dataset.csv", + "roc_curve_plot_on_data_iris_dataset.png", + "precision_recall_curve_plot_on_data_iris_dataset.png", + "shap_feature_importance_plot_on_data_iris_dataset.png", + "explainer_on_data_iris_dataset", + "per_class_roc_curve_data_on_data_iris_dataset.csv", + "confusion_matrix_on_data_iris_dataset.png", + "shap_summary_plot_on_data_iris_dataset.png", + "per_class_precision_recall_curve_data_on_data_iris_dataset.csv", } assert result.artifacts.keys() == { - 'class_0_roc_curve_data', 'class_0_roc_curve_plot', 'class_0_precision_recall_curve_data', - 'class_0_precision_recall_curve_plot', 'class_1_roc_curve_data', 'class_1_roc_curve_plot', - 'class_1_precision_recall_curve_data', 'class_1_precision_recall_curve_plot', - 'class_2_roc_curve_data', 'class_2_roc_curve_plot', 'class_2_precision_recall_curve_data', - 'class_2_precision_recall_curve_plot', 'confusion_matrix', 'shap_beeswarm_plot', - 'shap_summary_plot', 'shap_feature_importance_plot' + "per_class_metrics_data", + "per_class_roc_curve_data", + "per_class_precision_recall_curve_data", + "roc_curve_plot", + "precision_recall_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", } -def 
test_bin_classifier_evaluation(binary_classifier_model_uri, breast_cancer_dataset): +def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: result = evaluate( - binary_classifier_model_uri, - model_type='classifier', + binary_logistic_regressor_model_uri, + model_type="classifier", dataset=breast_cancer_dataset, - evaluators='default', + evaluators="default", ) - print(f'bin-classifier evaluation run: {run.info.run_id}') - - params, metrics, tags, artifacts = \ - get_run_data(run.info.run_id) - - expected_metrics = { - 'accuracy': 0.957, - 'example_count': 190, - 'log_loss': 0.0918, - 'true_negatives': 71, - 'false_positives': 5, - 'false_negatives': 3, - 'true_positives': 111, - 'recall': 0.9736, - 'precision': 0.9568, - 'f1_score': 0.9652, - 'roc_auc': 0.995, - 'precision_recall_auc': 0.997 - } + print(f"bin-classifier evaluation run: {run.info.run_id}") + + params, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + + _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels + y_pred = predict_fn(breast_cancer_dataset.data) + y_probs = predict_proba_fn(breast_cancer_dataset.data) + + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs) + for metric_key in expected_metrics: assert np.isclose( expected_metrics[metric_key], - metrics[metric_key + '_on_data_breast_cancer_dataset'], - rtol=1e-3 - ) - assert np.isclose( - expected_metrics[metric_key], - result.metrics[metric_key], - rtol=1e-3 + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) - model = mlflow.pyfunc.load_model(binary_classifier_model_uri) - - assert json.loads(tags['mlflow.datasets']) == \ - [{**breast_cancer_dataset._metadata, 'model': model.metadata.model_uuid}] + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] assert set(artifacts) == { - 'confusion_matrix_on_data_breast_cancer_dataset.png', - 'lift_curve_plot_on_data_breast_cancer_dataset.png', - 'precision_recall_curve_data_on_data_breast_cancer_dataset.csv', - 'precision_recall_curve_plot_on_data_breast_cancer_dataset.png', - 'roc_curve_data_on_data_breast_cancer_dataset.csv', - 'roc_curve_plot_on_data_breast_cancer_dataset.png', - 'shap_beeswarm_plot_on_data_breast_cancer_dataset.png', - 'shap_feature_importance_plot_on_data_breast_cancer_dataset.png', - 'shap_summary_plot_on_data_breast_cancer_dataset.png' + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + "lift_curve_plot_on_data_breast_cancer_dataset.png", + "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "precision_recall_curve_plot_on_data_breast_cancer_dataset.png", + "roc_curve_data_on_data_breast_cancer_dataset.csv", + "precision_recall_curve_data_on_data_breast_cancer_dataset.csv", + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + "roc_curve_plot_on_data_breast_cancer_dataset.png", } assert result.artifacts.keys() == { - 'roc_curve_data', 'roc_curve_plot', 'precision_recall_curve_data', - 'precision_recall_curve_plot', 'lift_curve_plot', 'confusion_matrix', - 'shap_beeswarm_plot', 'shap_summary_plot', 'shap_feature_importance_plot' + "roc_curve_data", + "roc_curve_plot", + 
"precision_recall_curve_data", + "precision_recall_curve_plot", + "lift_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", } -def test_spark_model_evaluation(spark_regressor_model_uri, diabetes_spark_dataset): +def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset): with mlflow.start_run() as run: result = evaluate( - spark_regressor_model_uri, - model_type='regressor', + spark_linear_regressor_model_uri, + model_type="regressor", dataset=diabetes_spark_dataset, - evaluators='default', - evaluator_config={ - 'log_model_explainability': True - } + evaluators="default", + evaluator_config={"log_model_explainability": True}, ) - print(f'spark model evaluation run: {run.info.run_id}') + print(f"spark model evaluation run: {run.info.run_id}") params, metrics, tags, artifacts = get_run_data(run.info.run_id) - expected_metrics = { - 'example_count': 139.0, - 'mean_absolute_error': 45.672, - 'mean_squared_error': 3009.048, - 'root_mean_squared_error': 54.854, - 'sum_on_label': 21183.0, - 'mean_on_label': 152.395, - 'r2_score': 0.491, - 'max_error': 136.170, - 'mean_absolute_percentage_error': 0.41392110539896615 - } + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) + + X, y = diabetes_spark_dataset._extract_features_and_labels() + y_pred = model.predict(X) + + expected_metrics = _get_regressor_metrics(y, y_pred) + for metric_key in expected_metrics: assert np.isclose( expected_metrics[metric_key], - metrics[metric_key + '_on_data_diabetes_spark_dataset'], - rtol=1e-3 - ) - assert np.isclose( - expected_metrics[metric_key], - result.metrics[metric_key], - rtol=1e-3 + metrics[metric_key + "_on_data_diabetes_spark_dataset"], + rtol=1e-3, ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) - model = mlflow.pyfunc.load_model(spark_regressor_model_uri) + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) - assert json.loads(tags['mlflow.datasets']) == \ - [{**diabetes_spark_dataset._metadata, 'model': model.metadata.model_uuid}] + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_spark_dataset._metadata, "model": model.metadata.model_uuid} + ] assert set(artifacts) == set() assert result.artifacts == {} + + +def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + svm_model_uri, + model_type="classifier", + dataset=breast_cancer_dataset, + evaluators="default", + ) + print(f"svm evaluation run: {run.info.run_id}") + + params, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(svm_model_uri) + + _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels + y_pred = predict_fn(breast_cancer_dataset.data) + + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + 
"shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + } + assert result.artifacts.keys() == { + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index b6f59d0afd30e..0ecd87d8a5d1f 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -9,8 +9,9 @@ EvaluationArtifact, EvaluationMetrics, ) -from mlflow.models.evaluation.base import \ - _normalize_evaluators_and_evaluator_config_args as _normalize_config +from mlflow.models.evaluation.base import ( + _normalize_evaluators_and_evaluator_config_args as _normalize_config, +) import hashlib from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn @@ -55,10 +56,11 @@ def get_diabetes_dataset(): def get_diabetes_spark_dataset(): data = sklearn.datasets.load_diabetes() spark = SparkSession.builder.master("local[*]").getOrCreate() - rows = [(Vectors.dense(features), float(label)) - for features, label in zip(data.data, data.target)] + rows = [ + (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target) + ] - return spark.createDataFrame(rows, ['features', 'label']) + return spark.createDataFrame(rows, ["features", "label"]) def get_breast_cancer_dataset(): @@ -66,7 +68,7 @@ def get_breast_cancer_dataset(): return data.data, data.target -RunData = namedtuple('RunData', ['params', 'metrics', 'tags', 'artifacts']) +RunData = namedtuple("RunData", ["params", "metrics", "tags", "artifacts"]) def get_run_data(run_id): @@ -111,7 +113,7 @@ def diabetes_dataset(): @pytest.fixture(scope="module") def diabetes_spark_dataset(): spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) - return EvaluationDataset(data=spark_df, labels='label', name="diabetes_spark_dataset") + return EvaluationDataset(data=spark_df, labels="label", name="diabetes_spark_dataset") @pytest.fixture(scope="module") @@ -122,55 +124,68 @@ def breast_cancer_dataset(): @pytest.fixture(scope="module") -def regressor_model_uri(): +def linear_regressor_model_uri(): X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(reg, "reg_model") - regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") + linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") - return regressor_model_uri + return linear_regressor_model_uri @pytest.fixture(scope="module") -def spark_regressor_model_uri(): +def spark_linear_regressor_model_uri(): spark_df = get_diabetes_spark_dataset() reg = SparkLinearRegression() spark_reg_model = reg.fit(spark_df) with mlflow.start_run() as run: mlflow.spark.log_model(spark_reg_model, "spark_reg_model") - spark_regressor_model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model") + spark_linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model") - return spark_regressor_model_uri + return spark_linear_regressor_model_uri @pytest.fixture(scope="module") -def classifier_model_uri(): +def multiclass_logistic_regressor_model_uri(): X, y = get_iris() clf = sklearn.linear_model.LogisticRegression(max_iter=2) clf.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(clf, "clf_model") - classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") + multiclass_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, 
"clf_model") - return classifier_model_uri + return multiclass_logistic_regressor_model_uri @pytest.fixture(scope="module") -def binary_classifier_model_uri(): +def binary_logistic_regressor_model_uri(): X, y = get_breast_cancer_dataset() clf = sklearn.linear_model.LogisticRegression() clf.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(clf, "bin_clf_model") - binary_classifier_model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model") + binary_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model") + + return binary_logistic_regressor_model_uri + + +@pytest.fixture(scope="module") +def svm_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.svm.LinearSVC() + clf.fit(X, y) - return binary_classifier_model_uri + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "svm_model") + svm_model_uri = get_artifact_uri(run.info.run_id, "svm_model") + + return svm_model_uri @pytest.fixture(scope="module") @@ -182,9 +197,9 @@ def iris_pandas_df_dataset(): return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model_uri, iris_dataset): +def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): y_true = iris_dataset.labels - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) y_pred = classifier_model.predict(iris_dataset.data) expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { @@ -252,10 +267,10 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): ) -def test_regressor_evaluate(regressor_model_uri, iris_dataset): - y_true = iris_dataset.labels - regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) - y_pred = regressor_model.predict(iris_dataset.data) +def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): + y_true = diabetes_dataset.labels + regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri) + y_pred = regressor_model.predict(diabetes_dataset.data) expected_mae = mean_absolute_error(y_true, y_pred) expected_mse = mean_squared_error(y_true, y_pred) expected_metrics = { @@ -263,16 +278,16 @@ def test_regressor_evaluate(regressor_model_uri, iris_dataset): "mean_squared_error": expected_mse, } expected_saved_metrics = { - "mean_absolute_error_on_iris_dataset": expected_mae, - "mean_squared_error_on_iris_dataset": expected_mse, + "mean_absolute_error_on_diabetes_dataset": expected_mae, + "mean_squared_error_on_diabetes_dataset": expected_mse, } - for model in [regressor_model, regressor_model_uri]: + for model in [regressor_model, linear_regressor_model_uri]: with mlflow.start_run() as run: eval_result = evaluate( model, "regressor", - iris_dataset, + diabetes_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -306,9 +321,10 @@ def get_md5(data): assert get_md5(list3) == get_md5(list4) -def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): - assert iris_dataset.hash == "c7417e63a9ce038a32f37ecd7fb829f6" +def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): + assert iris_dataset.hash == "827a8427365cafbd9110b1b009d5a80d" assert iris_pandas_df_dataset.hash == "d06cfb6352dba29afe514d9be87021aa" + assert diabetes_spark_dataset.hash == "a30ebc9899e22ee6e60665f98d4b08b3" def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): @@ -340,11 +356,10 @@ def test_log_dataset_tag(iris_dataset, 
iris_pandas_df_dataset): iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) - logged_meta1 = {**iris_dataset._metadata, 'model': model_uuid} - logged_meta2 = {**iris_pandas_df_dataset._metadata, 'model': model_uuid} + logged_meta1 = {**iris_dataset._metadata, "model": model_uuid} + logged_meta2 = {**iris_pandas_df_dataset._metadata, "model": model_uuid} - assert json.loads(tags["mlflow.datasets"]) == \ - [logged_meta1] + assert json.loads(tags["mlflow.datasets"]) == [logged_meta1] raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets") assert " " not in raw_tag # assert the tag string remove all whitespace chars. @@ -398,7 +413,7 @@ def _load_content_from_file(self, local_artifact_path): raise RuntimeError() -def test_evaluator_interface(classifier_model_uri, iris_dataset): +def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1} ): @@ -415,10 +430,10 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): with mlflow.start_run(): with pytest.raises( ValueError, - match='The model could not be evaluated by any of the registered evaluators', + match="The model could not be evaluated by any of the registered evaluators", ): evaluate( - classifier_model_uri, + multiclass_logistic_regressor_model_uri, "classifier", iris_dataset, run_id=None, @@ -432,7 +447,7 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): ) as mock_can_evaluate, mock.patch.object( FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( classifier_model, @@ -451,7 +466,7 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): ) -def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): +def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", @@ -479,7 +494,7 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): ) as mock_can_evaluate2, mock.patch.object( FakeEvauator2, "evaluate", return_value=evaluator2_return_value ) as mock_evaluate2: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval_result = evaluate( classifier_model, @@ -546,39 +561,46 @@ def test_start_run_or_reuse_active_run(): def test_normalize_evaluators_and_evaluator_config_args(): from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + with mock.patch.object( _model_evaluation_registry, "_registry", {"default": DefaultEvaluator}, ): - assert _normalize_config(None, None) == (['default'], {}) - assert _normalize_config(None, {'a': 3}) == (['default'], {'default': {'a': 3}}) - assert _normalize_config(None, {'default': {'a': 3}}) == (['default'], {'default': {'a': 3}}) + assert _normalize_config(None, None) == (["default"], {}) + assert _normalize_config(None, {"a": 3}) == (["default"], {"default": {"a": 3}}) + assert _normalize_config(None, {"default": {"a": 3}}) == ( + ["default"], + {"default": {"a": 3}}, + ) - assert _normalize_config(None, None) == (['default', 
'dummy_evaluator'], {}) + assert _normalize_config(None, None) == (["default", "dummy_evaluator"], {}) with pytest.raises( - ValueError, - match='`evaluator_config` argument must be a dictionary mapping each evaluator' + ValueError, match="`evaluator_config` argument must be a dictionary mapping each evaluator" ): - assert _normalize_config(None, {'a': 3}) == (['default', 'dummy_evaluator'], {}) + assert _normalize_config(None, {"a": 3}) == (["default", "dummy_evaluator"], {}) - assert _normalize_config(None, {'default': {'a': 3}}) == ( - ['default', 'dummy_evaluator'], {'default': {'a': 3}} + assert _normalize_config(None, {"default": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"default": {"a": 3}}, ) - with mock.patch.object(_base_logger, 'warning') as patched_warning_fn: + with mock.patch.object(_base_logger, "warning") as patched_warning_fn: _normalize_config(None, None) patched_warning_fn.assert_called_once() - assert 'Multiple registered evaluators are found' in patched_warning_fn.call_args[0][0] + assert "Multiple registered evaluators are found" in patched_warning_fn.call_args[0][0] - assert _normalize_config('dummy_evaluator', {'a': 3}) == \ - (['dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) + assert _normalize_config("dummy_evaluator", {"a": 3}) == ( + ["dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) - assert _normalize_config(['default', 'dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) == \ - (['default', 'dummy_evaluator'], {'dummy_evaluator': {'a': 3}}) + assert _normalize_config(["default", "dummy_evaluator"], {"dummy_evaluator": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) with pytest.raises( - ValueError, - match='evaluator_config must be a dict contains mapping from evaluator name to' + ValueError, match="evaluator_config must be a dict contains mapping from evaluator name to" ): - _normalize_config(['default', 'dummy_evaluator'], {'abc': {'a': 3}}) + _normalize_config(["default", "dummy_evaluator"], {"abc": {"a": 3}}) From 3b9553a4b128c77e4955468efc9a968f9633d91b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 31 Dec 2021 15:39:47 +0800 Subject: [PATCH 098/120] improve label handling Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 7 ++ mlflow/models/evaluation/default_evaluator.py | 93 ++++++++++++------- 2 files changed, 69 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 511ad9fd3ea88..afaab5cd791cb 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -255,6 +255,8 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): f"feature_{str(i).zfill(math.ceil((math.log10(num_features))))}" for i in range(num_features) ] + if isinstance(self.labels, list): + self.labels = np.array(self.labels) else: pd_column_names = [c for c in self.data.columns if c != self.labels] if feature_names is not None: @@ -694,6 +696,11 @@ def evaluate( scalar value columns, other object types (nd.array/list/etc.) are not supported yet. - If the mlflow model to be evaluated is a pyspark ML model, then the input data must be a spark DataFrame contains a feature column of "Vector" type, and a label column. + - For classifier, evaluation dataset labels must contains all distinct values, the dataset + labels data will be used to infer the number of classes. For binary classifier, the + negative label value must be 0 or -1, and the positive label value must be 1. 
+ For multiclass classifier, if logging explainability insights enabled, the label values + must be number type. Limitations of default evaluator logging model explainability insights: - The `shap.Explainer` "auto" algorithm will choose Linear explainer for linear model, diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 7d0a863c3c504..99ad5328b155f 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -13,6 +13,7 @@ from sklearn import metrics as sk_metrics import math from collections import namedtuple +import numbers import pandas as pd import numpy as np import json @@ -52,6 +53,8 @@ def _load_content_from_file(self, local_artifact_path): def _infer_model_type_by_labels(labels): distinct_labels = set(labels) for v in distinct_labels: + if isinstance(v, str): + return "classifier" if not float(v).is_integer(): return "regressor" if len(distinct_labels) > 1000 and len(distinct_labels) / len(labels) > 0.7: @@ -116,12 +119,12 @@ def _get_regressor_metrics(y, y_pred): } -def _get_binary_sum_up_label_pred_prob(postive_class, y, y_pred, y_probs): - y_is_positive = np.where(y == postive_class, 1, 0) - y_pred_is_positive = np.where(y_pred == postive_class, 1, 0) +def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, y_pred, y_probs): + y_is_positive = np.where(y == positive_class, 1, 0) + y_pred_is_positive = np.where(y_pred == positive_class, 1, 0) if y_probs is not None: - prob_of_positive = y_probs[:, postive_class] + prob_of_positive = y_probs[:, positive_class_index] else: prob_of_positive = None @@ -146,21 +149,26 @@ def _get_classifier_per_class_metrics(y, y_pred): return metrics -def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs): +def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): metrics = {} metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) metrics["example_count"] = len(y) if not is_binomial: - metrics["f1_score_micro"] = sk_metrics.f1_score(y, y_pred, average="micro") - metrics["f1_score_macro"] = sk_metrics.f1_score(y, y_pred, average="macro") + metrics["f1_score_micro"] = sk_metrics.f1_score(y, y_pred, average="micro", labels=labels) + metrics["f1_score_macro"] = sk_metrics.f1_score(y, y_pred, average="macro", labels=labels) if y_probs is not None: - metrics["log_loss"] = sk_metrics.log_loss(y, y_probs) + metrics["log_loss"] = sk_metrics.log_loss(y, y_probs, labels=labels) return metrics +def _gen_precision_recall_curve(positive_class, y, y_prob): + precision, recall, thresholds = sk_metrics.precision_recall_curve(y, y_prob) + thresholds = np.append(thresholds, [1.0], axis=0) + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -228,8 +236,15 @@ def _log_model_explainability(self): # To support this, we need expand the Vector type feature column into # multiple scaler feature columns and pass it to shap explainer. _logger.warning( - "Logging model explainability insights is not currently supported for PySpark" - " models." + "Logging model explainability insights is not currently supported for PySpark " + "models." + ) + return + + if not all([isinstance(label, numbers.Number) for label in self.label_list]): + _logger.warning( + "Skip logging model explainability insights because it requires all label " + "values to be Number type." 
) return @@ -278,7 +293,6 @@ def _log_model_explainability(self): explainer = shap.explainers.Sampling( self.predict_fn, renamed_X, feature_names=truncated_feature_names ) - shap_values = explainer(renamed_X, sample_rows) else: explainer = shap.Explainer( self.predict_fn, @@ -286,7 +300,6 @@ def _log_model_explainability(self): feature_names=truncated_feature_names, algorithm=algorithm, ) - shap_values = explainer(sampled_X) else: if self.raw_model and not is_multinomial_classifier: # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for @@ -295,16 +308,19 @@ def _log_model_explainability(self): explainer = shap.Explainer( self.raw_model, sampled_X, feature_names=truncated_feature_names ) - shap_values = explainer(sampled_X) else: # fallback to default explainer explainer = shap.Explainer( self.predict_fn, sampled_X, feature_names=truncated_feature_names ) - shap_values = explainer(sampled_X) _logger.info(f"Shap explainer {explainer.__class__.__name__} is used.") + if algorithm == "sampling": + shap_values = explainer(renamed_X, sample_rows) + else: + shap_values = explainer(sampled_X) + try: mlflow.shap.log_explainer( explainer, artifact_path=_gen_log_key("explainer", self.dataset_name) @@ -417,14 +433,16 @@ def _log_multiclass_classifier(self): f"increase the threshold." ) - for postive_class in self.label_list: + for positive_class_index, positive_class in enumerate(self.label_list): ( y_is_positive, y_pred_is_positive, prob_of_positive, - ) = _get_binary_sum_up_label_pred_prob(postive_class, self.y, self.y_pred, self.y_probs) + ) = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, self.y, self.y_pred, self.y_probs + ) - per_class_metrics = {"positive_class": postive_class} + per_class_metrics = {"positive_class": positive_class} per_class_metrics_list.append(per_class_metrics) per_class_metrics.update( @@ -435,7 +453,7 @@ def _log_multiclass_classifier(self): fpr, tpr, thresholds = sk_metrics.roc_curve(y_is_positive, prob_of_positive) if log_roc_pr_curve: per_class_roc_curve_data_list.append( - PerClassRocCurveData(postive_class, fpr, tpr, thresholds) + PerClassRocCurveData(positive_class, fpr, tpr, thresholds) ) roc_auc = sk_metrics.auc(fpr, tpr) per_class_metrics["roc_auc"] = roc_auc @@ -447,7 +465,7 @@ def _log_multiclass_classifier(self): if log_roc_pr_curve: per_class_precision_recall_curve_data_list.append( PerClassPrecisionRecallCurveData( - postive_class, precision, recall, thresholds + positive_class, precision, recall, thresholds ) ) pr_auc = sk_metrics.auc(recall, precision) @@ -506,20 +524,28 @@ def plot_precision_recall_curve(): def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve - label_list = sorted(list(set(self.y))) - self.label_list = label_list - self.num_classes = len(label_list) + self.label_list = np.unique(self.y) + self.num_classes = len(self.label_list) self.y_pred = self.predict_fn(self.X) self.is_binomial = self.num_classes <= 2 if self.is_binomial: - for label in label_list: - if int(label) not in [-1, 0, 1]: - raise ValueError( - "Binomial classification require evaluation dataset label values to be " - "-1, 0, or 1." - ) + if list(self.label_list) not in [[0, 1], [-1, 1]]: + raise ValueError( + 'Binary classifier evaluation dataset positive class label must be 1, ' + 'negative class label must be 0 or -1, and dataset must contains both ' + 'positive and negative examples.' 
+ ) + _logger.info( + 'The evaluation dataset is inferred as binary dataset, positive label is ' + f'{self.label_list[1]}, negative label is {self.label_list[0]}.' + ) + else: + _logger.info( + 'The evaluation dataset is inferred as multiclass dataset, number of classes ' + f'is inferred as {self.num_classes}' + ) if self.predict_proba_fn is not None: self.y_probs = self.predict_proba_fn(self.X) @@ -532,7 +558,9 @@ def _evaluate_classifier(self): self.y_prob = None self.metrics.update( - _get_classifier_global_metrics(self.is_binomial, self.y, self.y_pred, self.y_probs) + _get_classifier_global_metrics( + self.is_binomial, self.y, self.y_pred, self.y_probs, self.label_list + ) ) if self.is_binomial: @@ -547,10 +575,13 @@ def _evaluate_classifier(self): ) # TODO: Shall we also log confusion_matrix data as a json artifact ? - confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred) + confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred, labels=self.label_list) def plot_confusion_matrix(): - sk_metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix).plot() + sk_metrics.ConfusionMatrixDisplay( + confusion_matrix=confusion_matrix, + display_labels=self.label_list, + ).plot() if hasattr(sk_metrics, "ConfusionMatrixDisplay"): self._log_image_artifact( From 5368cafb73e6d4272c1d959b8a998d5ef4276d35 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sat, 1 Jan 2022 22:11:49 +0800 Subject: [PATCH 099/120] refactor Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 4 +- mlflow/models/evaluation/default_evaluator.py | 300 +++++++++--------- mlflow/models/utils.py | 2 +- 3 files changed, 155 insertions(+), 151 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index afaab5cd791cb..ef8bd9a5548da 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -255,8 +255,6 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): f"feature_{str(i).zfill(math.ceil((math.log10(num_features))))}" for i in range(num_features) ] - if isinstance(self.labels, list): - self.labels = np.array(self.labels) else: pd_column_names = [c for c in self.data.columns if c != self.labels] if feature_names is not None: @@ -698,7 +696,7 @@ def evaluate( be a spark DataFrame contains a feature column of "Vector" type, and a label column. - For classifier, evaluation dataset labels must contains all distinct values, the dataset labels data will be used to infer the number of classes. For binary classifier, the - negative label value must be 0 or -1, and the positive label value must be 1. + negative label value must be 0 or -1 or False, and the positive label value must be 1 or True. For multiclass classifier, if logging explainability insights enabled, the label values must be number type. 
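(A quick end-to-end sketch of the documented label rule — illustrative only, not part of the patch. It assumes the `evaluate`/`EvaluationDataset` API and the "default" evaluator name introduced by this series; the dataset and artifact-path names are made up, and the data/model setup simply mirrors the breast-cancer fixture used by the tests in this series.)

import mlflow
import sklearn.datasets
import sklearn.linear_model
from mlflow.models.evaluation import evaluate, EvaluationDataset

# Labels here are 0/1 and both classes are present, which satisfies the binary-classifier
# constraint documented above (0/1, -1/1, or booleans).
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
clf = sklearn.linear_model.LogisticRegression().fit(X, y)

with mlflow.start_run():
    mlflow.sklearn.log_model(clf, "model")
    model_uri = mlflow.get_artifact_uri("model")

dataset = EvaluationDataset(data=X, labels=y, name="breast_cancer_sketch")
with mlflow.start_run():
    result = evaluate(
        mlflow.pyfunc.load_model(model_uri),  # a pyfunc model instance (a model URI also works)
        "classifier",
        dataset,
        run_id=None,
        evaluators="default",
    )
print(result.metrics)

A label set such as [0, 1] passes the check; something like [1, 2] would be rejected by the binary-classifier validation added to `_evaluate_classifier` in this patch.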
diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 99ad5328b155f..fdef84d376c53 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -53,7 +53,7 @@ def _load_content_from_file(self, local_artifact_path): def _infer_model_type_by_labels(labels): distinct_labels = set(labels) for v in distinct_labels: - if isinstance(v, str): + if not isinstance(v, numbers.Number): return "classifier" if not float(v).is_integer(): return "regressor" @@ -120,15 +120,18 @@ def _get_regressor_metrics(y, y_pred): def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, y_pred, y_probs): - y_is_positive = np.where(y == positive_class, 1, 0) - y_pred_is_positive = np.where(y_pred == positive_class, 1, 0) + y = np.array(y) + y_bin = np.where(y == positive_class, 1, 0) + y_pred_bin = None + y_prob_bin = None + if y_pred is not None: + y_pred = np.array(y_pred) + y_pred_bin = np.where(y_pred == positive_class, 1, 0) if y_probs is not None: - prob_of_positive = y_probs[:, positive_class_index] - else: - prob_of_positive = None + y_prob_bin = y_probs[:, positive_class_index] - return y_is_positive, y_pred_is_positive, prob_of_positive + return y_bin, y_pred_bin, y_prob_bin def _get_classifier_per_class_metrics(y, y_pred): @@ -164,9 +167,102 @@ def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): return metrics -def _gen_precision_recall_curve(positive_class, y, y_prob): - precision, recall, thresholds = sk_metrics.precision_recall_curve(y, y_prob) - thresholds = np.append(thresholds, [1.0], axis=0) +def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): + per_class_metrics_list = [] + for positive_class_index, positive_class in enumerate(labels): + ( + y_bin, + y_pred_bin, + _, + ) = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, y_pred, None + ) + + per_class_metrics = {"positive_class": positive_class} + per_class_metrics.update(_get_classifier_per_class_metrics(y_bin, y_pred_bin)) + per_class_metrics_list.append(per_class_metrics) + + return pd.DataFrame(per_class_metrics_list) + + +_Curve = namedtuple('_Curve', ['plot_fn', 'plot_fn_args', 'curve_dataframe', 'auc']) + + +def _gen_classifier_curve( + is_binomial, y, y_probs, labels, curve_type, +): + if curve_type == 'roc': + def gen_x_y_thresholds_fn(_y, _y_prob): + fpr, tpr, _thresholds = sk_metrics.roc_curve(_y, _y_prob) + return fpr, tpr, _thresholds + + xlabel = "fpr" + ylabel = "tpr" + legend_loc = "lower right" + elif curve_type == 'pr': + def gen_x_y_thresholds_fn(_y, _y_prob): + precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) + _thresholds = np.append(_thresholds, [1.0], axis=0) + return recall, precision, _thresholds + + xlabel = "recall" + ylabel = "precision" + legend_loc = "lower left" + else: + assert False, 'illegal curve type' + + if is_binomial: + y_prob = y_probs[:, 1] + x_data, y_data, thresholds = gen_x_y_thresholds_fn(y, y_prob) + curve_dataframe = pd.DataFrame({ + xlabel: x_data, + ylabel: y_data, + 'thresholds': thresholds, + }) + data_series = [('positive class', x_data, y_data)] + legend_loc = None + auc = sk_metrics.auc(x_data, y_data) + else: + curve_list = [] + for positive_class_index, positive_class in enumerate(labels): + y_bin, _, y_prob_bin = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, None, y_probs + ) + + x_data, y_data, thresholds = 
gen_x_y_thresholds_fn(y_bin, y_prob_bin) + curve_list.append((positive_class, x_data, y_data, thresholds)) + + curve_dataframe = pd.concat( + [ + pd.DataFrame({ + 'positive_class': positive_class, + xlabel: x_data, + ylabel: y_data, + 'thresholds': thresholds, + }) + for positive_class, x_data, y_data, thresholds in curve_list + ], + ignore_index=True, + ) + data_series = [ + (f"Positive Class = {positive_class}", x_data, y_data) + for positive_class, x_data, y_data, _ in curve_list + ] + auc = [ + sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list + ] + return _Curve( + plot_fn=plot_lines, + plot_fn_args={ + 'data_series': data_series, + 'xlabel': xlabel, + 'ylabel': ylabel, + 'legend_loc': legend_loc, + 'line_kwargs': {"drawstyle": "steps-post"}, + }, + curve_dataframe=curve_dataframe, + auc=auc + ) class DefaultEvaluator(ModelEvaluator): @@ -361,63 +457,33 @@ def plot_feature_importance(): def _log_binary_classifier(self): self.metrics.update(_get_classifier_per_class_metrics(self.y, self.y_pred)) - if self.y_prob is not None: - fpr, tpr, thresholds = sk_metrics.roc_curve(self.y, self.y_prob) - roc_curve_pandas_df = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds}) - self._log_pandas_df_artifact( - roc_curve_pandas_df, - "roc_curve_data", + if self.y_probs is not None: + roc_curve = _gen_classifier_curve( + is_binomial=True, y=self.y, y_probs=self.y_probs, labels=self.label_list, + curve_type='roc' ) - - roc_auc = sk_metrics.auc(fpr, tpr) - self.metrics["roc_auc"] = roc_auc - - def plot_roc_curve(): - # Do not use sklearn.metrics roc plot API because - # older sklearn verison < 0.24 does not support. - plot_lines( - {"roc": (fpr, tpr)}, - xlabel="False Positive Rate", - ylabel="True Positive Rate", - line_kwargs={"drawstyle": "steps-post"}, - ) - - self._log_image_artifact(plot_roc_curve, "roc_curve_plot") - - precision, recall, thresholds = sk_metrics.precision_recall_curve(self.y, self.y_prob) - thresholds = np.append(thresholds, [1.0], axis=0) - pr_curve_pandas_df = pd.DataFrame( - {"precision": precision, "recall": recall, "thresholds": thresholds} + self._log_image_artifact( + lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), + "roc_curve_plot" ) - self._log_pandas_df_artifact(pr_curve_pandas_df, "precision_recall_curve_data") - - pr_auc = sk_metrics.auc(recall, precision) - self.metrics["precision_recall_auc"] = pr_auc - - def plot_precision_recall_curve(): - # Do not use sklearn.metrics precision-recall plot API because - # older sklearn verison < 0.24 does not support. 
- plot_lines( - {"pr_curve": (recall, precision)}, - xlabel="recall", - ylabel="precision", - line_kwargs={"drawstyle": "steps-post"}, - ) + self._log_pandas_df_artifact(roc_curve.curve_dataframe, "roc_curve_data") + self.metrics["roc_auc"] = roc_curve.auc - self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") + pr_curve = _gen_classifier_curve( + is_binomial=True, y=self.y, y_probs=self.y_probs, labels=self.label_list, + curve_type='pr' + ) + self._log_image_artifact( + lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), + "precision_recall_curve_plot" + ) + self._log_pandas_df_artifact(pr_curve.curve_dataframe, "precision_recall_curve_data") + self.metrics["precision_recall_auc"] = pr_curve.auc def _log_multiclass_classifier(self): - per_class_metrics_list = [] - per_class_roc_curve_data_list = [] - per_class_precision_recall_curve_data_list = [] + per_class_metrics_collection_df = \ + _get_classifier_per_class_metrics_collection_df(self.y, self.y_pred, self.label_list) - PerClassRocCurveData = namedtuple( - "PerClassRocCurveData", ["postive_class", "fpr", "tpr", "thresholds"] - ) - PerClassPrecisionRecallCurveData = namedtuple( - "PerClassPrecisionRecallCurveData", - ["postive_class", "precision", "recall", "thresholds"], - ) log_roc_pr_curve = False if self.y_probs is not None: max_num_classes_for_logging_curve = self.evaluator_config.get( @@ -433,93 +499,30 @@ def _log_multiclass_classifier(self): f"increase the threshold." ) - for positive_class_index, positive_class in enumerate(self.label_list): - ( - y_is_positive, - y_pred_is_positive, - prob_of_positive, - ) = _get_binary_sum_up_label_pred_prob( - positive_class_index, positive_class, self.y, self.y_pred, self.y_probs + if log_roc_pr_curve: + roc_curve = _gen_classifier_curve( + is_binomial=False, y=self.y, y_probs=self.y_probs, labels=self.label_list, + curve_type='roc' ) - - per_class_metrics = {"positive_class": positive_class} - per_class_metrics_list.append(per_class_metrics) - - per_class_metrics.update( - _get_classifier_per_class_metrics(y_is_positive, y_pred_is_positive) + self._log_image_artifact( + lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), + "roc_curve_plot" ) + self._log_pandas_df_artifact(roc_curve.curve_dataframe, "roc_curve_data") + per_class_metrics_collection_df["roc_auc"] = roc_curve.auc - if self.y_probs is not None: - fpr, tpr, thresholds = sk_metrics.roc_curve(y_is_positive, prob_of_positive) - if log_roc_pr_curve: - per_class_roc_curve_data_list.append( - PerClassRocCurveData(positive_class, fpr, tpr, thresholds) - ) - roc_auc = sk_metrics.auc(fpr, tpr) - per_class_metrics["roc_auc"] = roc_auc - - precision, recall, thresholds = sk_metrics.precision_recall_curve( - y_is_positive, prob_of_positive - ) - thresholds = np.append(thresholds, [1.0], axis=0) - if log_roc_pr_curve: - per_class_precision_recall_curve_data_list.append( - PerClassPrecisionRecallCurveData( - positive_class, precision, recall, thresholds - ) - ) - pr_auc = sk_metrics.auc(recall, precision) - per_class_metrics["precision_recall_auc"] = pr_auc - - per_class_metrics_pandas_df = pd.DataFrame(per_class_metrics_list) - self._log_pandas_df_artifact(per_class_metrics_pandas_df, "per_class_metrics_data") - - if self.y_probs is not None and log_roc_pr_curve: - per_class_roc_curve_pandas_df = pd.concat( - [pd.DataFrame(item._asdict()) for item in per_class_roc_curve_data_list], - ignore_index=True, + pr_curve = _gen_classifier_curve( + is_binomial=False, y=self.y, y_probs=self.y_probs, 
labels=self.label_list, + curve_type='pr' ) - self._log_pandas_df_artifact(per_class_roc_curve_pandas_df, "per_class_roc_curve_data") - - per_class_precision_recall_curve_pandas_df = pd.concat( - [ - pd.DataFrame(item._asdict()) - for item in per_class_precision_recall_curve_data_list - ], - ignore_index=True, - ) - self._log_pandas_df_artifact( - per_class_precision_recall_curve_pandas_df, "per_class_precision_recall_curve_data" + self._log_image_artifact( + lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), + "precision_recall_curve_plot" ) + self._log_pandas_df_artifact(pr_curve.curve_dataframe, "precision_recall_curve_data") + per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc - def plot_roc_curve(): - data_series = { - f"Positive Class = {postive_class}": (fpr, tpr) - for postive_class, fpr, tpr, _ in per_class_roc_curve_data_list - } - plot_lines( - data_series, - xlabel="False Positive Rate", - ylabel="True Positive Rate", - legend_loc="lower right", - line_kwargs={"drawstyle": "steps-post"}, - ) - - def plot_precision_recall_curve(): - data_series = { - f"Positive Class = {postive_class}": (recall, precision) - for postive_class, precision, recall, _ in per_class_precision_recall_curve_data_list - } - plot_lines( - data_series, - xlabel="recall", - ylabel="precision", - legend_loc="lower left", - line_kwargs={"drawstyle": "steps-post"}, - ) - - self._log_image_artifact(plot_roc_curve, "roc_curve_plot") - self._log_image_artifact(plot_precision_recall_curve, "precision_recall_curve_plot") + self._log_pandas_df_artifact(per_class_metrics_collection_df, "per_class_metrics") def _evaluate_classifier(self): from mlflow.models.evaluation.lift_curve import plot_lift_curve @@ -533,9 +536,9 @@ def _evaluate_classifier(self): if self.is_binomial: if list(self.label_list) not in [[0, 1], [-1, 1]]: raise ValueError( - 'Binary classifier evaluation dataset positive class label must be 1, ' - 'negative class label must be 0 or -1, and dataset must contains both ' - 'positive and negative examples.' + 'Binary classifier evaluation dataset positive class label must be 1 or True, ' + 'negative class label must be 0 or -1 or False, and dataset must contains ' + 'both positive and negative examples.' ) _logger.info( 'The evaluation dataset is inferred as binary dataset, positive label is ' @@ -575,13 +578,16 @@ def _evaluate_classifier(self): ) # TODO: Shall we also log confusion_matrix data as a json artifact ? - confusion_matrix = sk_metrics.confusion_matrix(self.y, self.y_pred, labels=self.label_list) + # normalize the confusion matrix, keep consistent with sklearn autologging. 
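+        # (with normalize="true", each row is divided by its true-class count, so rows sum to 1)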
+ confusion_matrix = sk_metrics.confusion_matrix( + self.y, self.y_pred, labels=self.label_list, normalize="true" + ) def plot_confusion_matrix(): sk_metrics.ConfusionMatrixDisplay( confusion_matrix=confusion_matrix, display_labels=self.label_list, - ).plot() + ).plot(cmap="Blues") if hasattr(sk_metrics, "ConfusionMatrixDisplay"): self._log_image_artifact( diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index 5c44a03188028..d471d4d8551be 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -241,7 +241,7 @@ def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs={}): fig, ax = plt.subplots() - for label, (data_x, data_y) in data_series.items(): + for label, data_x, data_y in data_series: ax.plot(data_x, data_y, label=label, **line_kwargs) if legend_loc: From c5f93ccd8b0a33035af4d87963c55f882d773c2b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 2 Jan 2022 21:49:07 +0800 Subject: [PATCH 100/120] add tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 22 ++ mlflow/models/evaluation/default_evaluator.py | 73 +----- tests/models/test_default_evaluator.py | 223 ++++++++++++++++-- 3 files changed, 242 insertions(+), 76 deletions(-) create mode 100644 mlflow/models/evaluation/artifacts.py diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py new file mode 100644 index 0000000000000..656fcf9c059a8 --- /dev/null +++ b/mlflow/models/evaluation/artifacts.py @@ -0,0 +1,22 @@ +from PIL.Image import open as open_image +import pandas as pd + +from mlflow.models.evaluation.base import EvaluationArtifact + + +class ImageEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.save(output_artifact_path) + + def _load_content_from_file(self, local_artifact_path): + self._content = open_image(local_artifact_path) + return self._content + + +class CsvEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): + self._content = pd.read_csv(local_artifact_path) + return self._content diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index fdef84d376c53..30178cd54255d 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -2,13 +2,13 @@ from mlflow.models.evaluation.base import ( ModelEvaluator, EvaluationMetrics, - EvaluationArtifact, EvaluationResult, ) from mlflow.entities.metric import Metric from mlflow.utils.file_utils import TempDir from mlflow.utils.string_utils import truncate_str_from_middle from mlflow.models.utils import plot_lines +from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact, CsvEvaluationArtifact from sklearn import metrics as sk_metrics import math @@ -16,37 +16,13 @@ import numbers import pandas as pd import numpy as np -import json import time from functools import partial import logging from packaging.version import Version - -from PIL.Image import open as open_image - - _logger = logging.getLogger(__name__) - -class ImageEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): - self._content.save(output_artifact_path) - - def _load_content_from_file(self, local_artifact_path): - self._content = open_image(local_artifact_path) - return self._content - - -class CsvEvaluationArtifact(EvaluationArtifact): - def save(self, output_artifact_path): - 
self._content.to_csv(output_artifact_path, index=False) - - def _load_content_from_file(self, local_artifact_path): - self._content = pd.read_csv(local_artifact_path) - return self._content - - _DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 @@ -70,10 +46,6 @@ def _extract_raw_model_and_predict_fn(model): try: if model_loader_module == "mlflow.sklearn": raw_model = model._model_impl - elif model_loader_module == "mlflow.lightgbm": - raw_model = model._model_impl.lgb_model - elif model_loader_module == "mlflow.xgboost": - raw_model = model._model_impl.xgb_model else: raw_model = None except Exception as e: @@ -94,7 +66,8 @@ def _extract_raw_model_and_predict_fn(model): # Because shap evaluation will pass evaluation data in ndarray format # (without feature names), if set validate_features=True it will raise error. predict_fn = partial(predict_fn, validate_features=False) - predict_proba_fn = partial(predict_proba_fn, validate_features=False) + if predict_proba_fn is not None: + predict_proba_fn = partial(predict_proba_fn, validate_features=False) except ImportError: pass @@ -129,6 +102,7 @@ def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, y_pred_bin = np.where(y_pred == positive_class, 1, 0) if y_probs is not None: + y_probs = np.array(y_probs) y_prob_bin = y_probs[:, positive_class_index] return y_bin, y_pred_bin, y_prob_bin @@ -185,7 +159,7 @@ def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): return pd.DataFrame(per_class_metrics_list) -_Curve = namedtuple('_Curve', ['plot_fn', 'plot_fn_args', 'curve_dataframe', 'auc']) +_Curve = namedtuple('_Curve', ['plot_fn', 'plot_fn_args', 'auc']) def _gen_classifier_curve( @@ -196,8 +170,8 @@ def gen_x_y_thresholds_fn(_y, _y_prob): fpr, tpr, _thresholds = sk_metrics.roc_curve(_y, _y_prob) return fpr, tpr, _thresholds - xlabel = "fpr" - ylabel = "tpr" + xlabel = "False Positive Rate" + ylabel = "True Positive Rate" legend_loc = "lower right" elif curve_type == 'pr': def gen_x_y_thresholds_fn(_y, _y_prob): @@ -212,13 +186,7 @@ def gen_x_y_thresholds_fn(_y, _y_prob): assert False, 'illegal curve type' if is_binomial: - y_prob = y_probs[:, 1] - x_data, y_data, thresholds = gen_x_y_thresholds_fn(y, y_prob) - curve_dataframe = pd.DataFrame({ - xlabel: x_data, - ylabel: y_data, - 'thresholds': thresholds, - }) + x_data, y_data, thresholds = gen_x_y_thresholds_fn(y, y_probs) data_series = [('positive class', x_data, y_data)] legend_loc = None auc = sk_metrics.auc(x_data, y_data) @@ -232,18 +200,6 @@ def gen_x_y_thresholds_fn(_y, _y_prob): x_data, y_data, thresholds = gen_x_y_thresholds_fn(y_bin, y_prob_bin) curve_list.append((positive_class, x_data, y_data, thresholds)) - curve_dataframe = pd.concat( - [ - pd.DataFrame({ - 'positive_class': positive_class, - xlabel: x_data, - ylabel: y_data, - 'thresholds': thresholds, - }) - for positive_class, x_data, y_data, thresholds in curve_list - ], - ignore_index=True, - ) data_series = [ (f"Positive Class = {positive_class}", x_data, y_data) for positive_class, x_data, y_data, _ in curve_list @@ -260,7 +216,6 @@ def gen_x_y_thresholds_fn(_y, _y_prob): 'legend_loc': legend_loc, 'line_kwargs': {"drawstyle": "steps-post"}, }, - curve_dataframe=curve_dataframe, auc=auc ) @@ -337,7 +292,8 @@ def _log_model_explainability(self): ) return - if not all([isinstance(label, numbers.Number) for label in self.label_list]): + if self.model_type == 'classifier' and \ + not all([isinstance(label, numbers.Number) for label in self.label_list]): _logger.warning( "Skip logging model 
explainability insights because it requires all label " "values to be Number type." @@ -459,25 +415,23 @@ def _log_binary_classifier(self): if self.y_probs is not None: roc_curve = _gen_classifier_curve( - is_binomial=True, y=self.y, y_probs=self.y_probs, labels=self.label_list, + is_binomial=True, y=self.y, y_probs=self.y_prob, labels=self.label_list, curve_type='roc' ) self._log_image_artifact( lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" ) - self._log_pandas_df_artifact(roc_curve.curve_dataframe, "roc_curve_data") self.metrics["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( - is_binomial=True, y=self.y, y_probs=self.y_probs, labels=self.label_list, + is_binomial=True, y=self.y, y_probs=self.y_prob, labels=self.label_list, curve_type='pr' ) self._log_image_artifact( lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" ) - self._log_pandas_df_artifact(pr_curve.curve_dataframe, "precision_recall_curve_data") self.metrics["precision_recall_auc"] = pr_curve.auc def _log_multiclass_classifier(self): @@ -508,7 +462,6 @@ def _log_multiclass_classifier(self): lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" ) - self._log_pandas_df_artifact(roc_curve.curve_dataframe, "roc_curve_data") per_class_metrics_collection_df["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( @@ -519,7 +472,6 @@ def _log_multiclass_classifier(self): lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" ) - self._log_pandas_df_artifact(pr_curve.curve_dataframe, "precision_recall_curve_data") per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc self._log_pandas_df_artifact(per_class_metrics_collection_df, "per_class_metrics") @@ -577,7 +529,6 @@ def _evaluate_classifier(self): "lift_curve_plot", ) - # TODO: Shall we also log confusion_matrix data as a json artifact ? # normalize the confusion matrix, keep consistent with sklearn autologging. 
confusion_matrix = sk_metrics.confusion_matrix( self.y, self.y_pred, labels=self.label_list, normalize="true" diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 17748708e93ca..62c0e3744e443 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -3,16 +3,22 @@ import math import sklearn.metrics + from mlflow.models.evaluation import evaluate, EvaluationDataset from mlflow.models.evaluation.default_evaluator import ( _get_regressor_metrics, _get_classifier_global_metrics, _get_classifier_per_class_metrics, _extract_raw_model_and_predict_fn, + _infer_model_type_by_labels, + _extract_raw_model_and_predict_fn, + _get_regressor_metrics, + _get_binary_sum_up_label_pred_prob, + _get_classifier_per_class_metrics, + _gen_classifier_curve, ) import mlflow -from sklearn.datasets import load_boston -from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LogisticRegression from tests.models.test_evaluation import ( get_run_data, @@ -27,6 +33,13 @@ svm_model_uri, breast_cancer_dataset, ) +from mlflow.models.utils import plot_lines + + +def assert_dict_equal(d1, d2, rtol): + for k in d1: + assert k in d2 + assert np.isclose(d1[k], d2[k], rtol=rtol) def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): @@ -90,7 +103,7 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir y_pred = predict_fn(iris_dataset.data) y_probs = predict_proba_fn(iris_dataset.data) - expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs) + expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs, labels=None) for metric_key in expected_metrics: assert np.isclose( @@ -104,20 +117,16 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir assert set(artifacts) == { "shap_beeswarm_plot_on_data_iris_dataset.png", - "per_class_metrics_data_on_data_iris_dataset.csv", + "per_class_metrics_on_data_iris_dataset.csv", "roc_curve_plot_on_data_iris_dataset.png", "precision_recall_curve_plot_on_data_iris_dataset.png", "shap_feature_importance_plot_on_data_iris_dataset.png", "explainer_on_data_iris_dataset", - "per_class_roc_curve_data_on_data_iris_dataset.csv", "confusion_matrix_on_data_iris_dataset.png", "shap_summary_plot_on_data_iris_dataset.png", - "per_class_precision_recall_curve_data_on_data_iris_dataset.csv", } assert result.artifacts.keys() == { - "per_class_metrics_data", - "per_class_roc_curve_data", - "per_class_precision_recall_curve_data", + "per_class_metrics", "roc_curve_plot", "precision_recall_curve_plot", "confusion_matrix", @@ -146,7 +155,7 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c y_pred = predict_fn(breast_cancer_dataset.data) y_probs = predict_proba_fn(breast_cancer_dataset.data) - expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs) + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs, labels=None) for metric_key in expected_metrics: assert np.isclose( @@ -165,16 +174,12 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c "lift_curve_plot_on_data_breast_cancer_dataset.png", "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", "precision_recall_curve_plot_on_data_breast_cancer_dataset.png", - "roc_curve_data_on_data_breast_cancer_dataset.csv", - "precision_recall_curve_data_on_data_breast_cancer_dataset.csv", "confusion_matrix_on_data_breast_cancer_dataset.png", 
"shap_summary_plot_on_data_breast_cancer_dataset.png", "roc_curve_plot_on_data_breast_cancer_dataset.png", } assert result.artifacts.keys() == { - "roc_curve_data", "roc_curve_plot", - "precision_recall_curve_data", "precision_recall_curve_plot", "lift_curve_plot", "confusion_matrix", @@ -240,7 +245,7 @@ def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): y = breast_cancer_dataset.labels y_pred = predict_fn(breast_cancer_dataset.data) - expected_metrics = _get_classifier_global_metrics(True, y, y_pred, None) + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, None, labels=None) for metric_key in expected_metrics: assert np.isclose( @@ -266,3 +271,191 @@ def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): "shap_summary_plot", "shap_feature_importance_plot", } + + +def test_infer_model_type_by_labels(): + assert _infer_model_type_by_labels(['a', 'b']) == 'classifier' + assert _infer_model_type_by_labels([1, 2.5]) == 'regressor' + assert _infer_model_type_by_labels(list(range(2000))) == 'regressor' + assert _infer_model_type_by_labels([1, 2, 3]) == 'classifier' + + +def test_extract_raw_model_and_predict_fn(binary_logistic_regressor_model_uri): + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + model_loader_module, raw_model, predict_fn, predict_proba_fn = \ + _extract_raw_model_and_predict_fn(model) + assert model_loader_module == 'mlflow.sklearn' + assert isinstance(raw_model, LogisticRegression) + assert predict_fn == raw_model.predict + assert predict_proba_fn == raw_model.predict_proba + + +def test_get_regressor_metrics(): + y = [1.1, 2.1, -3.5] + y_pred = [1.5, 2.0, -3.0] + + metrics = _get_regressor_metrics(y, y_pred) + expected_metrics = { + 'example_count': 3, 'mean_absolute_error': 0.3333333333333333, + 'mean_squared_error': 0.13999999999999999, + 'root_mean_squared_error': 0.3741657386773941, 'sum_on_label': -0.2999999999999998, + 'mean_on_label': -0.09999999999999994, 'r2_score': 0.976457399103139, 'max_error': 0.5, + 'mean_absolute_percentage_error': 0.18470418470418468 + } + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_get_binary_sum_up_label_pred_prob(): + y = [0, 1, 2] + y_pred = [0, 2, 1] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35]] + + results = [] + for idx, label in enumerate([0, 1, 2]): + y_bin, y_pred_bin, y_prob_bin = \ + _get_binary_sum_up_label_pred_prob(idx, label, y, y_pred, y_probs) + results.append((list(y_bin), list(y_pred_bin), list(y_prob_bin))) + + print(results) + assert results == [ + ([1, 0, 0], [1, 0, 0], [0.7, 0.2, 0.25]), + ([0, 1, 0], [0, 0, 1], [0.1, 0.3, 0.4]), + ([0, 0, 1], [0, 1, 0], [0.2, 0.5, 0.35]), + ] + + +def test_get_classifier_per_class_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + + expected_metrics = { + 'true_negatives': 3, 'false_positives': 2, 'false_negatives': 1, 'true_positives': 4, + 'recall': 0.8, 'precision': 0.6666666666666666, 'f1_score': 0.7272727272727272 + } + metrics = _get_classifier_per_class_metrics(y, y_pred) + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_multiclass_get_classifier_global_metrics(): + y = [0, 1, 2, 1, 2] + y_pred = [0, 2, 1, 1, 0] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] + + metrics = _get_classifier_global_metrics( + is_binomial=False, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1, 2] + ) + expected_metrics = { + 'accuracy': 0.4, 
'example_count': 5, 'f1_score_micro': 0.4, + 'f1_score_macro': 0.38888888888888884, 'log_loss': 1.1658691395263094 + } + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_binary_get_classifier_global_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + y_probs = [[1 - p, p] for p in y_prob] + metrics = _get_classifier_global_metrics( + is_binomial=True, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1] + ) + expected_metrics = { + 'accuracy': 0.7, 'example_count': 10, 'log_loss': 0.6665822319387167 + } + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_gen_binary_precision_recall_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve(is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type='pr') + assert results.plot_fn is plot_lines + assert np.allclose(results.plot_fn_args['data_series'][0][1], np.array( + [1., 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.]), rtol=1e-3) + assert np.allclose(results.plot_fn_args['data_series'][0][2], np.array( + [0.55555556, 0.5, 0.57142857, 0.66666667, 0.6, + 0.5, 0.66666667, 1., 1.]), rtol=1e-3) + assert results.plot_fn_args['xlabel'] == 'recall' + assert results.plot_fn_args['ylabel'] == 'precision' + assert results.plot_fn_args['legend_loc'] is None + assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert np.isclose(results.auc, 0.7088888888888889, rtol=1e-3) + + +def test_gen_binary_roc_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve(is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type='roc') + assert results.plot_fn is plot_lines + assert np.allclose(results.plot_fn_args['data_series'][0][1], np.array( + [0., 0., 0.2, 0.4, 0.4, 0.8, 0.8, 1.]), rtol=1e-3) + assert np.allclose(results.plot_fn_args['data_series'][0][2], np.array( + [0., 0.2, 0.4, 0.4, 0.8, 0.8, 1., 1.]), rtol=1e-3) + assert results.plot_fn_args['xlabel'] == 'False Positive Rate' + assert results.plot_fn_args['ylabel'] == 'True Positive Rate' + assert results.plot_fn_args['legend_loc'] is None + assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert np.isclose(results.auc, 0.66, rtol=1e-3) + + +def test_gen_multiclass_precision_recall_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] + + results = _gen_classifier_curve(is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type='pr') + expected_x_data_list = [ + [1., 0., 0.], + [1., 0.5, 0.], + [1., 0.5, 0.5, 0.5, 0., 0.] + ] + expected_y_data_list = [ + [0.5, 0., 1.], + [0.66666667, 0.5, 1.], + [0.4, 0.25, 0.33333333, 0.5, 0., + 1.] 
+ ] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args['data_series']): + assert name == f'Positive Class = {index}' + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args['xlabel'] == 'recall' + assert results.plot_fn_args['ylabel'] == 'precision' + assert results.plot_fn_args['legend_loc'] == 'lower left' + assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + + expected_auc = [0.25, 0.6666666666666666, 0.2875] + assert np.allclose(results.auc, expected_auc, rtol=1e-3) + + +def test_gen_multiclass_roc_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] + + results = _gen_classifier_curve(is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type='roc') + print(results) + + expected_x_data_list = [ + [0., 0.25, 0.25, 1.], + [0., 0.33333333, 0.33333333, 1.], + [0., 0.33333333, 0.33333333, 1., 1.] + ] + expected_y_data_list = [ + [0., 0., 1., 1.], + [0., 0.5, 1., 1.], + [0., 0., 0.5, 0.5, 1.] + ] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args['data_series']): + assert name == f'Positive Class = {index}' + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args['xlabel'] == 'False Positive Rate' + assert results.plot_fn_args['ylabel'] == 'True Positive Rate' + assert results.plot_fn_args['legend_loc'] == 'lower right' + assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + + expected_auc = [0.75, 0.7500000000000001, 0.33333333333333337] + assert np.allclose(results.auc, expected_auc, rtol=1e-3) From 19d23aa26ea682ecf9cce6efa03a0fb63ba12c80 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 2 Jan 2022 21:51:04 +0800 Subject: [PATCH 101/120] black Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 104 +++++----- tests/models/test_default_evaluator.py | 191 +++++++++++------- 2 files changed, 174 insertions(+), 121 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 30178cd54255d..56345c6b518b5 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -144,11 +144,7 @@ def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): per_class_metrics_list = [] for positive_class_index, positive_class in enumerate(labels): - ( - y_bin, - y_pred_bin, - _, - ) = _get_binary_sum_up_label_pred_prob( + (y_bin, y_pred_bin, _,) = _get_binary_sum_up_label_pred_prob( positive_class_index, positive_class, y, y_pred, None ) @@ -159,13 +155,18 @@ def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): return pd.DataFrame(per_class_metrics_list) -_Curve = namedtuple('_Curve', ['plot_fn', 'plot_fn_args', 'auc']) +_Curve = namedtuple("_Curve", ["plot_fn", "plot_fn_args", "auc"]) def _gen_classifier_curve( - is_binomial, y, y_probs, labels, curve_type, + is_binomial, + y, + y_probs, + labels, + curve_type, ): - if curve_type == 'roc': + if curve_type == "roc": + def gen_x_y_thresholds_fn(_y, _y_prob): fpr, tpr, _thresholds = sk_metrics.roc_curve(_y, _y_prob) return fpr, tpr, _thresholds @@ -173,7 +174,8 @@ def gen_x_y_thresholds_fn(_y, _y_prob): xlabel = "False 
Positive Rate" ylabel = "True Positive Rate" legend_loc = "lower right" - elif curve_type == 'pr': + elif curve_type == "pr": + def gen_x_y_thresholds_fn(_y, _y_prob): precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) _thresholds = np.append(_thresholds, [1.0], axis=0) @@ -183,11 +185,11 @@ def gen_x_y_thresholds_fn(_y, _y_prob): ylabel = "precision" legend_loc = "lower left" else: - assert False, 'illegal curve type' + assert False, "illegal curve type" if is_binomial: x_data, y_data, thresholds = gen_x_y_thresholds_fn(y, y_probs) - data_series = [('positive class', x_data, y_data)] + data_series = [("positive class", x_data, y_data)] legend_loc = None auc = sk_metrics.auc(x_data, y_data) else: @@ -204,19 +206,17 @@ def gen_x_y_thresholds_fn(_y, _y_prob): (f"Positive Class = {positive_class}", x_data, y_data) for positive_class, x_data, y_data, _ in curve_list ] - auc = [ - sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list - ] + auc = [sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list] return _Curve( plot_fn=plot_lines, plot_fn_args={ - 'data_series': data_series, - 'xlabel': xlabel, - 'ylabel': ylabel, - 'legend_loc': legend_loc, - 'line_kwargs': {"drawstyle": "steps-post"}, + "data_series": data_series, + "xlabel": xlabel, + "ylabel": ylabel, + "legend_loc": legend_loc, + "line_kwargs": {"drawstyle": "steps-post"}, }, - auc=auc + auc=auc, ) @@ -292,8 +292,9 @@ def _log_model_explainability(self): ) return - if self.model_type == 'classifier' and \ - not all([isinstance(label, numbers.Number) for label in self.label_list]): + if self.model_type == "classifier" and not all( + [isinstance(label, numbers.Number) for label in self.label_list] + ): _logger.warning( "Skip logging model explainability insights because it requires all label " "values to be Number type." 
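(For orientation, a standalone sketch of the one-vs-rest computation that `_get_binary_sum_up_label_pred_prob` and the reformatted `_gen_classifier_curve` above perform for a single positive class. The toy arrays mirror the ones used in the tests in this series; this is illustrative, not code from the patch.)

import numpy as np
from sklearn import metrics as sk_metrics

y = np.array([0, 1, 2, 1, 2])                      # toy multiclass labels
y_probs = np.array([
    [0.7, 0.1, 0.2],
    [0.2, 0.3, 0.5],
    [0.25, 0.4, 0.35],
    [0.3, 0.4, 0.3],
    [0.8, 0.1, 0.1],
])                                                 # toy per-class probabilities

positive_class_index, positive_class = 1, 1
y_bin = np.where(y == positive_class, 1, 0)        # one-vs-rest labels for class 1
y_prob_bin = y_probs[:, positive_class_index]      # probability assigned to class 1

# "roc" branch: x=fpr, y=tpr, and the per-class AUC reported in the per-class metrics artifact
fpr, tpr, _ = sk_metrics.roc_curve(y_bin, y_prob_bin)
roc_auc = sk_metrics.auc(fpr, tpr)

# "pr" branch: x=recall, y=precision (thresholds padded with 1.0, as in the helper)
precision, recall, thresholds = sk_metrics.precision_recall_curve(y_bin, y_prob_bin)
thresholds = np.append(thresholds, [1.0], axis=0)
pr_auc = sk_metrics.auc(recall, precision)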
@@ -415,28 +416,33 @@ def _log_binary_classifier(self): if self.y_probs is not None: roc_curve = _gen_classifier_curve( - is_binomial=True, y=self.y, y_probs=self.y_prob, labels=self.label_list, - curve_type='roc' + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="roc", ) self._log_image_artifact( - lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), - "roc_curve_plot" + lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" ) self.metrics["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( - is_binomial=True, y=self.y, y_probs=self.y_prob, labels=self.label_list, - curve_type='pr' + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="pr", ) self._log_image_artifact( - lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), - "precision_recall_curve_plot" + lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" ) self.metrics["precision_recall_auc"] = pr_curve.auc def _log_multiclass_classifier(self): - per_class_metrics_collection_df = \ - _get_classifier_per_class_metrics_collection_df(self.y, self.y_pred, self.label_list) + per_class_metrics_collection_df = _get_classifier_per_class_metrics_collection_df( + self.y, self.y_pred, self.label_list + ) log_roc_pr_curve = False if self.y_probs is not None: @@ -455,22 +461,26 @@ def _log_multiclass_classifier(self): if log_roc_pr_curve: roc_curve = _gen_classifier_curve( - is_binomial=False, y=self.y, y_probs=self.y_probs, labels=self.label_list, - curve_type='roc' + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="roc", ) self._log_image_artifact( - lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), - "roc_curve_plot" + lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" ) per_class_metrics_collection_df["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( - is_binomial=False, y=self.y, y_probs=self.y_probs, labels=self.label_list, - curve_type='pr' + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="pr", ) self._log_image_artifact( - lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), - "precision_recall_curve_plot" + lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" ) per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc @@ -488,18 +498,18 @@ def _evaluate_classifier(self): if self.is_binomial: if list(self.label_list) not in [[0, 1], [-1, 1]]: raise ValueError( - 'Binary classifier evaluation dataset positive class label must be 1 or True, ' - 'negative class label must be 0 or -1 or False, and dataset must contains ' - 'both positive and negative examples.' + "Binary classifier evaluation dataset positive class label must be 1 or True, " + "negative class label must be 0 or -1 or False, and dataset must contains " + "both positive and negative examples." ) _logger.info( - 'The evaluation dataset is inferred as binary dataset, positive label is ' - f'{self.label_list[1]}, negative label is {self.label_list[0]}.' + "The evaluation dataset is inferred as binary dataset, positive label is " + f"{self.label_list[1]}, negative label is {self.label_list[0]}." 
) else: _logger.info( - 'The evaluation dataset is inferred as multiclass dataset, number of classes ' - f'is inferred as {self.num_classes}' + "The evaluation dataset is inferred as multiclass dataset, number of classes " + f"is inferred as {self.num_classes}" ) if self.predict_proba_fn is not None: diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 62c0e3744e443..f1945186cc6bb 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -274,17 +274,21 @@ def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): def test_infer_model_type_by_labels(): - assert _infer_model_type_by_labels(['a', 'b']) == 'classifier' - assert _infer_model_type_by_labels([1, 2.5]) == 'regressor' - assert _infer_model_type_by_labels(list(range(2000))) == 'regressor' - assert _infer_model_type_by_labels([1, 2, 3]) == 'classifier' + assert _infer_model_type_by_labels(["a", "b"]) == "classifier" + assert _infer_model_type_by_labels([1, 2.5]) == "regressor" + assert _infer_model_type_by_labels(list(range(2000))) == "regressor" + assert _infer_model_type_by_labels([1, 2, 3]) == "classifier" def test_extract_raw_model_and_predict_fn(binary_logistic_regressor_model_uri): model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) - model_loader_module, raw_model, predict_fn, predict_proba_fn = \ - _extract_raw_model_and_predict_fn(model) - assert model_loader_module == 'mlflow.sklearn' + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) + assert model_loader_module == "mlflow.sklearn" assert isinstance(raw_model, LogisticRegression) assert predict_fn == raw_model.predict assert predict_proba_fn == raw_model.predict_proba @@ -296,11 +300,15 @@ def test_get_regressor_metrics(): metrics = _get_regressor_metrics(y, y_pred) expected_metrics = { - 'example_count': 3, 'mean_absolute_error': 0.3333333333333333, - 'mean_squared_error': 0.13999999999999999, - 'root_mean_squared_error': 0.3741657386773941, 'sum_on_label': -0.2999999999999998, - 'mean_on_label': -0.09999999999999994, 'r2_score': 0.976457399103139, 'max_error': 0.5, - 'mean_absolute_percentage_error': 0.18470418470418468 + "example_count": 3, + "mean_absolute_error": 0.3333333333333333, + "mean_squared_error": 0.13999999999999999, + "root_mean_squared_error": 0.3741657386773941, + "sum_on_label": -0.2999999999999998, + "mean_on_label": -0.09999999999999994, + "r2_score": 0.976457399103139, + "max_error": 0.5, + "mean_absolute_percentage_error": 0.18470418470418468, } assert_dict_equal(metrics, expected_metrics, rtol=1e-3) @@ -312,8 +320,9 @@ def test_get_binary_sum_up_label_pred_prob(): results = [] for idx, label in enumerate([0, 1, 2]): - y_bin, y_pred_bin, y_prob_bin = \ - _get_binary_sum_up_label_pred_prob(idx, label, y, y_pred, y_probs) + y_bin, y_pred_bin, y_prob_bin = _get_binary_sum_up_label_pred_prob( + idx, label, y, y_pred, y_probs + ) results.append((list(y_bin), list(y_pred_bin), list(y_prob_bin))) print(results) @@ -329,8 +338,13 @@ def test_get_classifier_per_class_metrics(): y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] expected_metrics = { - 'true_negatives': 3, 'false_positives': 2, 'false_negatives': 1, 'true_positives': 4, - 'recall': 0.8, 'precision': 0.6666666666666666, 'f1_score': 0.7272727272727272 + "true_negatives": 3, + "false_positives": 2, + "false_negatives": 1, + "true_positives": 4, + "recall": 0.8, + "precision": 0.6666666666666666, + "f1_score": 
0.7272727272727272, } metrics = _get_classifier_per_class_metrics(y, y_pred) assert_dict_equal(metrics, expected_metrics, rtol=1e-3) @@ -339,14 +353,23 @@ def test_get_classifier_per_class_metrics(): def test_multiclass_get_classifier_global_metrics(): y = [0, 1, 2, 1, 2] y_pred = [0, 2, 1, 1, 0] - y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] metrics = _get_classifier_global_metrics( is_binomial=False, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1, 2] ) expected_metrics = { - 'accuracy': 0.4, 'example_count': 5, 'f1_score_micro': 0.4, - 'f1_score_macro': 0.38888888888888884, 'log_loss': 1.1658691395263094 + "accuracy": 0.4, + "example_count": 5, + "f1_score_micro": 0.4, + "f1_score_macro": 0.38888888888888884, + "log_loss": 1.1658691395263094, } assert_dict_equal(metrics, expected_metrics, 1e-3) @@ -359,9 +382,7 @@ def test_binary_get_classifier_global_metrics(): metrics = _get_classifier_global_metrics( is_binomial=True, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1] ) - expected_metrics = { - 'accuracy': 0.7, 'example_count': 10, 'log_loss': 0.6665822319387167 - } + expected_metrics = {"accuracy": 0.7, "example_count": 10, "log_loss": 0.6665822319387167} assert_dict_equal(metrics, expected_metrics, 1e-3) @@ -369,17 +390,24 @@ def test_gen_binary_precision_recall_curve(): y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] - results = _gen_classifier_curve(is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type='pr') + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="pr" + ) assert results.plot_fn is plot_lines - assert np.allclose(results.plot_fn_args['data_series'][0][1], np.array( - [1., 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.]), rtol=1e-3) - assert np.allclose(results.plot_fn_args['data_series'][0][2], np.array( - [0.55555556, 0.5, 0.57142857, 0.66666667, 0.6, - 0.5, 0.66666667, 1., 1.]), rtol=1e-3) - assert results.plot_fn_args['xlabel'] == 'recall' - assert results.plot_fn_args['ylabel'] == 'precision' - assert results.plot_fn_args['legend_loc'] is None - assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([1.0, 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.55555556, 0.5, 0.57142857, 0.66666667, 0.6, 0.5, 0.66666667, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["legend_loc"] is None + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} assert np.isclose(results.auc, 0.7088888888888889, rtol=1e-3) @@ -387,44 +415,55 @@ def test_gen_binary_roc_curve(): y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] - results = _gen_classifier_curve(is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type='roc') + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="roc" + ) assert results.plot_fn is plot_lines - assert np.allclose(results.plot_fn_args['data_series'][0][1], np.array( - [0., 0., 0.2, 0.4, 0.4, 0.8, 0.8, 1.]), rtol=1e-3) - assert 
np.allclose(results.plot_fn_args['data_series'][0][2], np.array( - [0., 0.2, 0.4, 0.4, 0.8, 0.8, 1., 1.]), rtol=1e-3) - assert results.plot_fn_args['xlabel'] == 'False Positive Rate' - assert results.plot_fn_args['ylabel'] == 'True Positive Rate' - assert results.plot_fn_args['legend_loc'] is None - assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([0.0, 0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["legend_loc"] is None + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} assert np.isclose(results.auc, 0.66, rtol=1e-3) def test_gen_multiclass_precision_recall_curve(): y = [0, 1, 2, 1, 2] - y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] - - results = _gen_classifier_curve(is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type='pr') - expected_x_data_list = [ - [1., 0., 0.], - [1., 0.5, 0.], - [1., 0.5, 0.5, 0.5, 0., 0.] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], ] + + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="pr" + ) + expected_x_data_list = [[1.0, 0.0, 0.0], [1.0, 0.5, 0.0], [1.0, 0.5, 0.5, 0.5, 0.0, 0.0]] expected_y_data_list = [ - [0.5, 0., 1.], - [0.66666667, 0.5, 1.], - [0.4, 0.25, 0.33333333, 0.5, 0., - 1.] + [0.5, 0.0, 1.0], + [0.66666667, 0.5, 1.0], + [0.4, 0.25, 0.33333333, 0.5, 0.0, 1.0], ] - for index, (name, x_data, y_data) in enumerate(results.plot_fn_args['data_series']): - assert name == f'Positive Class = {index}' + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == f"Positive Class = {index}" assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) - assert results.plot_fn_args['xlabel'] == 'recall' - assert results.plot_fn_args['ylabel'] == 'precision' - assert results.plot_fn_args['legend_loc'] == 'lower left' - assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["legend_loc"] == "lower left" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} expected_auc = [0.25, 0.6666666666666666, 0.2875] assert np.allclose(results.auc, expected_auc, rtol=1e-3) @@ -432,30 +471,34 @@ def test_gen_multiclass_precision_recall_curve(): def test_gen_multiclass_roc_curve(): y = [0, 1, 2, 1, 2] - y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35], [0.3, 0.4, 0.3], [0.8, 0.1, 0.1]] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] - results = _gen_classifier_curve(is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type='roc') + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="roc" + ) print(results) expected_x_data_list = [ - [0., 0.25, 0.25, 1.], - [0., 0.33333333, 0.33333333, 1.], - [0., 0.33333333, 0.33333333, 1., 1.] 
- ] - expected_y_data_list = [ - [0., 0., 1., 1.], - [0., 0.5, 1., 1.], - [0., 0., 0.5, 0.5, 1.] + [0.0, 0.25, 0.25, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0, 1.0], ] - for index, (name, x_data, y_data) in enumerate(results.plot_fn_args['data_series']): - assert name == f'Positive Class = {index}' + expected_y_data_list = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5, 1.0]] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == f"Positive Class = {index}" assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) - assert results.plot_fn_args['xlabel'] == 'False Positive Rate' - assert results.plot_fn_args['ylabel'] == 'True Positive Rate' - assert results.plot_fn_args['legend_loc'] == 'lower right' - assert results.plot_fn_args['line_kwargs'] == {'drawstyle': 'steps-post'} + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["legend_loc"] == "lower right" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} expected_auc = [0.75, 0.7500000000000001, 0.33333333333333337] assert np.allclose(results.auc, expected_auc, rtol=1e-3) From 74cc310ad6f6abac4e9b430ab81d3d030aadf315 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 4 Jan 2022 17:52:10 +0800 Subject: [PATCH 102/120] update Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 2 +- mlflow/models/evaluation/base.py | 8 ++++++++ requirements/small-requirements.txt | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py index 656fcf9c059a8..3552a4e7cfaad 100644 --- a/mlflow/models/evaluation/artifacts.py +++ b/mlflow/models/evaluation/artifacts.py @@ -1,4 +1,3 @@ -from PIL.Image import open as open_image import pandas as pd from mlflow.models.evaluation.base import EvaluationArtifact @@ -9,6 +8,7 @@ def save(self, output_artifact_path): self._content.save(output_artifact_path) def _load_content_from_file(self, local_artifact_path): + from PIL.Image import open as open_image self._content = open_image(local_artifact_path) return self._content diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index ef8bd9a5548da..954445b75c49b 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -597,6 +597,11 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) return evaluator_name_list, evaluator_name_to_conf_map +# This variable holds the last running evaluator name. This can be used to +# check which evaluator fail when `evaluate` API fail. 
+_last_evaluator = None + + def _evaluate( model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map ): @@ -613,6 +618,8 @@ def _evaluate( eval_results = [] for evaluator_name in evaluator_name_list: + global _last_evaluator + config = evaluator_name_to_conf_map.get(evaluator_name) or {} try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) @@ -620,6 +627,7 @@ def _evaluate( _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") continue + _last_evaluator = evaluator_name if evaluator.can_evaluate(model_type, config): _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) diff --git a/requirements/small-requirements.txt b/requirements/small-requirements.txt index 855d6a3052473..8ea020af4dc37 100644 --- a/requirements/small-requirements.txt +++ b/requirements/small-requirements.txt @@ -13,3 +13,4 @@ moto!=2.0.7 azure-storage-blob>=12.0.0 azure-identity>=1.6.1 databricks-cli@git+https://github.com/databricks/databricks-cli.git +pillow From 2924f8c795336ad9afdcab29da1ab9309750bbdf Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Tue, 4 Jan 2022 20:59:55 +0800 Subject: [PATCH 103/120] increase plot dpi Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 21 +++++++++++++------ mlflow/models/evaluation/default_evaluator.py | 12 +++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 954445b75c49b..5da827a4abf9b 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -597,9 +597,15 @@ def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map) return evaluator_name_list, evaluator_name_to_conf_map -# This variable holds the last running evaluator name. This can be used to -# check which evaluator fail when `evaluate` API fail. -_last_evaluator = None +_last_failed_evaluator = None + + +def _get_last_failed_evaluator(): + """ + Return the evaluator name of the last failed evaluator when calling `evalaute`. + This can be used to check which evaluator fail when `evaluate` API fail. 
+ """ + return _last_failed_evaluator def _evaluate( @@ -612,14 +618,15 @@ def _evaluate( # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + global _last_failed_evaluator + _last_failed_evaluator = None + client = mlflow.tracking.MlflowClient() model_uuid = model.metadata.model_uuid dataset._log_dataset_tag(client, actual_run_id, model_uuid) eval_results = [] for evaluator_name in evaluator_name_list: - global _last_evaluator - config = evaluator_name_to_conf_map.get(evaluator_name) or {} try: evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) @@ -627,12 +634,14 @@ def _evaluate( _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") continue - _last_evaluator = evaluator_name + _last_failed_evaluator = evaluator_name if evaluator.can_evaluate(model_type, config): _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) eval_results.append(result) + _last_failed_evaluator = None + if len(eval_results) == 0: raise ValueError( "The model could not be evaluated by any of the registered evaluators, please " diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 56345c6b518b5..e6615adc19b00 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -220,6 +220,17 @@ def gen_x_y_thresholds_fn(_y, _y_prob): ) +_matplotlib_initialized = False + +def _init_matplotlib(): + global _matplotlib_initialized + if not _matplotlib_initialized: + import matplotlib.pyplot as pyplot + pyplot.rcParams['figure.dpi'] = 144 + pyplot.rcParams['figure.figsize'] = [6.0, 4.0] + _matplotlib_initialized = True + + class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -578,6 +589,7 @@ def evaluate( **kwargs, ): with TempDir() as temp_dir: + _init_matplotlib() self.client = mlflow.tracking.MlflowClient() self.temp_dir = temp_dir From cf5d095a7a13c6a4dc5c57ed57dcd64e065ef7cd Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 16:42:23 +0800 Subject: [PATCH 104/120] fix test fixture Signed-off-by: Weichen Xu --- tests/models/test_evaluation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 0ecd87d8a5d1f..1357efe262f55 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -123,7 +123,7 @@ def breast_cancer_dataset(): return EvaluationDataset(data=eval_X, labels=eval_y, name="breast_cancer_dataset") -@pytest.fixture(scope="module") +@pytest.fixture def linear_regressor_model_uri(): X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() @@ -136,7 +136,7 @@ def linear_regressor_model_uri(): return linear_regressor_model_uri -@pytest.fixture(scope="module") +@pytest.fixture def spark_linear_regressor_model_uri(): spark_df = get_diabetes_spark_dataset() reg = SparkLinearRegression() @@ -149,7 +149,7 @@ def spark_linear_regressor_model_uri(): return spark_linear_regressor_model_uri -@pytest.fixture(scope="module") +@pytest.fixture def multiclass_logistic_regressor_model_uri(): X, y = get_iris() clf = sklearn.linear_model.LogisticRegression(max_iter=2) @@ -162,7 +162,7 @@ def 
multiclass_logistic_regressor_model_uri(): return multiclass_logistic_regressor_model_uri -@pytest.fixture(scope="module") +@pytest.fixture def binary_logistic_regressor_model_uri(): X, y = get_breast_cancer_dataset() clf = sklearn.linear_model.LogisticRegression() @@ -175,7 +175,7 @@ def binary_logistic_regressor_model_uri(): return binary_logistic_regressor_model_uri -@pytest.fixture(scope="module") +@pytest.fixture def svm_model_uri(): X, y = get_breast_cancer_dataset() clf = sklearn.svm.LinearSVC() @@ -188,7 +188,7 @@ def svm_model_uri(): return svm_model_uri -@pytest.fixture(scope="module") +@pytest.fixture def iris_pandas_df_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] From 923a6a962c09ba8858bef0fedbff194672b3d86b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 17:02:42 +0800 Subject: [PATCH 105/120] fix pylint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 1 + mlflow/models/evaluation/base.py | 3 +- mlflow/models/evaluation/default_evaluator.py | 47 ++++++++++++------- mlflow/models/evaluation/lift_curve.py | 2 +- mlflow/models/utils.py | 5 +- tests/models/test_default_evaluator.py | 25 ++++------ 6 files changed, 47 insertions(+), 36 deletions(-) diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py index 3552a4e7cfaad..6343d6e15aa48 100644 --- a/mlflow/models/evaluation/artifacts.py +++ b/mlflow/models/evaluation/artifacts.py @@ -9,6 +9,7 @@ def save(self, output_artifact_path): def _load_content_from_file(self, local_artifact_path): from PIL.Image import open as open_image + self._content = open_image(local_artifact_path) return self._content diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 5da827a4abf9b..2f706c10d80f8 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -713,7 +713,8 @@ def evaluate( be a spark DataFrame contains a feature column of "Vector" type, and a label column. - For classifier, evaluation dataset labels must contains all distinct values, the dataset labels data will be used to infer the number of classes. For binary classifier, the - negative label value must be 0 or -1 or False, and the positive label value must be 1 or True. + negative label value must be 0 or -1 or False, and the positive label value must be + 1 or True. For multiclass classifier, if logging explainability insights enabled, the label values must be number type. 
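To make the label requirements spelled out above concrete, a small usage sketch follows; it assumes the `EvaluationDataset`/`evaluate` API of this patch series, and `model_uri` is a hypothetical URI of an already-logged pyfunc model:

    import numpy as np
    from mlflow.models.evaluation import evaluate, EvaluationDataset

    # Binary case: the negative label must be 0 (or -1/False), the positive label must be
    # 1 (or True), and both classes must appear in the evaluation data.
    eval_X = np.array([[0.1, 1.2], [0.8, 0.3], [0.5, 0.9], [0.9, 0.1]])
    eval_y = np.array([0, 1, 0, 1])

    dataset = EvaluationDataset(data=eval_X, labels=eval_y, name="toy_binary_dataset")

    # evaluate() would infer a binary dataset from these labels; model_uri is a placeholder.
    # result = evaluate(model_uri, model_type="classifier", dataset=dataset, evaluators="default")
    # print(result.metrics)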
diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index e6615adc19b00..a37d2d722cc2f 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -222,15 +222,18 @@ def gen_x_y_thresholds_fn(_y, _y_prob): _matplotlib_initialized = False + def _init_matplotlib(): global _matplotlib_initialized if not _matplotlib_initialized: import matplotlib.pyplot as pyplot - pyplot.rcParams['figure.dpi'] = 144 - pyplot.rcParams['figure.figsize'] = [6.0, 4.0] + + pyplot.rcParams["figure.dpi"] = 144 + pyplot.rcParams["figure.figsize"] = [6.0, 4.0] _matplotlib_initialized = True +# pylint: disable=attribute-defined-outside-init class DefaultEvaluator(ModelEvaluator): def can_evaluate(self, model_type, evaluator_config=None, **kwargs): return model_type in ["classifier", "regressor"] @@ -433,9 +436,11 @@ def _log_binary_classifier(self): labels=self.label_list, curve_type="roc", ) - self._log_image_artifact( - lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" - ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") self.metrics["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( @@ -445,9 +450,11 @@ def _log_binary_classifier(self): labels=self.label_list, curve_type="pr", ) - self._log_image_artifact( - lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" - ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") self.metrics["precision_recall_auc"] = pr_curve.auc def _log_multiclass_classifier(self): @@ -464,10 +471,10 @@ def _log_multiclass_classifier(self): log_roc_pr_curve = True else: _logger.warning( - f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip logging " - f"ROC curve and Precision-Recall curve. You can add evaluator config " - f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' to " - f"increase the threshold." + f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip " + f"logging ROC curve and Precision-Recall curve. You can add evaluator config " + f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' " + f"to increase the threshold." 
) if log_roc_pr_curve: @@ -478,9 +485,11 @@ def _log_multiclass_classifier(self): labels=self.label_list, curve_type="roc", ) - self._log_image_artifact( - lambda: roc_curve.plot_fn(**roc_curve.plot_fn_args), "roc_curve_plot" - ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") per_class_metrics_collection_df["roc_auc"] = roc_curve.auc pr_curve = _gen_classifier_curve( @@ -490,9 +499,11 @@ def _log_multiclass_classifier(self): labels=self.label_list, curve_type="pr", ) - self._log_image_artifact( - lambda: pr_curve.plot_fn(**pr_curve.plot_fn_args), "precision_recall_curve_plot" - ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc self._log_pandas_df_artifact(per_class_metrics_collection_df, "per_class_metrics") diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py index 6961adb791576..168cf1849e5ed 100644 --- a/mlflow/models/evaluation/lift_curve.py +++ b/mlflow/models/evaluation/lift_curve.py @@ -147,7 +147,7 @@ def plot_lift_curve( gains2 = gains2 / percentages if ax is None: - fig, ax = plt.subplots(1, 1, figsize=figsize) + _, ax = plt.subplots(1, 1, figsize=figsize) ax.set_title(title, fontsize=title_fontsize) diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index d471d4d8551be..39b30c98db229 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -236,11 +236,14 @@ def _read_sparse_matrix_from_json(path, example_type): return csr_matrix((data, indices, indptr), shape=shape) -def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs={}): +def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs=None): import matplotlib.pyplot as plt fig, ax = plt.subplots() + if line_kwargs is None: + line_kwargs = {} + for label, data_x, data_y in data_series: ax.plot(data_x, data_y, label=label, **line_kwargs) diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index f1945186cc6bb..d70ea3321266b 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -1,15 +1,10 @@ import numpy as np import json -import math -import sklearn.metrics -from mlflow.models.evaluation import evaluate, EvaluationDataset +from mlflow.models.evaluation import evaluate from mlflow.models.evaluation.default_evaluator import ( - _get_regressor_metrics, _get_classifier_global_metrics, - _get_classifier_per_class_metrics, - _extract_raw_model_and_predict_fn, _infer_model_type_by_labels, _extract_raw_model_and_predict_fn, _get_regressor_metrics, @@ -20,6 +15,7 @@ import mlflow from sklearn.linear_model import LogisticRegression +# pylint: disable=unused-import from tests.models.test_evaluation import ( get_run_data, linear_regressor_model_uri, @@ -31,7 +27,6 @@ spark_linear_regressor_model_uri, diabetes_spark_dataset, svm_model_uri, - breast_cancer_dataset, ) from mlflow.models.utils import plot_lines @@ -52,7 +47,7 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): ) print(f"regressor evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = get_run_data(run.info.run_id) + _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(linear_regressor_model_uri) @@ -94,11 +89,11 @@ def 
test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir ) print(f"multi-classifier evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = get_run_data(run.info.run_id) + _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) - _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) y = iris_dataset.labels y_pred = predict_fn(iris_dataset.data) y_probs = predict_proba_fn(iris_dataset.data) @@ -146,11 +141,11 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c ) print(f"bin-classifier evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = get_run_data(run.info.run_id) + _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) - _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) y = breast_cancer_dataset.labels y_pred = predict_fn(breast_cancer_dataset.data) y_probs = predict_proba_fn(breast_cancer_dataset.data) @@ -200,7 +195,7 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab ) print(f"spark model evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = get_run_data(run.info.run_id) + _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) @@ -237,11 +232,11 @@ def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): ) print(f"svm evaluation run: {run.info.run_id}") - params, metrics, tags, artifacts = get_run_data(run.info.run_id) + _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(svm_model_uri) - _, raw_model, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) y = breast_cancer_dataset.labels y_pred = predict_fn(breast_cancer_dataset.data) From f899fcfe8f2902e1550285ef6574acfdacf78a4b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 17:57:24 +0800 Subject: [PATCH 106/120] update doc Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 44 ++++++++++++++++--- mlflow/models/evaluation/default_evaluator.py | 14 ++++++ requirements/small-requirements.txt | 1 + tests/models/test_default_evaluator.py | 2 +- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 2f706c10d80f8..1d6ebfec0b92f 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -693,16 +693,39 @@ def evaluate( :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing evaluation results. - The default evaluator supports the 'regressor' and 'classifer' `model_type`s. The available - `evaluator_config` options for the default evaluator include: - - log_model_explainability: A boolean value specifying whether or not to log model + The default evaluator supports the 'regressor' and 'classifer' model types. + + For both the 'regressor' and 'classifer' types, the default evaluator will generate model + summary plots and feature importance plots generated by shap explainer. 
+ + For regressor model, the default evaluator will additionally log: + + - **metrics**: example_count, mean_absolute_error, mean_squared_error, root_mean_squared_error, + sum_on_label, mean_on_label, r2_score, max_error, mean_absolute_percentage_error. + + For binary classifier, the default evaluator will additionally log: + + - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, precision, + f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc. + - **artifacts**: lift curve plot, precision-recall plot, ROC plot. + + For multiclass classifier, the default evaluator will additionally log: + + - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss + - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics includes true_negatives/ + false_positives/false_negatives/true_positives/recall/precision/roc_auc, + precision_recall_auc), precision-recall merged curves plot and ROC merged curves plot. + + The available `evaluator_config` options for the default evaluator include: + + - **log_model_explainability**: A boolean value specifying whether or not to log model explainability insights, default value is True. - - explainability_algorithm: A string to specify the SHAP Explainer algorithm for model + - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model explainability. If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best Explainer based on the model. - - explainability_nsamples: The number of sample rows to use for computing model + - **explainability_nsamples**: The number of sample rows to use for computing model explainability insights. Default value is 2000. - - max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier: + - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**: For multiclass classifier, specify the max number of classes which allow logging per-class ROC curve and Precision-Recall curve. @@ -710,7 +733,8 @@ def evaluate( - If the input dataset is pandas dataframe, the feature columns in pandas dataframe must be scalar value columns, other object types (nd.array/list/etc.) are not supported yet. - If the mlflow model to be evaluated is a pyspark ML model, then the input data must - be a spark DataFrame contains a feature column of "Vector" type, and a label column. + be a spark DataFrame or pandas DataFrame contains a feature column with values of type + "pyspark.ml.linalg.Vector", and a label column. - For classifier, evaluation dataset labels must contains all distinct values, the dataset labels data will be used to infer the number of classes. For binary classifier, the negative label value must be 0 or -1 or False, and the positive label value must be @@ -718,6 +742,12 @@ def evaluate( For multiclass classifier, if logging explainability insights enabled, the label values must be number type. + Limitations of metrics/artifacts computation: + - For classifier, some metrics and plot computation require model provides + "predict probability" function. Currently, for sklearn model, we will extract "predict_proba" + method from the raw model to achieve this, for other model, it will skip logging + metrics/artifacts which require probability prediction. + Limitations of default evaluator logging model explainability insights: - The `shap.Explainer` "auto" algorithm will choose Linear explainer for linear model, and choose Tree explainer for tree model. 
But the shap Linear/Tree explainer does not diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index a37d2d722cc2f..8e83a4c601ece 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -110,6 +110,7 @@ def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, def _get_classifier_per_class_metrics(y, y_pred): """ + get classifier metrics which computing over a specific class. For binary classifier, y/y_pred is for the positive class. For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". """ @@ -127,6 +128,9 @@ def _get_classifier_per_class_metrics(y, y_pred): def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): + """ + get classifier metrics which computing over all classes examples. + """ metrics = {} metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) metrics["example_count"] = len(y) @@ -165,6 +169,16 @@ def _gen_classifier_curve( labels, curve_type, ): + """ + Generate precision-recall curve or ROC curve for classifier. + :param is_binomial: True if it is binary classifier otherwise False + :param y: True label values + :param y_probs: if binary classifer, the predicted probability for positive class. + if multiclass classiifer, the predicted probabilities for all classes. + :param labels: The set of labels. + :param curve_type: "pr" or "roc" + :return: An instance of "_Curve" which includes attributes "plot_fn", "plot_fn_args", "auc". + """ if curve_type == "roc": def gen_x_y_thresholds_fn(_y, _y_prob): diff --git a/requirements/small-requirements.txt b/requirements/small-requirements.txt index 8ea020af4dc37..335f227677590 100644 --- a/requirements/small-requirements.txt +++ b/requirements/small-requirements.txt @@ -14,3 +14,4 @@ azure-storage-blob>=12.0.0 azure-identity>=1.6.1 databricks-cli@git+https://github.com/databricks/databricks-cli.git pillow +matplotlib diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index d70ea3321266b..b85909ca0af8e 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -93,7 +93,7 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) - _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) y = iris_dataset.labels y_pred = predict_fn(iris_dataset.data) y_probs = predict_proba_fn(iris_dataset.data) From fd6465bc227944a81d6ad46e10d5422ad5ac74f3 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 20:06:23 +0800 Subject: [PATCH 107/120] use matplot rc_context Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 6 +++--- mlflow/models/evaluation/default_evaluator.py | 19 ++++++------------- tests/models/test_default_evaluator.py | 9 ++------- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1d6ebfec0b92f..e01734ce9ba04 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -705,8 +705,8 @@ def evaluate( For binary classifier, the default evaluator will additionally log: - - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, precision, - f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc. 
+ - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall, + precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc. - **artifacts**: lift curve plot, precision-recall plot, ROC plot. For multiclass classifier, the default evaluator will additionally log: @@ -714,7 +714,7 @@ def evaluate( - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss - **artifacts**: A CSV file for "per_class_metrics" (per-class metrics includes true_negatives/ false_positives/false_negatives/true_positives/recall/precision/roc_auc, - precision_recall_auc), precision-recall merged curves plot and ROC merged curves plot. + precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. The available `evaluator_config` options for the default evaluator include: diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 8e83a4c601ece..c9fca9a5569b2 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -234,17 +234,10 @@ def gen_x_y_thresholds_fn(_y, _y_prob): ) -_matplotlib_initialized = False - - -def _init_matplotlib(): - global _matplotlib_initialized - if not _matplotlib_initialized: - import matplotlib.pyplot as pyplot - - pyplot.rcParams["figure.dpi"] = 144 - pyplot.rcParams["figure.figsize"] = [6.0, 4.0] - _matplotlib_initialized = True +_matplotlib_config = { + "figure.dpi": 144, + "figure.figsize": [6.0, 4.0], +} # pylint: disable=attribute-defined-outside-init @@ -613,8 +606,8 @@ def evaluate( evaluator_config, **kwargs, ): - with TempDir() as temp_dir: - _init_matplotlib() + import matplotlib + with TempDir() as temp_dir, matplotlib.rc_context(_matplotlib_config): self.client = mlflow.tracking.MlflowClient() self.temp_dir = temp_dir diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index b85909ca0af8e..75cac13c075a3 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -45,7 +45,6 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): dataset=diabetes_dataset, evaluators="default", ) - print(f"regressor evaluation run: {run.info.run_id}") _, metrics, tags, artifacts = get_run_data(run.info.run_id) @@ -87,13 +86,12 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir dataset=iris_dataset, evaluators="default", ) - print(f"multi-classifier evaluation run: {run.info.run_id}") _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) - _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) y = iris_dataset.labels y_pred = predict_fn(iris_dataset.data) y_probs = predict_proba_fn(iris_dataset.data) @@ -139,7 +137,6 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c dataset=breast_cancer_dataset, evaluators="default", ) - print(f"bin-classifier evaluation run: {run.info.run_id}") _, metrics, tags, artifacts = get_run_data(run.info.run_id) @@ -193,7 +190,6 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab evaluators="default", evaluator_config={"log_model_explainability": True}, ) - print(f"spark model evaluation run: {run.info.run_id}") _, metrics, tags, artifacts = get_run_data(run.info.run_id) @@ -230,13 +226,12 @@ def 
test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): dataset=breast_cancer_dataset, evaluators="default", ) - print(f"svm evaluation run: {run.info.run_id}") _, metrics, tags, artifacts = get_run_data(run.info.run_id) model = mlflow.pyfunc.load_model(svm_model_uri) - _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) y = breast_cancer_dataset.labels y_pred = predict_fn(breast_cancer_dataset.data) From 2f900493723209f95d13b42b2a42154558405e68 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 20:10:33 +0800 Subject: [PATCH 108/120] fix shap import Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 1 + requirements/small-requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index c9fca9a5569b2..0b169e73ae1bd 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -330,6 +330,7 @@ def _log_model_explainability(self): "SHAP or matplotlib package is not installed, so model explainability insights " "will not be logged." ) + return if Version(shap.__version__) < Version("0.40"): _logger.warning( diff --git a/requirements/small-requirements.txt b/requirements/small-requirements.txt index 335f227677590..18424b787c6fc 100644 --- a/requirements/small-requirements.txt +++ b/requirements/small-requirements.txt @@ -15,3 +15,4 @@ azure-identity>=1.6.1 databricks-cli@git+https://github.com/databricks/databricks-cli.git pillow matplotlib +shap>=0.40 From 4c4e90987ad66413b39bfd3e134da72bb0712b08 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 23:14:30 +0800 Subject: [PATCH 109/120] refactor EvaluationDataset Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 306 ++++++++---------- mlflow/models/evaluation/default_evaluator.py | 8 +- tests/models/test_default_evaluator.py | 23 +- tests/models/test_evaluation.py | 55 ++-- .../mlflow_test_plugin/dummy_evaluator.py | 3 +- 5 files changed, 189 insertions(+), 206 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index e01734ce9ba04..0b7845a48324a 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -155,6 +155,72 @@ def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: _cached_mlflow_client = None +def _convert_uint64_ndarray_to_bytes(array): + assert len(array.shape) == 1 + # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings + return struct.pack(f">{array.size}Q", *array) + + +def _hash_ndarray_as_bytes(nd_array): + from pandas.util import hash_array + import numpy as np + + return _convert_uint64_ndarray_to_bytes( + hash_array(nd_array.flatten(order="C")) + ) + _convert_uint64_ndarray_to_bytes(np.array(nd_array.shape, dtype="uint64")) + + +def _array_like_obj_to_bytes(data, spark_vector_type): + """ + Helper method to convert pandas dataframe/numpy array/list into bytes for + MD5 calculation purpose. 
+ """ + from pandas.util import hash_pandas_object + import numpy as np + import pandas as pd + + if isinstance(data, pd.DataFrame): + + def _hash_array_like_element_as_bytes(v): + if spark_vector_type is not None: + if isinstance(v, spark_vector_type): + return _hash_ndarray_as_bytes(v.toArray()) + if isinstance(v, np.ndarray): + return _hash_ndarray_as_bytes(v) + if isinstance(v, list): + return _hash_ndarray_as_bytes(np.array(v)) + return v + + data = data.applymap(_hash_array_like_element_as_bytes) + return _convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) + elif isinstance(data, np.ndarray): + return _hash_ndarray_as_bytes(data) + elif isinstance(data, list): + return _hash_ndarray_as_bytes(np.array(data)) + else: + raise ValueError("Unsupported data type.") + + +def _gen_md5_for_arraylike_obj(md5_gen, data, spark_vector_type): + """ + Helper method to generate MD5 hash array-like object, the MD5 will calculate over: + - array length + - first NUM_SAMPLE_ROWS_FOR_HASH rows content + - last NUM_SAMPLE_ROWS_FOR_HASH rows content + """ + import numpy as np + + len_bytes = _convert_uint64_ndarray_to_bytes(np.array([len(data)], dtype="uint64")) + md5_gen.update(len_bytes) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(_array_like_obj_to_bytes(data, spark_vector_type)) + else: + head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + md5_gen.update(_array_like_obj_to_bytes(head_rows, spark_vector_type)) + md5_gen.update(_array_like_obj_to_bytes(tail_rows, spark_vector_type)) + + class EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the @@ -193,58 +259,47 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): import numpy as np import pandas as pd - # TODO: - # for pandas.DataFrame input, check column type and raise error if unsupported column - # found - # For spark DataFrame input, support feature column with `pyspark.ml.Vector` column type. + if name is not None and '"' in name: + raise ValueError(f'Dataset name cannot include a double quote (") but got {name}') + if path is not None and '"' in path: + raise ValueError(f'Dataset path cannot include a double quote (") but got {path}') + + self._user_specified_name = name + self._path = path + self._hash = None + try: # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. 
if "pyspark" in sys.modules: from pyspark.sql import DataFrame as SparkDataFrame - from pyspark.ml.linalg import VectorUDT as SparkVectorUDT + from pyspark.ml.linalg import Vector as SparkMLVector supported_dataframe_types = (pd.DataFrame, SparkDataFrame) - self._spark_df_type = SparkDataFrame - self._spark_vector_type = SparkVectorUDT + spark_df_type = SparkDataFrame + spark_vector_type = SparkMLVector else: supported_dataframe_types = (pd.DataFrame,) - self._spark_df_type = None + spark_df_type = None + spark_vector_type = None except ImportError: supported_dataframe_types = (pd.DataFrame,) - if name is not None and '"' in name: - raise ValueError(f'Dataset name cannot include a double quote (") but got {name}') - if path is not None and '"' in path: - raise ValueError(f'Dataset path cannot include a double quote (") but got {path}') - if isinstance(data, (np.ndarray, list)): if not isinstance(labels, (np.ndarray, list)): raise ValueError( "If data is a numpy array or list of evaluation features, " "labels must be a numpy array or list of evaluation labels" ) - elif isinstance(data, supported_dataframe_types): - if not isinstance(labels, str): - raise ValueError( - "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " - "string name of a column from `data` that contains evaluation labels" - ) - else: - raise ValueError( - "The data argument must be a numpy array, a list or a Pandas DataFrame, or " - "spark DataFrame if pyspark package installed." - ) - - self._user_specified_name = name - self._original_data = data - self._data = None - self.labels = labels - self.path = path - self._hash = None - - if isinstance(self.data, np.ndarray): - num_features = self.data.shape[1] + self._features_data = data + self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) + if isinstance(data, list): + num_features = len(data[0]) + else: + if len(data.shape) > 1: + num_features = data.shape[1] + else: + num_features = len(data[0]) if feature_names is not None: feature_names = list(feature_names) if num_features != len(feature_names): @@ -255,113 +310,61 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): f"feature_{str(i).zfill(math.ceil((math.log10(num_features))))}" for i in range(num_features) ] - else: - pd_column_names = [c for c in self.data.columns if c != self.labels] - if feature_names is not None: - feature_names = list(feature_names) - if pd_column_names != list(feature_names): - raise ValueError( - "feature names must match feature column names in the pandas " "dataframe" - ) - self._feature_names = pd_column_names - - @property - def data(self): - """ - Return original data if data is numpy array or pandas dataframe, - For spark dataframe, will only return the first SPARK_DATAFRAME_LIMIT rows as pandas - dataframe and emit warning. - """ - if self._data is not None: - return self._data - - if self._spark_df_type and isinstance(self._original_data, self._spark_df_type): - self._data = self._original_data.limit( - EvaluationDataset.SPARK_DATAFRAME_LIMIT - ).toPandas() - _logger.warning( - f"Specified Spark DataFrame is too large for model evaluation. Only " - f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." - ) - else: - self._data = self._original_data - - return self._data - - def _extract_features_and_labels(self): - """ - Extract features data and labels data. - For spark dataframe, will only extract the first SPARK_DATAFRAME_LIMIT rows data - and emit warning. 
- """ - import numpy as np + elif isinstance(data, supported_dataframe_types): + if not isinstance(labels, str): + raise ValueError( + "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " + "string name of a column from `data` that contains evaluation labels" + ) + if isinstance(data, spark_df_type): + _logger.warning( + f"Specified Spark DataFrame is too large for model evaluation. Only " + f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + "If you want evaluate on the whole spark dataframe, please manually call " + "`spark_dataframe.toPandas()`." + ) + data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() - if isinstance(self.data, np.ndarray): - return self.data, self.labels + self._features_data = data.drop(labels, axis=1, inplace=False) + self._labels_data = data[labels].to_numpy() + if feature_names is not None: + raise ValueError( + "If `data` argument is pandas/spark dataframe, you cannot specify the " + "`feature_names` argument, instead, the column names of the input " + "dataframe will be used as the feature names." + ) + self._feature_names = list(self._features_data.columns) else: - return ( - self.data.drop(self.labels, axis=1, inplace=False), - self.data[self.labels].to_numpy(), + raise ValueError( + "The data argument must be a numpy array, a list or a Pandas DataFrame, or " + "spark DataFrame if pyspark package installed." ) - @staticmethod - def _convert_uint64_ndarray_to_bytes(array): - assert len(array.shape) == 1 - # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings - return struct.pack(f">{array.size}Q", *array) + # generate dataset hash + md5_gen = hashlib.md5() + _gen_md5_for_arraylike_obj(md5_gen, self._features_data, spark_vector_type) + _gen_md5_for_arraylike_obj(md5_gen, self._labels_data, spark_vector_type) + md5_gen.update(",".join(self._feature_names).encode("UTF-8")) - @staticmethod - def _array_like_obj_to_bytes(data): - """ - Helper method to convert pandas dataframe/numpy array/list into bytes for - MD5 calculation purpose. - """ - from pandas.util import hash_pandas_object, hash_array - import numpy as np - import pandas as pd + self._hash = md5_gen.hexdigest() - if isinstance(data, pd.DataFrame): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) - elif isinstance(data, np.ndarray): - return EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_array(data.flatten(order="C")) - ) - elif isinstance(data, list): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_array(np.array(data))) - else: - raise ValueError("Unsupported data type.") + @property + def feature_names(self): + return self._feature_names - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): + @property + def features_data(self): """ - Helper method to generate MD5 hash array-like object, the MD5 will calculate over: - - array length - - first NUM_SAMPLE_ROWS_FOR_HASH rows content - - last NUM_SAMPLE_ROWS_FOR_HASH rows content + return features data as a numpy array or a pandas DataFrame. 
""" - import numpy as np - - len_bytes = EvaluationDataset._convert_uint64_ndarray_to_bytes( - np.array([len(data)], dtype="uint64") - ) - md5_gen.update(len_bytes) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) - else: - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - ) - ) - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] - ) - ) + return self._features_data @property - def feature_names(self): - return self._feature_names + def labels_data(self): + """ + return labels data as a numpy array + """ + return self._labels_data @property def name(self): @@ -372,44 +375,17 @@ def name(self): return self._user_specified_name if self._user_specified_name is not None else self.hash @property - def hash(self): + def path(self): """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function + Dataset path """ - import numpy as np - import pandas as pd + return self._path - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - column_names = ",".join(self.data.columns) - meta_str = f"columns={column_names}\nlabels={self.labels}" - md5_gen.update(meta_str.encode("UTF-8")) - - data_for_hash = self.data - - if self._spark_df_type and isinstance(self._original_data, self._spark_df_type): - # For spark dataframe, the Vector type column we need expand it - # into multiple columns, otherwise pandas hash function cannot compute hash - # over it. - transform_func = {} - for field in self._original_data.schema: - if isinstance(field.dataType, self._spark_vector_type): - transform_func[field.name] = lambda x: pd.Series(x.toArray()) - else: - transform_func[field.name] = lambda x: x - - data_for_hash = self.data.transform(transform_func) - - # TODO: - # For array/list type column values in pandas DataFrame, pandas hash function - # also cannot support it, expand them if we need support them. - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data_for_hash) - self._hash = md5_gen.hexdigest() + @property + def hash(self): + """ + Dataset hash, includes hash on first 20 rows and last 20 rows. 
+ """ return self._hash @property diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 0b169e73ae1bd..211b6eea50034 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -608,6 +608,7 @@ def evaluate( **kwargs, ): import matplotlib + with TempDir() as temp_dir, matplotlib.rc_context(_matplotlib_config): self.client = mlflow.tracking.MlflowClient() @@ -631,13 +632,12 @@ def evaluate( self.predict_fn = predict_fn self.predict_proba_fn = predict_proba_fn - X, y = dataset._extract_features_and_labels() - self.X = X - self.y = y + self.X = dataset.features_data + self.y = dataset.labels_data self.metrics = EvaluationMetrics() self.artifacts = {} - infered_model_type = _infer_model_type_by_labels(y) + infered_model_type = _infer_model_type_by_labels(self.y) if model_type != infered_model_type: _logger.warning( diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 75cac13c075a3..0ebd4de8d23a9 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -50,8 +50,8 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): model = mlflow.pyfunc.load_model(linear_regressor_model_uri) - y = diabetes_dataset.labels - y_pred = model.predict(diabetes_dataset.data) + y = diabetes_dataset.labels_data + y_pred = model.predict(diabetes_dataset.features_data) expected_metrics = _get_regressor_metrics(y, y_pred) for metric_key in expected_metrics: @@ -92,9 +92,9 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) - y = iris_dataset.labels - y_pred = predict_fn(iris_dataset.data) - y_probs = predict_proba_fn(iris_dataset.data) + y = iris_dataset.labels_data + y_pred = predict_fn(iris_dataset.features_data) + y_probs = predict_proba_fn(iris_dataset.features_data) expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs, labels=None) @@ -143,9 +143,9 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) - y = breast_cancer_dataset.labels - y_pred = predict_fn(breast_cancer_dataset.data) - y_probs = predict_proba_fn(breast_cancer_dataset.data) + y = breast_cancer_dataset.labels_data + y_pred = predict_fn(breast_cancer_dataset.features_data) + y_probs = predict_proba_fn(breast_cancer_dataset.features_data) expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs, labels=None) @@ -195,7 +195,8 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) - X, y = diabetes_spark_dataset._extract_features_and_labels() + X = diabetes_spark_dataset.features_data + y = diabetes_spark_dataset.labels_data y_pred = model.predict(X) expected_metrics = _get_regressor_metrics(y, y_pred) @@ -232,8 +233,8 @@ def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): model = mlflow.pyfunc.load_model(svm_model_uri) _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) - y = breast_cancer_dataset.labels - y_pred = predict_fn(breast_cancer_dataset.data) + y = breast_cancer_dataset.labels_data + y_pred = 
predict_fn(breast_cancer_dataset.features_data) expected_metrics = _get_classifier_global_metrics(True, y, y_pred, None, labels=None) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 1357efe262f55..1d2da101845e8 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -25,7 +25,7 @@ from mlflow.utils.file_utils import TempDir from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry -from mlflow.models.evaluation.base import _logger as _base_logger +from mlflow.models.evaluation.base import _logger as _base_logger, _gen_md5_for_arraylike_obj from sklearn.metrics import ( accuracy_score, @@ -59,8 +59,7 @@ def get_diabetes_spark_dataset(): rows = [ (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target) ] - - return spark.createDataFrame(rows, ["features", "label"]) + return spark.createDataFrame(spark.sparkContext.parallelize(rows, 1), ["features", "label"]) def get_breast_cancer_dataset(): @@ -192,15 +191,23 @@ def svm_model_uri(): def iris_pandas_df_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - data = pd.DataFrame({"f1": eval_X[:, 0], "f2": eval_X[:, 1], "y": eval_y}) + data = pd.DataFrame( + { + "f1": eval_X[:, 0], + "f2": eval_X[:, 1], + "f2": eval_X[:, 2], + "f3": eval_X[:, 3], + "y": eval_y, + } + ) labels = "y" return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): - y_true = iris_dataset.labels + y_true = iris_dataset.labels_data classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) - y_pred = classifier_model.predict(iris_dataset.data) + y_pred = classifier_model.predict(iris_dataset.features_data) expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { "accuracy_score": expected_accuracy_score, @@ -268,9 +275,9 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): - y_true = diabetes_dataset.labels + y_true = diabetes_dataset.labels_data regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri) - y_pred = regressor_model.predict(diabetes_dataset.data) + y_pred = regressor_model.predict(diabetes_dataset.features_data) expected_mae = mean_absolute_error(y_true, y_pred) expected_mse = mean_squared_error(y_true, y_pred) expected_metrics = { @@ -307,7 +314,7 @@ def test_dataset_name(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): md5_gen = hashlib.md5() - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data) + _gen_md5_for_arraylike_obj(md5_gen, data, None) return md5_gen.hexdigest() list0 = list(range(20)) @@ -322,31 +329,29 @@ def get_md5(data): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): - assert iris_dataset.hash == "827a8427365cafbd9110b1b009d5a80d" - assert iris_pandas_df_dataset.hash == "d06cfb6352dba29afe514d9be87021aa" - assert diabetes_spark_dataset.hash == "a30ebc9899e22ee6e60665f98d4b08b3" + assert iris_dataset.hash == "d4975e40e1443d94f4f8e72c4c7d46d2" + assert iris_pandas_df_dataset.hash == "a0487290fb15889bb13ea75f892eb539" + assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" -def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): - X1, y1 = 
iris_dataset._extract_features_and_labels() - assert np.array_equal(X1, iris_dataset.data) - assert np.array_equal(y1, iris_dataset.labels) +def test_datasset_extract_features_label(): + data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]}) + eval_dataset = EvaluationDataset(data=data, labels="label") - X2, y2 = iris_pandas_df_dataset._extract_features_and_labels() - assert list(X2.columns) == ["f1", "f2"] - assert np.array_equal(X2["f1"], X1[:, 0]) - assert np.array_equal(X2["f2"], X1[:, 1]) - assert np.array_equal(y2, y1) + assert list(eval_dataset.features_data.columns) == ["f1", "f2"] + assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) + assert np.array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4]) + assert np.array_equal(eval_dataset.labels_data, [0, 1]) def test_spark_df_dataset(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): dataset = EvaluationDataset(spark_df, "y") - assert list(dataset.data.columns) == ["f1", "f2", "y"] - assert list(dataset.data["f1"]) == [1.0] * 5 - assert list(dataset.data["f2"]) == [2.0] * 5 - assert list(dataset.data["y"]) == [3.0] * 5 + assert list(dataset.features_data.columns) == ["f1", "f2"] + assert list(dataset.features_data["f1"]) == [1.0] * 5 + assert list(dataset.features_data["f2"]) == [2.0] * 5 + assert list(dataset.labels_data) == [3.0] * 5 def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index bb3c8a8875849..10ec46c63edac 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -44,7 +44,8 @@ def evaluate( self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs ) -> EvaluationResult: client = mlflow.tracking.MlflowClient() - X, y = dataset._extract_features_and_labels() + X = dataset.features_data + y = dataset.labels_data y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred) From 4acf9ea2719e8be9f3878977c057cf84a7b724ef Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 23:25:01 +0800 Subject: [PATCH 110/120] limit user specify shap algos Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 5 +++-- mlflow/models/evaluation/default_evaluator.py | 21 ++++++++++--------- tests/models/test_evaluation.py | 6 +++--- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 0b7845a48324a..65d052b0f8c7e 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -697,8 +697,9 @@ def evaluate( - **log_model_explainability**: A boolean value specifying whether or not to log model explainability insights, default value is True. - **explainability_algorithm**: A string to specify the SHAP Explainer algorithm for model - explainability. If not set, `shap.Explainer` is used with the "auto" algorithm, which - chooses the best Explainer based on the model. + explainability. Supported algorithm includes: 'exact', 'permutation', 'partition'. + If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best + Explainer based on the model. 
- **explainability_nsamples**: The number of sample rows to use for computing model explainability insights. Default value is 2000. - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**: diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 211b6eea50034..ada6397edd268 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -364,17 +364,18 @@ def _log_model_explainability(self): sampled_X = shap.sample(renamed_X, sample_rows) if algorithm: - if algorithm == "sampling": - explainer = shap.explainers.Sampling( - self.predict_fn, renamed_X, feature_names=truncated_feature_names - ) - else: - explainer = shap.Explainer( - self.predict_fn, - sampled_X, - feature_names=truncated_feature_names, - algorithm=algorithm, + supported_algos = ['exact', 'permutation', 'partition'] + if algorithm not in supported_algos: + raise ValueError( + f"Specified explainer algorithm {algorithm} is unsupported. Currently only " + f"support {','.join(supported_algos)} algorithms." ) + explainer = shap.Explainer( + self.predict_fn, + sampled_X, + feature_names=truncated_feature_names, + algorithm=algorithm, + ) else: if self.raw_model and not is_multinomial_classifier: # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 1d2da101845e8..bcc7e851b3268 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -195,8 +195,8 @@ def iris_pandas_df_dataset(): { "f1": eval_X[:, 0], "f2": eval_X[:, 1], - "f2": eval_X[:, 2], - "f3": eval_X[:, 3], + "f3": eval_X[:, 2], + "f4": eval_X[:, 3], "y": eval_y, } ) @@ -330,7 +330,7 @@ def get_md5(data): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): assert iris_dataset.hash == "d4975e40e1443d94f4f8e72c4c7d46d2" - assert iris_pandas_df_dataset.hash == "a0487290fb15889bb13ea75f892eb539" + assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" From 09d484287727d84150e428b57a0e58e378015130 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Wed, 5 Jan 2022 23:45:26 +0800 Subject: [PATCH 111/120] clean Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index ada6397edd268..235a8eac3978d 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -314,7 +314,7 @@ def _log_model_explainability(self): return if self.model_type == "classifier" and not all( - [isinstance(label, numbers.Number) for label in self.label_list] + [isinstance(label, (numbers.Number, np.bool_)) for label in self.label_list] ): _logger.warning( "Skip logging model explainability insights because it requires all label " @@ -355,14 +355,13 @@ def _log_model_explainability(self): f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names) } - if isinstance(self.X, pd.DataFrame): + sampled_X = shap.sample(self.X, sample_rows) + + if isinstance(sampled_X, pd.DataFrame): # For some shap explainer, the plot will use the DataFrame column names instead of # using feature_names argument value. So rename the dataframe column names. 
- renamed_X = self.X.rename(columns=truncated_feature_name_map, copy=False) - else: - renamed_X = self.X + sampled_X = sampled_X.rename(columns=truncated_feature_name_map, copy=False) - sampled_X = shap.sample(renamed_X, sample_rows) if algorithm: supported_algos = ['exact', 'permutation', 'partition'] if algorithm not in supported_algos: @@ -392,10 +391,7 @@ def _log_model_explainability(self): _logger.info(f"Shap explainer {explainer.__class__.__name__} is used.") - if algorithm == "sampling": - shap_values = explainer(renamed_X, sample_rows) - else: - shap_values = explainer(sampled_X) + shap_values = explainer(sampled_X) try: mlflow.shap.log_explainer( From 8e018f475721500891655ff2b81cce4eb8f16841 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 6 Jan 2022 11:09:43 +0800 Subject: [PATCH 112/120] update evaluation dataset Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 50 ++++++++++++------- mlflow/models/evaluation/default_evaluator.py | 2 +- tests/models/test_evaluation.py | 39 ++++++++++++++- 3 files changed, 71 insertions(+), 20 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 65d052b0f8c7e..acdc62019bd32 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -170,7 +170,7 @@ def _hash_ndarray_as_bytes(nd_array): ) + _convert_uint64_ndarray_to_bytes(np.array(nd_array.shape, dtype="uint64")) -def _array_like_obj_to_bytes(data, spark_vector_type): +def _array_like_obj_to_bytes(data): """ Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. @@ -180,6 +180,12 @@ def _array_like_obj_to_bytes(data, spark_vector_type): import pandas as pd if isinstance(data, pd.DataFrame): + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. + if "pyspark" in sys.modules: + from pyspark.ml.linalg import Vector as spark_vector_type + else: + spark_vector_type = None def _hash_array_like_element_as_bytes(v): if spark_vector_type is not None: @@ -201,7 +207,7 @@ def _hash_array_like_element_as_bytes(v): raise ValueError("Unsupported data type.") -def _gen_md5_for_arraylike_obj(md5_gen, data, spark_vector_type): +def _gen_md5_for_arraylike_obj(md5_gen, data): """ Helper method to generate MD5 hash array-like object, the MD5 will calculate over: - array length @@ -213,12 +219,12 @@ def _gen_md5_for_arraylike_obj(md5_gen, data, spark_vector_type): len_bytes = _convert_uint64_ndarray_to_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(_array_like_obj_to_bytes(data, spark_vector_type)) + md5_gen.update(_array_like_obj_to_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] - md5_gen.update(_array_like_obj_to_bytes(head_rows, spark_vector_type)) - md5_gen.update(_array_like_obj_to_bytes(tail_rows, spark_vector_type)) + md5_gen.update(_array_like_obj_to_bytes(head_rows)) + md5_gen.update(_array_like_obj_to_bytes(tail_rows)) class EvaluationDataset: @@ -273,15 +279,12 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): # run code not related to pyspark. 
if "pyspark" in sys.modules: from pyspark.sql import DataFrame as SparkDataFrame - from pyspark.ml.linalg import Vector as SparkMLVector supported_dataframe_types = (pd.DataFrame, SparkDataFrame) spark_df_type = SparkDataFrame - spark_vector_type = SparkMLVector else: supported_dataframe_types = (pd.DataFrame,) spark_df_type = None - spark_vector_type = None except ImportError: supported_dataframe_types = (pd.DataFrame,) @@ -294,12 +297,23 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): self._features_data = data self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) if isinstance(data, list): - num_features = len(data[0]) - else: - if len(data.shape) > 1: - num_features = data.shape[1] - else: - num_features = len(data[0]) + data = np.array(data) + + if len(data.shape) != 2: + raise ValueError( + 'If the `data` argument is a numpy array, it must be a 2 dimension array ' + 'and second dimension represent the number of features. If the `data` ' + 'argument is a list, each of its element must be a feature array of ' + 'numpy array or list and all element must has the same length.' + ) + + if len(self._features_data) != len(self._labels_data): + raise ValueError( + 'The input features example rows must be the same length with labels array.' + ) + + num_features = data.shape[1] + if feature_names is not None: feature_names = list(feature_names) if num_features != len(feature_names): @@ -307,7 +321,7 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): self._feature_names = feature_names else: self._feature_names = [ - f"feature_{str(i).zfill(math.ceil((math.log10(num_features))))}" + f"feature_{str(i + 1).zfill(math.ceil((math.log10(num_features + 1))))}" for i in range(num_features) ] elif isinstance(data, supported_dataframe_types): @@ -327,6 +341,7 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): self._features_data = data.drop(labels, axis=1, inplace=False) self._labels_data = data[labels].to_numpy() + if feature_names is not None: raise ValueError( "If `data` argument is pandas/spark dataframe, you cannot specify the " @@ -340,10 +355,11 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): "spark DataFrame if pyspark package installed." 
) + # generate dataset hash md5_gen = hashlib.md5() - _gen_md5_for_arraylike_obj(md5_gen, self._features_data, spark_vector_type) - _gen_md5_for_arraylike_obj(md5_gen, self._labels_data, spark_vector_type) + _gen_md5_for_arraylike_obj(md5_gen, self._features_data) + _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) md5_gen.update(",".join(self._feature_names).encode("UTF-8")) self._hash = md5_gen.hexdigest() diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 235a8eac3978d..b06c7f2b357a7 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -349,7 +349,7 @@ def _log_model_explainability(self): for i, truncated_name in enumerate(truncated_feature_names): if truncated_name != self.feature_names[i]: # For duplicated truncated name, attach "(f_{feature_index})" at the end - truncated_feature_names[i] = f"{truncated_name}(f_{i})" + truncated_feature_names[i] = f"{truncated_name}(f_{i + 1})" truncated_feature_name_map = { f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index bcc7e851b3268..a59df9f7ac196 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -314,7 +314,7 @@ def test_dataset_name(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): md5_gen = hashlib.md5() - _gen_md5_for_arraylike_obj(md5_gen, data, None) + _gen_md5_for_arraylike_obj(md5_gen, data) return md5_gen.hexdigest() list0 = list(range(20)) @@ -334,7 +334,7 @@ def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_datas assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" -def test_datasset_extract_features_label(): +def test_datasset_with_pandas_dataframe(): data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]}) eval_dataset = EvaluationDataset(data=data, labels="label") @@ -344,6 +344,41 @@ def test_datasset_extract_features_label(): assert np.array_equal(eval_dataset.labels_data, [0, 1]) +def test_datasset_with_array_data(): + features = [[1, 2], [3, 4]] + labels = [0, 1] + + for input_data in [features, np.array(features)]: + eval_dataset1 = EvaluationDataset(data=input_data, labels=labels) + assert np.array_equal(eval_dataset1.features_data, features) + assert np.array_equal(eval_dataset1.labels_data, labels) + assert list(eval_dataset1.feature_names) == ['feature_1', 'feature_2'] + + with pytest.raises(ValueError, match='all element must has the same length'): + EvaluationDataset(data=[[1, 2], [3, 4, 5]], labels=labels) + + +def test_autogen_feature_names(): + labels = [0] + eval_dataset2 = EvaluationDataset(data=[list(range(9))], labels=labels) + assert eval_dataset2.feature_names == [f'feature_{i + 1}' for i in range(9)] + + eval_dataset2 = EvaluationDataset(data=[list(range(10))], labels=labels) + assert eval_dataset2.feature_names == [f'feature_{i + 1:02d}' for i in range(10)] + + eval_dataset2 = EvaluationDataset(data=[list(range(99))], labels=labels) + assert eval_dataset2.feature_names == [f'feature_{i + 1:02d}' for i in range(99)] + + eval_dataset2 = EvaluationDataset(data=[list(range(100))], labels=labels) + assert eval_dataset2.feature_names == [f'feature_{i + 1:03d}' for i in range(100)] + + with pytest.raises( + ValueError, + match='features example rows must be the same length with labels array' + ): + EvaluationDataset(data=[[1, 2], [3, 4]], labels=[1, 2, 3]) + + def 
test_spark_df_dataset(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): From 985447a81d2ec41e6a525d654b6e6d9b759b0792 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 6 Jan 2022 18:06:48 +0800 Subject: [PATCH 113/120] use svg fig Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 10 +++++----- mlflow/models/evaluation/base.py | 16 ++++++++-------- mlflow/models/evaluation/default_evaluator.py | 10 +++++----- tests/models/test_evaluation.py | 17 ++++++++--------- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py index 6343d6e15aa48..509e2d71433d5 100644 --- a/mlflow/models/evaluation/artifacts.py +++ b/mlflow/models/evaluation/artifacts.py @@ -3,14 +3,14 @@ from mlflow.models.evaluation.base import EvaluationArtifact -class ImageEvaluationArtifact(EvaluationArtifact): +class BinaryFileEvaluationArtifact(EvaluationArtifact): def save(self, output_artifact_path): - self._content.save(output_artifact_path) + with open(output_artifact_path, 'wb') as f: + f.write(self._content) def _load_content_from_file(self, local_artifact_path): - from PIL.Image import open as open_image - - self._content = open_image(local_artifact_path) + with open(local_artifact_path, 'rb') as f: + self._content = f.read(-1) return self._content diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index acdc62019bd32..b9a6517ff795b 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -294,22 +294,23 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): "If data is a numpy array or list of evaluation features, " "labels must be a numpy array or list of evaluation labels" ) - self._features_data = data - self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) if isinstance(data, list): data = np.array(data) if len(data.shape) != 2: raise ValueError( - 'If the `data` argument is a numpy array, it must be a 2 dimension array ' - 'and second dimension represent the number of features. If the `data` ' - 'argument is a list, each of its element must be a feature array of ' - 'numpy array or list and all element must has the same length.' + "If the `data` argument is a numpy array, it must be a 2 dimension array " + "and second dimension represent the number of features. If the `data` " + "argument is a list, each of its element must be a feature array of " + "numpy array or list and all element must has the same length." ) + self._features_data = data + self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) + if len(self._features_data) != len(self._labels_data): raise ValueError( - 'The input features example rows must be the same length with labels array.' + "The input features example rows must be the same length with labels array." ) num_features = data.shape[1] @@ -355,7 +356,6 @@ def __init__(self, data, labels, name=None, path=None, feature_names=None): "spark DataFrame if pyspark package installed." 
) - # generate dataset hash md5_gen = hashlib.md5() _gen_md5_for_arraylike_obj(md5_gen, self._features_data) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index b06c7f2b357a7..d2763d585397c 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -8,7 +8,7 @@ from mlflow.utils.file_utils import TempDir from mlflow.utils.string_utils import truncate_str_from_middle from mlflow.models.utils import plot_lines -from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact, CsvEvaluationArtifact +from mlflow.models.evaluation.artifacts import BinaryFileEvaluationArtifact, CsvEvaluationArtifact from sklearn import metrics as sk_metrics import math @@ -270,18 +270,18 @@ def _log_image_artifact( ): import matplotlib.pyplot as pyplot - artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png" + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".svg" artifact_file_local_path = self.temp_dir.path(artifact_file_name) try: pyplot.clf() do_plot() - pyplot.savefig(artifact_file_local_path) + pyplot.savefig(artifact_file_local_path, format='svg') finally: pyplot.close(pyplot.gcf()) mlflow.log_artifact(artifact_file_local_path) - artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) + artifact = BinaryFileEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) artifact.load(artifact_file_local_path) self.artifacts[artifact_name] = artifact @@ -363,7 +363,7 @@ def _log_model_explainability(self): sampled_X = sampled_X.rename(columns=truncated_feature_name_map, copy=False) if algorithm: - supported_algos = ['exact', 'permutation', 'partition'] + supported_algos = ["exact", "permutation", "partition"] if algorithm not in supported_algos: raise ValueError( f"Specified explainer algorithm {algorithm} is unsupported. 
Currently only " diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index a59df9f7ac196..bc283d016ebc0 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -331,7 +331,7 @@ def get_md5(data): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): assert iris_dataset.hash == "d4975e40e1443d94f4f8e72c4c7d46d2" assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" - assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" + assert diabetes_spark_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" def test_datasset_with_pandas_dataframe(): @@ -352,29 +352,28 @@ def test_datasset_with_array_data(): eval_dataset1 = EvaluationDataset(data=input_data, labels=labels) assert np.array_equal(eval_dataset1.features_data, features) assert np.array_equal(eval_dataset1.labels_data, labels) - assert list(eval_dataset1.feature_names) == ['feature_1', 'feature_2'] + assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] - with pytest.raises(ValueError, match='all element must has the same length'): + with pytest.raises(ValueError, match="all element must has the same length"): EvaluationDataset(data=[[1, 2], [3, 4, 5]], labels=labels) def test_autogen_feature_names(): labels = [0] eval_dataset2 = EvaluationDataset(data=[list(range(9))], labels=labels) - assert eval_dataset2.feature_names == [f'feature_{i + 1}' for i in range(9)] + assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] eval_dataset2 = EvaluationDataset(data=[list(range(10))], labels=labels) - assert eval_dataset2.feature_names == [f'feature_{i + 1:02d}' for i in range(10)] + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] eval_dataset2 = EvaluationDataset(data=[list(range(99))], labels=labels) - assert eval_dataset2.feature_names == [f'feature_{i + 1:02d}' for i in range(99)] + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] eval_dataset2 = EvaluationDataset(data=[list(range(100))], labels=labels) - assert eval_dataset2.feature_names == [f'feature_{i + 1:03d}' for i in range(100)] + assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] with pytest.raises( - ValueError, - match='features example rows must be the same length with labels array' + ValueError, match="features example rows must be the same length with labels array" ): EvaluationDataset(data=[[1, 2], [3, 4]], labels=[1, 2, 3]) From 57dec63e042104655710fbe9c9236493b16ae22e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 6 Jan 2022 21:28:56 +0800 Subject: [PATCH 114/120] revert svg Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 10 +++++----- mlflow/models/evaluation/default_evaluator.py | 8 ++++---- mlflow/models/evaluation/lift_curve.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py index 509e2d71433d5..6343d6e15aa48 100644 --- a/mlflow/models/evaluation/artifacts.py +++ b/mlflow/models/evaluation/artifacts.py @@ -3,14 +3,14 @@ from mlflow.models.evaluation.base import EvaluationArtifact -class BinaryFileEvaluationArtifact(EvaluationArtifact): +class ImageEvaluationArtifact(EvaluationArtifact): def save(self, output_artifact_path): - with open(output_artifact_path, 'wb') as f: - f.write(self._content) + self._content.save(output_artifact_path) def _load_content_from_file(self, local_artifact_path): - 
with open(local_artifact_path, 'rb') as f: - self._content = f.read(-1) + from PIL.Image import open as open_image + + self._content = open_image(local_artifact_path) return self._content diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d2763d585397c..d129d4205f54a 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -8,7 +8,7 @@ from mlflow.utils.file_utils import TempDir from mlflow.utils.string_utils import truncate_str_from_middle from mlflow.models.utils import plot_lines -from mlflow.models.evaluation.artifacts import BinaryFileEvaluationArtifact, CsvEvaluationArtifact +from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact, CsvEvaluationArtifact from sklearn import metrics as sk_metrics import math @@ -270,18 +270,18 @@ def _log_image_artifact( ): import matplotlib.pyplot as pyplot - artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".svg" + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png" artifact_file_local_path = self.temp_dir.path(artifact_file_name) try: pyplot.clf() do_plot() - pyplot.savefig(artifact_file_local_path, format='svg') + pyplot.savefig(artifact_file_local_path) finally: pyplot.close(pyplot.gcf()) mlflow.log_artifact(artifact_file_local_path) - artifact = BinaryFileEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) + artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) artifact.load(artifact_file_local_path) self.artifacts[artifact_name] = artifact diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py index 168cf1849e5ed..cbcba712a9329 100644 --- a/mlflow/models/evaluation/lift_curve.py +++ b/mlflow/models/evaluation/lift_curve.py @@ -160,6 +160,6 @@ def plot_lift_curve( ax.set_ylabel("Lift", fontsize=text_fontsize) ax.tick_params(labelsize=text_fontsize) ax.grid("on") - ax.legend(loc="lower right", fontsize=text_fontsize) + ax.legend(loc="best", fontsize=text_fontsize) return ax From c4973815780d29b992bc9ad9c1d03c90fb210d3e Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Thu, 6 Jan 2022 22:41:29 +0800 Subject: [PATCH 115/120] curve dashline, legend display ap/roc, legend move out Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 18 +++--- mlflow/models/evaluation/default_evaluator.py | 55 ++++++++++++------- mlflow/models/utils.py | 2 +- tests/models/test_evaluation.py | 4 +- 4 files changed, 48 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b9a6517ff795b..523835f0a3439 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -155,7 +155,7 @@ def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: _cached_mlflow_client = None -def _convert_uint64_ndarray_to_bytes(array): +def _hash_uint64_ndarray_as_bytes(array): assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings return struct.pack(f">{array.size}Q", *array) @@ -165,12 +165,12 @@ def _hash_ndarray_as_bytes(nd_array): from pandas.util import hash_array import numpy as np - return _convert_uint64_ndarray_to_bytes( + return _hash_uint64_ndarray_as_bytes( hash_array(nd_array.flatten(order="C")) - ) + _convert_uint64_ndarray_to_bytes(np.array(nd_array.shape, dtype="uint64")) + ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, 
dtype="uint64")) -def _array_like_obj_to_bytes(data): +def _hash_array_like_obj_as_bytes(data): """ Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. @@ -198,7 +198,7 @@ def _hash_array_like_element_as_bytes(v): return v data = data.applymap(_hash_array_like_element_as_bytes) - return _convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) + return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data)) elif isinstance(data, np.ndarray): return _hash_ndarray_as_bytes(data) elif isinstance(data, list): @@ -216,15 +216,15 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): """ import numpy as np - len_bytes = _convert_uint64_ndarray_to_bytes(np.array([len(data)], dtype="uint64")) + len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(_array_like_obj_to_bytes(data)) + md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] - md5_gen.update(_array_like_obj_to_bytes(head_rows)) - md5_gen.update(_array_like_obj_to_bytes(tail_rows)) + md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) + md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) class EvaluationDataset: diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index d129d4205f54a..0703b5df4ef4a 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -181,30 +181,27 @@ def _gen_classifier_curve( """ if curve_type == "roc": - def gen_x_y_thresholds_fn(_y, _y_prob): - fpr, tpr, _thresholds = sk_metrics.roc_curve(_y, _y_prob) - return fpr, tpr, _thresholds + def gen_line_x_y_label_fn(_y, _y_prob): + fpr, tpr, _ = sk_metrics.roc_curve(_y, _y_prob) + auc = sk_metrics.auc(fpr, tpr) + return fpr, tpr, f"AUC={auc:.3f}" xlabel = "False Positive Rate" ylabel = "True Positive Rate" - legend_loc = "lower right" elif curve_type == "pr": - - def gen_x_y_thresholds_fn(_y, _y_prob): + def gen_line_x_y_label_fn(_y, _y_prob): precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) - _thresholds = np.append(_thresholds, [1.0], axis=0) - return recall, precision, _thresholds + ap = np.mean(precision) + return recall, precision, f"AP={ap:.3f}" xlabel = "recall" ylabel = "precision" - legend_loc = "lower left" else: assert False, "illegal curve type" if is_binomial: - x_data, y_data, thresholds = gen_x_y_thresholds_fn(y, y_probs) - data_series = [("positive class", x_data, y_data)] - legend_loc = None + x_data, y_data, line_label = gen_line_x_y_label_fn(y, y_probs) + data_series = [(line_label, x_data, y_data)] auc = sk_metrics.auc(x_data, y_data) else: curve_list = [] @@ -213,21 +210,41 @@ def gen_x_y_thresholds_fn(_y, _y_prob): positive_class_index, positive_class, y, None, y_probs ) - x_data, y_data, thresholds = gen_x_y_thresholds_fn(y_bin, y_prob_bin) - curve_list.append((positive_class, x_data, y_data, thresholds)) + x_data, y_data, line_label = gen_line_x_y_label_fn(y_bin, y_prob_bin) + curve_list.append((positive_class, x_data, y_data, line_label)) data_series = [ - (f"Positive Class = {positive_class}", x_data, y_data) - for positive_class, x_data, y_data, _ in curve_list + (f"label={positive_class},{line_label}", x_data, y_data) + for positive_class, x_data, y_data, line_label in curve_list ] auc = 
[sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list] + + def _do_plot(**kwargs): + import matplotlib.pyplot as pyplot + _, ax = plot_lines(**kwargs) + dash_line_args = { + 'color': 'gray', + 'alpha': 0.3, + 'drawstyle': 'default', + 'linestyle': 'dashed', + } + if curve_type == 'pr': + ax.plot([0, 1], [1, 0], **dash_line_args) + elif curve_type == 'roc': + ax.plot([0, 1], [0, 1], **dash_line_args) + + if is_binomial: + ax.legend(loc="best") + else: + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + pyplot.subplots_adjust(right=0.6, bottom=0.25) + return _Curve( - plot_fn=plot_lines, + plot_fn=_do_plot, plot_fn_args={ "data_series": data_series, "xlabel": xlabel, "ylabel": ylabel, - "legend_loc": legend_loc, "line_kwargs": {"drawstyle": "steps-post"}, }, auc=auc, @@ -235,7 +252,7 @@ def gen_x_y_thresholds_fn(_y, _y_prob): _matplotlib_config = { - "figure.dpi": 144, + "figure.dpi": 288, "figure.figsize": [6.0, 4.0], } diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index 39b30c98db229..d1cd730192990 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -252,4 +252,4 @@ def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs=None): ax.set(xlabel=xlabel, ylabel=ylabel) - return fig + return fig, ax diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index bc283d016ebc0..db1ac77b9f454 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -329,9 +329,9 @@ def get_md5(data): def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): - assert iris_dataset.hash == "d4975e40e1443d94f4f8e72c4c7d46d2" + assert iris_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" - assert diabetes_spark_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" + assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" def test_datasset_with_pandas_dataframe(): From 9707f7482924b584483b1ab05793cbe343c804e6 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Fri, 7 Jan 2022 00:09:17 +0800 Subject: [PATCH 116/120] linewidth 1 Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 0703b5df4ef4a..e2b4fde638b74 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -245,7 +245,7 @@ def _do_plot(**kwargs): "data_series": data_series, "xlabel": xlabel, "ylabel": ylabel, - "line_kwargs": {"drawstyle": "steps-post"}, + "line_kwargs": {"drawstyle": "steps-post", "linewidth": 1}, }, auc=auc, ) From 21411b37872c548d08727e711aa219148669beb2 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Sun, 9 Jan 2022 21:56:13 +0800 Subject: [PATCH 117/120] keyword arguments for evaluate, fix tests Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 15 +++++---- mlflow/models/evaluation/default_evaluator.py | 16 ++++++---- tests/models/test_default_evaluator.py | 32 ++++++++----------- tests/models/test_evaluation.py | 30 ++++++++--------- 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 523835f0a3439..2387a45415378 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -601,7 +601,7 @@ def _get_last_failed_evaluator(): def _evaluate( - 
model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map + *, model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map ): """ The public API "evaluate" will verify argument first, and then pass normalized arguments @@ -650,6 +650,7 @@ def _evaluate( @experimental def evaluate( + *, model: Union[str, "mlflow.pyfunc.PyFuncModel"], model_type: str, dataset: "mlflow.models.evaluation.EvaluationDataset", @@ -767,10 +768,10 @@ def evaluate( with _start_run_or_reuse_active_run(run_id) as actual_run_id: return _evaluate( - model, - model_type, - dataset, - actual_run_id, - evaluator_name_list, - evaluator_name_to_conf_map, + model=model, + model_type=model_type, + dataset=dataset, + actual_run_id=actual_run_id, + evaluator_name_list=evaluator_name_list, + evaluator_name_to_conf_map=evaluator_name_to_conf_map, ) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index e2b4fde638b74..f83dfb8a33a72 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -189,6 +189,7 @@ def gen_line_x_y_label_fn(_y, _y_prob): xlabel = "False Positive Rate" ylabel = "True Positive Rate" elif curve_type == "pr": + def gen_line_x_y_label_fn(_y, _y_prob): precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) ap = np.mean(precision) @@ -221,22 +222,23 @@ def gen_line_x_y_label_fn(_y, _y_prob): def _do_plot(**kwargs): import matplotlib.pyplot as pyplot + _, ax = plot_lines(**kwargs) dash_line_args = { - 'color': 'gray', - 'alpha': 0.3, - 'drawstyle': 'default', - 'linestyle': 'dashed', + "color": "gray", + "alpha": 0.3, + "drawstyle": "default", + "linestyle": "dashed", } - if curve_type == 'pr': + if curve_type == "pr": ax.plot([0, 1], [1, 0], **dash_line_args) - elif curve_type == 'roc': + elif curve_type == "roc": ax.plot([0, 1], [0, 1], **dash_line_args) if is_binomial: ax.legend(loc="best") else: - ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) + ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) pyplot.subplots_adjust(right=0.6, bottom=0.25) return _Curve( diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py index 0ebd4de8d23a9..872f599ecf717 100644 --- a/tests/models/test_default_evaluator.py +++ b/tests/models/test_default_evaluator.py @@ -40,7 +40,7 @@ def assert_dict_equal(d1, d2, rtol): def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): with mlflow.start_run() as run: result = evaluate( - linear_regressor_model_uri, + model=linear_regressor_model_uri, model_type="regressor", dataset=diabetes_dataset, evaluators="default", @@ -81,7 +81,7 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset): with mlflow.start_run() as run: result = evaluate( - multiclass_logistic_regressor_model_uri, + model=multiclass_logistic_regressor_model_uri, model_type="classifier", dataset=iris_dataset, evaluators="default", @@ -132,7 +132,7 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: result = evaluate( - binary_logistic_regressor_model_uri, + model=binary_logistic_regressor_model_uri, model_type="classifier", dataset=breast_cancer_dataset, evaluators="default", @@ -184,7 +184,7 
@@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset): with mlflow.start_run() as run: result = evaluate( - spark_linear_regressor_model_uri, + model=spark_linear_regressor_model_uri, model_type="regressor", dataset=diabetes_spark_dataset, evaluators="default", @@ -222,7 +222,7 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): with mlflow.start_run() as run: result = evaluate( - svm_model_uri, + model=svm_model_uri, model_type="classifier", dataset=breast_cancer_dataset, evaluators="default", @@ -384,7 +384,6 @@ def test_gen_binary_precision_recall_curve(): results = _gen_classifier_curve( is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="pr" ) - assert results.plot_fn is plot_lines assert np.allclose( results.plot_fn_args["data_series"][0][1], np.array([1.0, 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.0]), @@ -397,8 +396,7 @@ def test_gen_binary_precision_recall_curve(): ) assert results.plot_fn_args["xlabel"] == "recall" assert results.plot_fn_args["ylabel"] == "precision" - assert results.plot_fn_args["legend_loc"] is None - assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} assert np.isclose(results.auc, 0.7088888888888889, rtol=1e-3) @@ -409,7 +407,6 @@ def test_gen_binary_roc_curve(): results = _gen_classifier_curve( is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="roc" ) - assert results.plot_fn is plot_lines assert np.allclose( results.plot_fn_args["data_series"][0][1], np.array([0.0, 0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0]), @@ -422,8 +419,7 @@ def test_gen_binary_roc_curve(): ) assert results.plot_fn_args["xlabel"] == "False Positive Rate" assert results.plot_fn_args["ylabel"] == "True Positive Rate" - assert results.plot_fn_args["legend_loc"] is None - assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} assert np.isclose(results.auc, 0.66, rtol=1e-3) @@ -446,15 +442,15 @@ def test_gen_multiclass_precision_recall_curve(): [0.66666667, 0.5, 1.0], [0.4, 0.25, 0.33333333, 0.5, 0.0, 1.0], ] + line_labels = ["label=0,AP=0.500", "label=1,AP=0.722", "label=2,AP=0.414"] for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): - assert name == f"Positive Class = {index}" + assert name == line_labels[index] assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) assert results.plot_fn_args["xlabel"] == "recall" assert results.plot_fn_args["ylabel"] == "precision" - assert results.plot_fn_args["legend_loc"] == "lower left" - assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} expected_auc = [0.25, 0.6666666666666666, 0.2875] assert np.allclose(results.auc, expected_auc, rtol=1e-3) @@ -481,15 +477,15 @@ def test_gen_multiclass_roc_curve(): [0.0, 0.33333333, 0.33333333, 1.0, 1.0], ] expected_y_data_list = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5, 1.0]] + line_labels = ["label=0,AUC=0.750", "label=1,AUC=0.750", "label=2,AUC=0.333"] for index, (name, x_data, y_data) in 
enumerate(results.plot_fn_args["data_series"]): - assert name == f"Positive Class = {index}" + assert name == line_labels[index] assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) assert results.plot_fn_args["xlabel"] == "False Positive Rate" assert results.plot_fn_args["ylabel"] == "True Positive Rate" - assert results.plot_fn_args["legend_loc"] == "lower right" - assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post"} + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} - expected_auc = [0.75, 0.7500000000000001, 0.33333333333333337] + expected_auc = [0.75, 0.75, 0.3333] assert np.allclose(results.auc, expected_auc, rtol=1e-3) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index db1ac77b9f454..9048da65a122c 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -220,9 +220,9 @@ def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_datas with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -292,9 +292,9 @@ def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): for model in [regressor_model, linear_regressor_model_uri]: with mlflow.start_run() as run: eval_result = evaluate( - model, - "regressor", - diabetes_dataset, + model=model, + model_type="regressor", + dataset=diabetes_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -472,9 +472,9 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas match="The model could not be evaluated by any of the registered evaluators", ): evaluate( - multiclass_logistic_regressor_model_uri, - "classifier", - iris_dataset, + model=multiclass_logistic_regressor_model_uri, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -489,9 +489,9 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -536,9 +536,9 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators=evaluators, evaluator_config={ From 84c539891a3d361847106ec2cfbe11a2b032ee5b Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 10 Jan 2022 18:01:35 +0800 Subject: [PATCH 118/120] mark abc.abstractmethod, kw args for ModelEvaluator methods Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 35 ++++++++++++++------------ tests/models/test_evaluation.py | 43 +++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 2387a45415378..b5538f4488b63 100644 --- 
a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -16,6 +16,7 @@ import sys import math from collections import OrderedDict +from abc import ABCMeta, abstractmethod _logger = logging.getLogger(__name__) @@ -29,7 +30,7 @@ class EvaluationMetrics(dict): pass -class EvaluationArtifact: +class EvaluationArtifact(metaclass=ABCMeta): """ A model evaluation artifact containing an artifact uri and content. """ @@ -38,12 +39,13 @@ def __init__(self, uri, content=None): self._uri = uri self._content = content + @abstractmethod def _load_content_from_file(self, local_artifact_path): """ Abstract interface to load the content from local artifact file path, and return the loaded content. """ - raise NotImplementedError() + pass def load(self, local_artifact_path=None): """ @@ -61,9 +63,10 @@ def load(self, local_artifact_path=None): self._content = self._load_content_from_file(local_artifact_file) return self._content + @abstractmethod def save(self, output_artifact_path): """Save artifact content into specified path.""" - raise NotImplementedError() + pass @property def content(self): @@ -444,8 +447,9 @@ def _log_dataset_tag(self, client, run_id, model_uuid): ) -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: +class ModelEvaluator(metaclass=ABCMeta): + @abstractmethod + def can_evaluate(self, *, model_type, evaluator_config=None, **kwargs) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). @@ -459,15 +463,8 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() - def evaluate( - self, - model, - model_type, - dataset, - run_id, - evaluator_config, - **kwargs, - ): + @abstractmethod + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): """ The abstract API to log metrics and artifacts, and return evaluation results. 
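With can_evaluate and evaluate now declared as @abstractmethod and keyword-only, a third-party evaluator must subclass ModelEvaluator and accept exactly the keyword arguments that _evaluate forwards (shown in the next hunk). The sketch below is illustrative and not part of this patch: the class name is invented, the import path and the EvaluationResult(metrics=..., artifacts=...) constructor are assumptions based on the surrounding hunks.

    import mlflow
    from sklearn import metrics as sk_metrics
    from mlflow.models.evaluation.base import (  # import path assumed
        EvaluationMetrics,
        EvaluationResult,
        ModelEvaluator,
    )


    class RmseOnlyEvaluator(ModelEvaluator):
        # Keyword-only signatures matching the abstract interface above.
        def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool:
            return model_type == "regressor"

        def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs):
            # features_data / labels_data are the dataset properties introduced
            # earlier in this series.
            y_pred = model.predict(dataset.features_data)
            rmse = sk_metrics.mean_squared_error(dataset.labels_data, y_pred, squared=False)
            mlflow.tracking.MlflowClient().log_metric(run_id, "rmse", rmse)
            # EvaluationResult construction is assumed here; these hunks only show
            # it as a return type.
            return EvaluationResult(metrics=EvaluationMetrics(rmse=rmse), artifacts={})

A real plugin would additionally be registered with the evaluator registry (mlflow/models/evaluation/evaluator_registry.py, imported by the tests in this series) so that _evaluate can look it up by name.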
@@ -627,9 +624,15 @@ def _evaluate( continue _last_failed_evaluator = evaluator_name - if evaluator.can_evaluate(model_type, config): + if evaluator.can_evaluate(model_type=model_type, evaluator_config=config): _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") - result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) + result = evaluator.evaluate( + model=model, + model_type=model_type, + dataset=dataset, + run_id=actual_run_id, + evaluator_config=config, + ) eval_results.append(result) _last_failed_evaluator = None diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 9048da65a122c..3c4b924bd07da 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -479,7 +479,9 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas evaluators="test_evaluator1", evaluator_config=evaluator1_config, ) - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate.assert_not_called() with mock.patch.object( FakeEvauator1, "can_evaluate", return_value=True @@ -499,9 +501,15 @@ def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_datas assert eval1_result.metrics == evaluator1_return_value.metrics assert eval1_result.artifacts == evaluator1_return_value.artifacts - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate.assert_called_once_with( - classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator1_config + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, ) @@ -554,21 +562,26 @@ def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, **evaluator1_return_value.artifacts, **evaluator2_return_value.artifacts, } - mock_can_evaluate1.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate1.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate1.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator1_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, + ) + mock_can_evaluate2.assert_called_once_with( + model_type="classifier", + evaluator_config=evaluator2_config, ) - mock_can_evaluate2.assert_called_once_with("classifier", evaluator2_config) mock_evaluate2.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator2_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator2_config, ) From 464118353774c44c95aaf070cc7b8f4782dcff15 Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 10 Jan 2022 18:19:12 +0800 Subject: [PATCH 119/120] fix pylint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/base.py | 2 +- mlflow/models/evaluation/default_evaluator.py | 3 ++- .../mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b5538f4488b63..074da7537e7d6 
100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -449,7 +449,7 @@ def _log_dataset_tag(self, client, run_id, model_uuid): class ModelEvaluator(metaclass=ABCMeta): @abstractmethod - def can_evaluate(self, *, model_type, evaluator_config=None, **kwargs) -> bool: + def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index f83dfb8a33a72..0fe4dc499558a 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -261,7 +261,7 @@ def _do_plot(**kwargs): # pylint: disable=attribute-defined-outside-init class DefaultEvaluator(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] def _log_metrics(self): @@ -616,6 +616,7 @@ def _evaluate_regressor(self): def evaluate( self, + *, model: "mlflow.pyfunc.PyFuncModel", model_type, dataset, diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index 10ec46c63edac..cea1770bc443d 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -23,7 +23,7 @@ def _load_content_from_file(self, local_artifact_path): class DummyEvaluator(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] def _log_metrics(self, run_id, metrics, dataset_name): @@ -41,7 +41,7 @@ def _log_metrics(self, run_id, metrics, dataset_name): ) def evaluate( - self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs + self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs ) -> EvaluationResult: client = mlflow.tracking.MlflowClient() X = dataset.features_data From f561e4f8c11ad6ed8d996578ab7e8e322328a0fd Mon Sep 17 00:00:00 2001 From: Weichen Xu Date: Mon, 10 Jan 2022 18:42:34 +0800 Subject: [PATCH 120/120] fix pylint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/default_evaluator.py | 1 + tests/models/test_evaluation.py | 8 ++++---- .../mlflow_test_plugin/dummy_evaluator.py | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 0fe4dc499558a..6ee604aed8988 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -261,6 +261,7 @@ def _do_plot(**kwargs): # pylint: disable=attribute-defined-outside-init class DefaultEvaluator(ModelEvaluator): + # pylint: disable=unused-argument def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 3c4b924bd07da..0555ca82a9a47 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -421,18 +421,18 @@ def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): class FakeEvauator1(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, 
**kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() class FakeEvauator2(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index cea1770bc443d..c88bd21d09321 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -23,6 +23,7 @@ def _load_content_from_file(self, local_artifact_path): class DummyEvaluator(ModelEvaluator): + # pylint: disable=unused-argument def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] @@ -40,6 +41,7 @@ def _log_metrics(self, run_id, metrics, dataset_name): ], ) + # pylint: disable=unused-argument def evaluate( self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs ) -> EvaluationResult:
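Taken together, the last few patches leave the public evaluate() API callable with keyword arguments only and restrict a user-specified SHAP algorithm to 'exact', 'permutation', or 'partition'. The end-to-end sketch below mirrors the updated test call sites; the model URI and dataset values are illustrative, and the top-level import path is an assumption rather than something shown in these diffs.

    import mlflow
    from sklearn.datasets import load_iris
    from mlflow.models.evaluation import EvaluationDataset, evaluate  # import path assumed

    X, y = load_iris(return_X_y=True)
    dataset = EvaluationDataset(data=X, labels=y, name="iris")

    with mlflow.start_run():
        result = evaluate(
            model="models:/my_classifier/1",  # illustrative model URI
            model_type="classifier",
            dataset=dataset,
            run_id=None,  # log to the active run
            evaluators="default",
            evaluator_config={
                "log_model_explainability": True,
                # Must be one of the algorithms allowed by PATCH 110.
                "explainability_algorithm": "permutation",
                "explainability_nsamples": 1000,
            },
        )
        print(result.metrics)

Because evaluate() is declared with a bare *, positional calls now raise a TypeError, which is why every reworked call site in tests/models/test_evaluation.py and tests/models/test_default_evaluator.py passes its arguments by keyword.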