From 964f5ab75098c55f028f8acfeeae05df35ea68d5 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Mon, 10 Jan 2022 20:17:19 +0800 Subject: [PATCH] Evaluation Default evaluator (#5092) * init Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * rename module Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * revert black change Signed-off-by: Weichen Xu * change module path Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * lazy load pyspark Signed-off-by: Weichen Xu * revert export Signed-off-by: Weichen Xu * fix curcit import Signed-off-by: Weichen Xu * update tests Signed-off-by: Weichen Xu * fix conftest.py Signed-off-by: Weichen Xu * Revert "fix conftest.py" This reverts commit 2ea29c62bfffc5461bf77f3da15b5c00f51de19b. * fix tests Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * default evaluator Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * update import Signed-off-by: Weichen Xu * fix doc Signed-off-by: Weichen Xu * update hash algo Signed-off-by: Weichen Xu * update import Signed-off-by: Weichen Xu * address comment Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * fix lint Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * add more tests Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * fix lint Signed-off-by: Weichen Xu * update shap explainer Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * remove scikitplot dep Signed-off-by: Weichen Xu * add pr curve Signed-off-by: Weichen Xu * add shap.summary_plot Signed-off-by: Weichen Xu * log explainer Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * improve explainer code Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * update shap init Signed-off-by: Weichen Xu * update explainer creating Signed-off-by: Weichen Xu * update predict_proba Signed-off-by: Weichen Xu * address comments Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * add multi-class metrics artifacts Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * add log_loss metric Signed-off-by: Weichen Xu * lazy load pyspark Signed-off-by: Weichen Xu 
* address ben comments Signed-off-by: Weichen Xu * fix Signed-off-by: Weichen Xu * prevent show shap logo, add tests Signed-off-by: Weichen Xu * support spark model Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * add shap version check Signed-off-by: Weichen Xu * update docs, loose classifier label limit Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * multiclass classifier merge metrics/plots Signed-off-by: Weichen Xu * zfill feature name Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * add config max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * update tests Signed-off-by: Weichen Xu * improve label handling Signed-off-by: Weichen Xu * refactor Signed-off-by: Weichen Xu * add tests Signed-off-by: Weichen Xu * black Signed-off-by: Weichen Xu * update Signed-off-by: Weichen Xu * increase plot dpi Signed-off-by: Weichen Xu * fix test fixture Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu * update doc Signed-off-by: Weichen Xu * use matplot rc_context Signed-off-by: Weichen Xu * fix shap import Signed-off-by: Weichen Xu * refactor EvaluationDataset Signed-off-by: Weichen Xu * limit user specify shap algos Signed-off-by: Weichen Xu * clean Signed-off-by: Weichen Xu * update evaluation dataset Signed-off-by: Weichen Xu * use svg fig Signed-off-by: Weichen Xu * revert svg Signed-off-by: Weichen Xu * curve dashline, legend display ap/roc, legend move out Signed-off-by: Weichen Xu * linewidth 1 Signed-off-by: Weichen Xu * keyword arguments for evaluate, fix tests Signed-off-by: Weichen Xu * mark abc.abstractmethod, kw args for ModelEvaluator methods Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu * fix pylint Signed-off-by: Weichen Xu --- mlflow/models/evaluation/artifacts.py | 23 + mlflow/models/evaluation/base.py | 597 +++++++++++----- mlflow/models/evaluation/default_evaluator.py | 672 ++++++++++++++++++ .../models/evaluation/evaluator_registry.py | 3 + mlflow/models/evaluation/lift_curve.py | 165 +++++ mlflow/models/utils.py | 19 + mlflow/utils/string_utils.py | 10 + requirements/small-requirements.txt | 3 + tests/models/test_default_evaluator.py | 491 +++++++++++++ tests/models/test_evaluation.py | 386 +++++++--- .../mlflow_test_plugin/dummy_evaluator.py | 9 +- 11 files changed, 2091 insertions(+), 287 deletions(-) create mode 100644 mlflow/models/evaluation/artifacts.py create mode 100644 mlflow/models/evaluation/default_evaluator.py create mode 100644 mlflow/models/evaluation/lift_curve.py create mode 100644 tests/models/test_default_evaluator.py diff --git a/mlflow/models/evaluation/artifacts.py b/mlflow/models/evaluation/artifacts.py new file mode 100644 index 0000000000000..6343d6e15aa48 --- /dev/null +++ b/mlflow/models/evaluation/artifacts.py @@ -0,0 +1,23 @@ +import pandas as pd + +from mlflow.models.evaluation.base import EvaluationArtifact + + +class ImageEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.save(output_artifact_path) + + def _load_content_from_file(self, local_artifact_path): + from PIL.Image import open as open_image + + self._content = open_image(local_artifact_path) + return self._content + + +class CsvEvaluationArtifact(EvaluationArtifact): + def save(self, output_artifact_path): + self._content.to_csv(output_artifact_path, index=False) + + def _load_content_from_file(self, local_artifact_path): + self._content = pd.read_csv(local_artifact_path) 
+ return self._content diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 1c88ecd821c89..074da7537e7d6 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -10,8 +10,13 @@ from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.class_utils import _get_class_from_string +from mlflow.utils.annotations import experimental import logging import struct +import sys +import math +from collections import OrderedDict +from abc import ABCMeta, abstractmethod _logger = logging.getLogger(__name__) @@ -25,7 +30,7 @@ class EvaluationMetrics(dict): pass -class EvaluationArtifact: +class EvaluationArtifact(metaclass=ABCMeta): """ A model evaluation artifact containing an artifact uri and content. """ @@ -34,12 +39,13 @@ def __init__(self, uri, content=None): self._uri = uri self._content = content + @abstractmethod def _load_content_from_file(self, local_artifact_path): """ Abstract interface to load the content from local artifact file path, and return the loaded content. """ - raise NotImplementedError() + pass def load(self, local_artifact_path=None): """ @@ -57,9 +63,10 @@ def load(self, local_artifact_path=None): self._content = self._load_content_from_file(local_artifact_file) return self._content + @abstractmethod def save(self, output_artifact_path): """Save artifact content into specified path.""" - raise NotImplementedError() + pass @property def content(self): @@ -151,6 +158,78 @@ def artifacts(self) -> Dict[str, "mlflow.models.evaluation.EvaluationArtifact"]: _cached_mlflow_client = None +def _hash_uint64_ndarray_as_bytes(array): + assert len(array.shape) == 1 + # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings + return struct.pack(f">{array.size}Q", *array) + + +def _hash_ndarray_as_bytes(nd_array): + from pandas.util import hash_array + import numpy as np + + return _hash_uint64_ndarray_as_bytes( + hash_array(nd_array.flatten(order="C")) + ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) + + +def _hash_array_like_obj_as_bytes(data): + """ + Helper method to convert pandas dataframe/numpy array/list into bytes for + MD5 calculation purpose. + """ + from pandas.util import hash_pandas_object + import numpy as np + import pandas as pd + + if isinstance(data, pd.DataFrame): + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. 
+ if "pyspark" in sys.modules: + from pyspark.ml.linalg import Vector as spark_vector_type + else: + spark_vector_type = None + + def _hash_array_like_element_as_bytes(v): + if spark_vector_type is not None: + if isinstance(v, spark_vector_type): + return _hash_ndarray_as_bytes(v.toArray()) + if isinstance(v, np.ndarray): + return _hash_ndarray_as_bytes(v) + if isinstance(v, list): + return _hash_ndarray_as_bytes(np.array(v)) + return v + + data = data.applymap(_hash_array_like_element_as_bytes) + return _hash_uint64_ndarray_as_bytes(hash_pandas_object(data)) + elif isinstance(data, np.ndarray): + return _hash_ndarray_as_bytes(data) + elif isinstance(data, list): + return _hash_ndarray_as_bytes(np.array(data)) + else: + raise ValueError("Unsupported data type.") + + +def _gen_md5_for_arraylike_obj(md5_gen, data): + """ + Helper method to generate MD5 hash array-like object, the MD5 will calculate over: + - array length + - first NUM_SAMPLE_ROWS_FOR_HASH rows content + - last NUM_SAMPLE_ROWS_FOR_HASH rows content + """ + import numpy as np + + len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) + md5_gen.update(len_bytes) + if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + md5_gen.update(_hash_array_like_obj_as_bytes(data)) + else: + head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] + tail_rows = data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] + md5_gen.update(_hash_array_like_obj_as_bytes(head_rows)) + md5_gen.update(_hash_array_like_obj_as_bytes(tail_rows)) + + class EvaluationDataset: """ An input dataset for model evaluation. This is intended for use with the @@ -161,13 +240,15 @@ class EvaluationDataset: NUM_SAMPLE_ROWS_FOR_HASH = 5 SPARK_DATAFRAME_LIMIT = 10000 - def __init__(self, data, labels, name=None, path=None): + def __init__(self, data, labels, name=None, path=None, feature_names=None): """ :param data: One of the following: - A numpy array or list of evaluation features, excluding labels. - A Pandas DataFrame, or a spark DataFrame, containing evaluation features and labels. All columns will be regarded as feature columns except the "labels" column. + Note: If the mlflow model to be evaluated is a pyspark ML model, then the input data must + be a spark DataFrame contains a feature column of "Vector" type, and a label column. :param labels: One of the following: - A numpy array or list of evaluation labels, if `data` is also a numpy array or list. @@ -178,147 +259,131 @@ def __init__(self, data, labels, name=None, path=None): :param path: (Optional) the path to a serialized DataFrame (must not contain "). (e.g. a delta table, parquet file) + + :param feature_names: (Optional) A list of the feature names attached to the numpy array + input data. The argument is only useful in the case the input data is numpy array. + For pandas DataFrame input case, the pandas column name will be used as feature name. + The feature names will be shown in model explainability plots. 
""" import numpy as np import pandas as pd - try: - from pyspark.sql import DataFrame as SparkDataFrame - - supported_dataframe_types = (pd.DataFrame, SparkDataFrame) - except ImportError: - supported_dataframe_types = (pd.DataFrame,) - if name is not None and '"' in name: raise ValueError(f'Dataset name cannot include a double quote (") but got {name}') if path is not None and '"' in path: raise ValueError(f'Dataset path cannot include a double quote (") but got {path}') + self._user_specified_name = name + self._path = path + self._hash = None + + try: + # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user + # run code not related to pyspark. + if "pyspark" in sys.modules: + from pyspark.sql import DataFrame as SparkDataFrame + + supported_dataframe_types = (pd.DataFrame, SparkDataFrame) + spark_df_type = SparkDataFrame + else: + supported_dataframe_types = (pd.DataFrame,) + spark_df_type = None + except ImportError: + supported_dataframe_types = (pd.DataFrame,) + if isinstance(data, (np.ndarray, list)): if not isinstance(labels, (np.ndarray, list)): raise ValueError( "If data is a numpy array or list of evaluation features, " "labels must be a numpy array or list of evaluation labels" ) + if isinstance(data, list): + data = np.array(data) + + if len(data.shape) != 2: + raise ValueError( + "If the `data` argument is a numpy array, it must be a 2 dimension array " + "and second dimension represent the number of features. If the `data` " + "argument is a list, each of its element must be a feature array of " + "numpy array or list and all element must has the same length." + ) + + self._features_data = data + self._labels_data = labels if isinstance(labels, np.ndarray) else np.array(labels) + + if len(self._features_data) != len(self._labels_data): + raise ValueError( + "The input features example rows must be the same length with labels array." + ) + + num_features = data.shape[1] + + if feature_names is not None: + feature_names = list(feature_names) + if num_features != len(feature_names): + raise ValueError("feature name list must be the same length with feature data.") + self._feature_names = feature_names + else: + self._feature_names = [ + f"feature_{str(i + 1).zfill(math.ceil((math.log10(num_features + 1))))}" + for i in range(num_features) + ] elif isinstance(data, supported_dataframe_types): if not isinstance(labels, str): raise ValueError( "If data is a Pandas DataFrame or Spark DataFrame, labels must be the " "string name of a column from `data` that contains evaluation labels" ) + if isinstance(data, spark_df_type): + _logger.warning( + f"Specified Spark DataFrame is too large for model evaluation. Only " + f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." + "If you want evaluate on the whole spark dataframe, please manually call " + "`spark_dataframe.toPandas()`." + ) + data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() + + self._features_data = data.drop(labels, axis=1, inplace=False) + self._labels_data = data[labels].to_numpy() + + if feature_names is not None: + raise ValueError( + "If `data` argument is pandas/spark dataframe, you cannot specify the " + "`feature_names` argument, instead, the column names of the input " + "dataframe will be used as the feature names." + ) + self._feature_names = list(self._features_data.columns) else: raise ValueError( "The data argument must be a numpy array, a list or a Pandas DataFrame, or " "spark DataFrame if pyspark package installed." 
) - self._user_specified_name = name - self._original_data = data - self._data = None - self.labels = labels - self.path = path - self._hash = None + # generate dataset hash + md5_gen = hashlib.md5() + _gen_md5_for_arraylike_obj(md5_gen, self._features_data) + _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) + md5_gen.update(",".join(self._feature_names).encode("UTF-8")) - @property - def data(self): - """ - Return original data if data is numpy array or pandas dataframe, - For spark dataframe, will only return the first SPARK_DATAFRAME_LIMIT rows as pandas - dataframe and emit warning. - """ - if self._data is not None: - return self._data - - try: - from pyspark.sql import DataFrame as SparkDataFrame - - spark_df_type = SparkDataFrame - except ImportError: - spark_df_type = None + self._hash = md5_gen.hexdigest() - if spark_df_type and isinstance(self._original_data, spark_df_type): - self._data = self._original_data.limit( - EvaluationDataset.SPARK_DATAFRAME_LIMIT - ).toPandas() - _logger.warning( - f"Specified Spark DataFrame is too large for model evaluation. Only " - f"the first {EvaluationDataset.SPARK_DATAFRAME_LIMIT} rows will be used." - ) - else: - self._data = self._original_data - - return self._data - - def _extract_features_and_labels(self): - """ - Extract features data and labels data. - For spark dataframe, will only extract the first SPARK_DATAFRAME_LIMIT rows data - and emit warning. - """ - import numpy as np - - if isinstance(self.data, np.ndarray): - return self.data, self.labels - else: - return ( - self.data.drop(self.labels, axis=1, inplace=False), - self.data[self.labels].to_numpy(), - ) - - @staticmethod - def _convert_uint64_ndarray_to_bytes(array): - assert len(array.shape) == 1 - # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings - return struct.pack(f">{array.size}Q", *array) + @property + def feature_names(self): + return self._feature_names - @staticmethod - def _array_like_obj_to_bytes(data): + @property + def features_data(self): """ - Helper method to convert pandas dataframe/numpy array/list into bytes for - MD5 calculation purpose. + return features data as a numpy array or a pandas DataFrame. 
""" - from pandas.util import hash_pandas_object, hash_array - import numpy as np - import pandas as pd + return self._features_data - if isinstance(data, pd.DataFrame): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_pandas_object(data)) - elif isinstance(data, np.ndarray): - return EvaluationDataset._convert_uint64_ndarray_to_bytes( - hash_array(data.flatten(order="C")) - ) - elif isinstance(data, list): - return EvaluationDataset._convert_uint64_ndarray_to_bytes(hash_array(np.array(data))) - else: - raise ValueError("Unsupported data type.") - - @staticmethod - def _gen_md5_for_arraylike_obj(md5_gen, data): + @property + def labels_data(self): """ - Helper method to generate MD5 hash array-like object, the MD5 will calculate over: - - array length - - first NUM_SAMPLE_ROWS_FOR_HASH rows content - - last NUM_SAMPLE_ROWS_FOR_HASH rows content + return labels data as a numpy array """ - import numpy as np - - len_bytes = EvaluationDataset._convert_uint64_ndarray_to_bytes( - np.array([len(data)], dtype="uint64") - ) - md5_gen.update(len_bytes) - if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - md5_gen.update(EvaluationDataset._array_like_obj_to_bytes(data)) - else: - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] - ) - ) - md5_gen.update( - EvaluationDataset._array_like_obj_to_bytes( - data[-EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH :] - ) - ) + return self._labels_data @property def name(self): @@ -329,25 +394,17 @@ def name(self): return self._user_specified_name if self._user_specified_name is not None else self.hash @property - def hash(self): + def path(self): """ - Compute a hash from the specified dataset by selecting the first 5 records, last 5 records, - dataset size and feeding them through a cheap, low-collision hash function + Dataset path """ - import numpy as np - import pandas as pd + return self._path - if self._hash is None: - md5_gen = hashlib.md5() - if isinstance(self.data, np.ndarray): - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.labels) - elif isinstance(self.data, pd.DataFrame): - column_names = ",".join(self.data.columns) - meta_str = f"columns={column_names}\nlabels={self.labels}" - md5_gen.update(meta_str.encode("UTF-8")) - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, self.data) - self._hash = md5_gen.hexdigest() + @property + def hash(self): + """ + Dataset hash, includes hash on first 20 rows and last 20 rows. + """ return self._hash @property @@ -363,7 +420,7 @@ def _metadata(self): metadata["path"] = self.path return metadata - def _log_dataset_tag(self, client, run_id): + def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will append current dataset metadata into existing tag content. 
@@ -374,10 +431,14 @@ def _log_dataset_tag(self, client, run_id): dataset_metadata_list = json.loads(existing_dataset_metadata_str) for metadata in dataset_metadata_list: - if metadata["hash"] == self.hash and metadata["name"] == self._user_specified_name: + if ( + metadata["hash"] == self.hash + and metadata["name"] == self.name + and metadata["model"] == model_uuid + ): break else: - dataset_metadata_list.append(self._metadata) + dataset_metadata_list.append({**self._metadata, "model": model_uuid}) dataset_metadata_str = json.dumps(dataset_metadata_list, separators=(",", ":")) client.log_batch( @@ -386,8 +447,9 @@ def _log_dataset_tag(self, client, run_id): ) -class ModelEvaluator: - def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: +class ModelEvaluator(metaclass=ABCMeta): + @abstractmethod + def can_evaluate(self, *, model_type, evaluator_config, **kwargs) -> bool: """ :param model_type: A string describing the model type (e.g., "regressor", "classifier", …). @@ -401,15 +463,8 @@ def can_evaluate(self, model_type, evaluator_config=None, **kwargs) -> bool: """ raise NotImplementedError() - def evaluate( - self, - model, - model_type, - dataset, - run_id, - evaluator_config, - **kwargs, - ): + @abstractmethod + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): """ The abstract API to log metrics and artifacts, and return evaluation results. @@ -456,12 +511,149 @@ def _start_run_or_reuse_active_run(run_id): else: if run_id and active_run.info.run_id != run_id: raise ValueError( - "An active run exists, you cannot specify another run_id when " "evaluating." + "An active run exists, you cannot specify another run_id when evaluating." ) yield active_run.info.run_id +def _normalize_evaluators_and_evaluator_config_args( + evaluators, + evaluator_config, +): + from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry + + def check_nesting_config_dict(_evaluator_name_list, _evaluator_name_to_conf_map): + return isinstance(_evaluator_name_to_conf_map, dict) and all( + k in _evaluator_name_list and isinstance(v, dict) + for k, v in _evaluator_name_to_conf_map.items() + ) + + if evaluators is None: + evaluator_name_list = list(_model_evaluation_registry._registry.keys()) + if len(evaluator_name_list) > 1: + _logger.warning( + f"Multiple registered evaluators are found {evaluator_name_list} and " + "they will all be used in evaluation if they support the specified model type. " + "If you want to evaluate with one evaluator, specify the `evaluator` argument " + "and optionally specify the `evaluator_config` argument." + ) + if evaluator_config is not None: + conf_dict_value_error = ValueError( + "If `evaluators` argument is None, all available evaluators will be used. " + "If only the default evaluator is available, the `evaluator_config` argument is " + "interpreted as the config dictionary for the default evaluator. Otherwise, the " + "`evaluator_config` argument must be a dictionary mapping each evaluator's name " + "to its own evaluator config dictionary." 
+            )
+            if evaluator_name_list == ["default"]:
+                if not isinstance(evaluator_config, dict):
+                    raise conf_dict_value_error
+                elif "default" not in evaluator_config:
+                    evaluator_name_to_conf_map = {"default": evaluator_config}
+                else:
+                    evaluator_name_to_conf_map = evaluator_config
+            else:
+                if not check_nesting_config_dict(evaluator_name_list, evaluator_config):
+                    raise conf_dict_value_error
+                evaluator_name_to_conf_map = evaluator_config
+        else:
+            evaluator_name_to_conf_map = {}
+    elif isinstance(evaluators, str):
+        if not (evaluator_config is None or isinstance(evaluator_config, dict)):
+            raise ValueError(
+                "If `evaluators` argument is the name of an evaluator, evaluator_config must be "
+                "None or a dict containing config items for the evaluator."
+            )
+        evaluator_name_list = [evaluators]
+        evaluator_name_to_conf_map = {evaluators: evaluator_config}
+    elif isinstance(evaluators, list):
+        if evaluator_config is not None:
+            if not check_nesting_config_dict(evaluators, evaluator_config):
+                raise ValueError(
+                    "If `evaluators` argument is an evaluator name list, evaluator_config "
+                    "must be a dict containing a mapping from evaluator name to individual "
+                    "evaluator config dict."
+                )
+        # Use `OrderedDict.fromkeys` to deduplicate elements but preserve their order.
+        evaluator_name_list = list(OrderedDict.fromkeys(evaluators))
+        evaluator_name_to_conf_map = evaluator_config or {}
+    else:
+        raise ValueError(
+            "`evaluators` argument must be None, an evaluator name string, or a list of "
+            "evaluator names."
+        )
+
+    return evaluator_name_list, evaluator_name_to_conf_map
+
+
+_last_failed_evaluator = None
+
+
+def _get_last_failed_evaluator():
+    """
+    Return the evaluator name of the last failed evaluator when calling `evaluate`.
+    This can be used to check which evaluator failed when the `evaluate` API fails.
+    """
+    return _last_failed_evaluator
+
+
+def _evaluate(
+    *, model, model_type, dataset, actual_run_id, evaluator_name_list, evaluator_name_to_conf_map
+):
+    """
+    The public API "evaluate" verifies the arguments first, and then passes the normalized
+    arguments to this _evaluate method.
+    """
+    # import _model_evaluation_registry inside the function to avoid circular imports
+    from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
+
+    global _last_failed_evaluator
+    _last_failed_evaluator = None
+
+    client = mlflow.tracking.MlflowClient()
+    model_uuid = model.metadata.model_uuid
+    dataset._log_dataset_tag(client, actual_run_id, model_uuid)
+
+    eval_results = []
+    for evaluator_name in evaluator_name_list:
+        config = evaluator_name_to_conf_map.get(evaluator_name) or {}
+        try:
+            evaluator = _model_evaluation_registry.get_evaluator(evaluator_name)
+        except MlflowException:
+            _logger.warning(f"Evaluator '{evaluator_name}' is not registered.")
+            continue
+
+        _last_failed_evaluator = evaluator_name
+        if evaluator.can_evaluate(model_type=model_type, evaluator_config=config):
+            _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.")
+            result = evaluator.evaluate(
+                model=model,
+                model_type=model_type,
+                dataset=dataset,
+                run_id=actual_run_id,
+                evaluator_config=config,
+            )
+            eval_results.append(result)
+
+    _last_failed_evaluator = None
+
+    if len(eval_results) == 0:
+        raise ValueError(
+            "The model could not be evaluated by any of the registered evaluators, please "
+            "verify that the model type and other configs are set correctly."
+        )
+
+    merged_eval_result = EvaluationResult(EvaluationMetrics(), dict())
+    for eval_result in eval_results:
+        merged_eval_result.metrics.update(eval_result.metrics)
+        merged_eval_result.artifacts.update(eval_result.artifacts)
+
+    return merged_eval_result
+
+
+@experimental
 def evaluate(
+    *,
     model: Union[str, "mlflow.pyfunc.PyFuncModel"],
     model_type: str,
     dataset: "mlflow.models.evaluation.EvaluationDataset",
@@ -496,33 +688,72 @@ def evaluate(
                              a nested dictionary whose key is the evaluator name.
     :return: An :py:class:`mlflow.models.evaluation.EvaluationDataset` instance containing
              evaluation results.
+
+    The default evaluator supports the 'regressor' and 'classifier' model types.
+
+    For both the 'regressor' and 'classifier' types, the default evaluator will generate model
+    summary plots and feature importance plots using the shap explainer.
+
+    For regressor models, the default evaluator will additionally log:
+
+    - **metrics**: example_count, mean_absolute_error, mean_squared_error, root_mean_squared_error,
+      sum_on_label, mean_on_label, r2_score, max_error, mean_absolute_percentage_error.
+
+    For binary classifiers, the default evaluator will additionally log:
+
+    - **metrics**: true_negatives, false_positives, false_negatives, true_positives, recall,
+      precision, f1_score, accuracy, example_count, log_loss, roc_auc, precision_recall_auc.
+    - **artifacts**: lift curve plot, precision-recall plot, ROC plot.
+
+    For multiclass classifiers, the default evaluator will additionally log:
+
+    - **metrics**: accuracy, example_count, f1_score_micro, f1_score_macro, log_loss.
+    - **artifacts**: A CSV file for "per_class_metrics" (the per-class metrics include
+      true_negatives/false_positives/false_negatives/true_positives/recall/precision/roc_auc/
+      precision_recall_auc), a merged precision-recall curves plot, and a merged ROC curves plot.
+
+    The available `evaluator_config` options for the default evaluator include:
+
+    - **log_model_explainability**: A boolean value specifying whether or not to log model
+      explainability insights. Default value is True.
+    - **explainability_algorithm**: A string specifying the SHAP Explainer algorithm to use for
+      model explainability. Supported algorithms include: 'exact', 'permutation', 'partition'.
+      If not set, `shap.Explainer` is used with the "auto" algorithm, which chooses the best
+      Explainer based on the model.
+    - **explainability_nsamples**: The number of sample rows to use for computing model
+      explainability insights. Default value is 2000.
+    - **max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier**:
+      For multiclass classifiers, the maximum number of classes for which per-class ROC curves
+      and Precision-Recall curves are logged. Default value is 10.
+
+    Limitations of the evaluation dataset:
+    - If the input dataset is a pandas DataFrame, the feature columns must be scalar value
+      columns; other object types (np.ndarray/list/etc.) are not supported yet.
+    - If the mlflow model to be evaluated is a pyspark ML model, then the input data must
+      be a spark DataFrame or a pandas DataFrame containing a feature column with values of type
+      "pyspark.ml.linalg.Vector", and a label column.
+    - For classifiers, the evaluation dataset labels must contain all distinct values (the
+      label data is used to infer the number of classes). For binary classifiers, the
+      negative label value must be 0, -1, or False, and the positive label value must be
+      1 or True.
+      For multiclass classifiers, if logging explainability insights is enabled, the label
+      values must be numeric.
+
+    Limitations of metrics/artifacts computation:
+    - For classifiers, some metric and plot computations require the model to provide a
+      "predict probability" function. Currently, for sklearn models, the "predict_proba" method
+      is extracted from the raw model to achieve this; for other models, metrics/artifacts that
+      require probability predictions are skipped.
+
+    Limitations of the default evaluator logging model explainability insights:
+    - The `shap.Explainer` "auto" algorithm chooses the Linear explainer for linear models and
+      the Tree explainer for tree models. However, the shap Linear/Tree explainers do not
+      support multi-class classifiers; in this case, the default evaluator falls back to the
+      shap Exact or Permutation explainer.
+    - Logging model explainability insights is not currently supported for PySpark models.
     """
-    # import _model_evaluation_registry and PyFuncModel inside function to avoid circuit importing
-    from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry
     from mlflow.pyfunc import PyFuncModel

-    if not evaluators:
-        evaluators = list(_model_evaluation_registry._registry.keys())
-
-    if isinstance(evaluators, str):
-        evaluators = [evaluators]
-        if not (evaluator_config is None or isinstance(evaluator_config, dict)):
-            raise ValueError(
-                "If `evaluators` argument is a str, evaluator_config must be None or a dict."
-            )
-        evaluator_config = {evaluators[0]: evaluator_config}
-    elif isinstance(evaluators, list):
-        evaluators = set(evaluators)
-        if not (
-            isinstance(evaluator_config, dict)
-            and all(k in evaluators and isinstance(v, dict) for k, v in evaluator_config.items())
-        ):
-            raise ValueError(
-                "If `evaluators` argument is a evaluator name list, evaluator_config"
-                "must be a dict contains mapping from evaluator name to individual "
-                "evaluator config dict."
-            )
-
     if isinstance(model, str):
         model = mlflow.pyfunc.load_model(model)
     elif isinstance(model, PyFuncModel):
@@ -533,27 +764,17 @@ def evaluate(
             "an instance of `mlflow.pyfunc.PyFuncModel`."
) + ( + evaluator_name_list, + evaluator_name_to_conf_map, + ) = _normalize_evaluators_and_evaluator_config_args(evaluators, evaluator_config) + with _start_run_or_reuse_active_run(run_id) as actual_run_id: - client = mlflow.tracking.MlflowClient() - dataset._log_dataset_tag(client, actual_run_id) - - eval_results = [] - for evaluator_name in evaluators: - config = evaluator_config.get(evaluator_name) or {} - try: - evaluator = _model_evaluation_registry.get_evaluator(evaluator_name) - except MlflowException: - _logger.warning(f"Evaluator '{evaluator_name}' is not registered.") - continue - - if evaluator.can_evaluate(model_type, config): - _logger.info(f"Evaluating the model with the {evaluator_name} evaluator.") - result = evaluator.evaluate(model, model_type, dataset, actual_run_id, config) - eval_results.append(result) - - merged_eval_result = EvaluationResult(EvaluationMetrics(), dict()) - for eval_result in eval_results: - merged_eval_result.metrics.update(eval_result.metrics) - merged_eval_result.artifacts.update(eval_result.artifacts) - - return merged_eval_result + return _evaluate( + model=model, + model_type=model_type, + dataset=dataset, + actual_run_id=actual_run_id, + evaluator_name_list=evaluator_name_list, + evaluator_name_to_conf_map=evaluator_name_to_conf_map, + ) diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py new file mode 100644 index 0000000000000..6ee604aed8988 --- /dev/null +++ b/mlflow/models/evaluation/default_evaluator.py @@ -0,0 +1,672 @@ +import mlflow +from mlflow.models.evaluation.base import ( + ModelEvaluator, + EvaluationMetrics, + EvaluationResult, +) +from mlflow.entities.metric import Metric +from mlflow.utils.file_utils import TempDir +from mlflow.utils.string_utils import truncate_str_from_middle +from mlflow.models.utils import plot_lines +from mlflow.models.evaluation.artifacts import ImageEvaluationArtifact, CsvEvaluationArtifact + +from sklearn import metrics as sk_metrics +import math +from collections import namedtuple +import numbers +import pandas as pd +import numpy as np +import time +from functools import partial +import logging +from packaging.version import Version + +_logger = logging.getLogger(__name__) + +_DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 + + +def _infer_model_type_by_labels(labels): + distinct_labels = set(labels) + for v in distinct_labels: + if not isinstance(v, numbers.Number): + return "classifier" + if not float(v).is_integer(): + return "regressor" + if len(distinct_labels) > 1000 and len(distinct_labels) / len(labels) > 0.7: + return "regressor" + return "classifier" + + +def _extract_raw_model_and_predict_fn(model): + model_loader_module = model.metadata.flavors["python_function"]["loader_module"] + predict_fn = model.predict + predict_proba_fn = None + + try: + if model_loader_module == "mlflow.sklearn": + raw_model = model._model_impl + else: + raw_model = None + except Exception as e: + raw_model = None + _logger.warning( + f"Raw model resolution fails unexpectedly on PyFuncModel {model!r}, " + f"error message is {e}" + ) + + if raw_model: + predict_fn = raw_model.predict + predict_proba_fn = getattr(raw_model, "predict_proba", None) + + try: + import xgboost + + if isinstance(raw_model, xgboost.XGBModel): + # Because shap evaluation will pass evaluation data in ndarray format + # (without feature names), if set validate_features=True it will raise error. 
+ predict_fn = partial(predict_fn, validate_features=False) + if predict_proba_fn is not None: + predict_proba_fn = partial(predict_proba_fn, validate_features=False) + except ImportError: + pass + + return model_loader_module, raw_model, predict_fn, predict_proba_fn + + +def _gen_log_key(key, dataset_name): + return f"{key}_on_data_{dataset_name}" + + +def _get_regressor_metrics(y, y_pred): + return { + "example_count": len(y), + "mean_absolute_error": sk_metrics.mean_absolute_error(y, y_pred), + "mean_squared_error": sk_metrics.mean_squared_error(y, y_pred), + "root_mean_squared_error": math.sqrt(sk_metrics.mean_squared_error(y, y_pred)), + "sum_on_label": sum(y), + "mean_on_label": sum(y) / len(y), + "r2_score": sk_metrics.r2_score(y, y_pred), + "max_error": sk_metrics.max_error(y, y_pred), + "mean_absolute_percentage_error": sk_metrics.mean_absolute_percentage_error(y, y_pred), + } + + +def _get_binary_sum_up_label_pred_prob(positive_class_index, positive_class, y, y_pred, y_probs): + y = np.array(y) + y_bin = np.where(y == positive_class, 1, 0) + y_pred_bin = None + y_prob_bin = None + if y_pred is not None: + y_pred = np.array(y_pred) + y_pred_bin = np.where(y_pred == positive_class, 1, 0) + + if y_probs is not None: + y_probs = np.array(y_probs) + y_prob_bin = y_probs[:, positive_class_index] + + return y_bin, y_pred_bin, y_prob_bin + + +def _get_classifier_per_class_metrics(y, y_pred): + """ + get classifier metrics which computing over a specific class. + For binary classifier, y/y_pred is for the positive class. + For multiclass classifier, y/y_pred sum up to a binary "is class" and "is not class". + """ + metrics = {} + confusion_matrix = sk_metrics.confusion_matrix(y, y_pred) + tn, fp, fn, tp = confusion_matrix.ravel() + metrics["true_negatives"] = tn + metrics["false_positives"] = fp + metrics["false_negatives"] = fn + metrics["true_positives"] = tp + metrics["recall"] = sk_metrics.recall_score(y, y_pred) + metrics["precision"] = sk_metrics.precision_score(y, y_pred) + metrics["f1_score"] = sk_metrics.f1_score(y, y_pred) + return metrics + + +def _get_classifier_global_metrics(is_binomial, y, y_pred, y_probs, labels): + """ + get classifier metrics which computing over all classes examples. + """ + metrics = {} + metrics["accuracy"] = sk_metrics.accuracy_score(y, y_pred) + metrics["example_count"] = len(y) + + if not is_binomial: + metrics["f1_score_micro"] = sk_metrics.f1_score(y, y_pred, average="micro", labels=labels) + metrics["f1_score_macro"] = sk_metrics.f1_score(y, y_pred, average="macro", labels=labels) + + if y_probs is not None: + metrics["log_loss"] = sk_metrics.log_loss(y, y_probs, labels=labels) + + return metrics + + +def _get_classifier_per_class_metrics_collection_df(y, y_pred, labels): + per_class_metrics_list = [] + for positive_class_index, positive_class in enumerate(labels): + (y_bin, y_pred_bin, _,) = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, y_pred, None + ) + + per_class_metrics = {"positive_class": positive_class} + per_class_metrics.update(_get_classifier_per_class_metrics(y_bin, y_pred_bin)) + per_class_metrics_list.append(per_class_metrics) + + return pd.DataFrame(per_class_metrics_list) + + +_Curve = namedtuple("_Curve", ["plot_fn", "plot_fn_args", "auc"]) + + +def _gen_classifier_curve( + is_binomial, + y, + y_probs, + labels, + curve_type, +): + """ + Generate precision-recall curve or ROC curve for classifier. 
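+
+    For a multiclass classifier, each class is reduced to a one-vs-rest binary problem
+    (via `_get_binary_sum_up_label_pred_prob`) and one curve line is generated per class.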
+ :param is_binomial: True if it is binary classifier otherwise False + :param y: True label values + :param y_probs: if binary classifer, the predicted probability for positive class. + if multiclass classiifer, the predicted probabilities for all classes. + :param labels: The set of labels. + :param curve_type: "pr" or "roc" + :return: An instance of "_Curve" which includes attributes "plot_fn", "plot_fn_args", "auc". + """ + if curve_type == "roc": + + def gen_line_x_y_label_fn(_y, _y_prob): + fpr, tpr, _ = sk_metrics.roc_curve(_y, _y_prob) + auc = sk_metrics.auc(fpr, tpr) + return fpr, tpr, f"AUC={auc:.3f}" + + xlabel = "False Positive Rate" + ylabel = "True Positive Rate" + elif curve_type == "pr": + + def gen_line_x_y_label_fn(_y, _y_prob): + precision, recall, _thresholds = sk_metrics.precision_recall_curve(_y, _y_prob) + ap = np.mean(precision) + return recall, precision, f"AP={ap:.3f}" + + xlabel = "recall" + ylabel = "precision" + else: + assert False, "illegal curve type" + + if is_binomial: + x_data, y_data, line_label = gen_line_x_y_label_fn(y, y_probs) + data_series = [(line_label, x_data, y_data)] + auc = sk_metrics.auc(x_data, y_data) + else: + curve_list = [] + for positive_class_index, positive_class in enumerate(labels): + y_bin, _, y_prob_bin = _get_binary_sum_up_label_pred_prob( + positive_class_index, positive_class, y, None, y_probs + ) + + x_data, y_data, line_label = gen_line_x_y_label_fn(y_bin, y_prob_bin) + curve_list.append((positive_class, x_data, y_data, line_label)) + + data_series = [ + (f"label={positive_class},{line_label}", x_data, y_data) + for positive_class, x_data, y_data, line_label in curve_list + ] + auc = [sk_metrics.auc(x_data, y_data) for _, x_data, y_data, _ in curve_list] + + def _do_plot(**kwargs): + import matplotlib.pyplot as pyplot + + _, ax = plot_lines(**kwargs) + dash_line_args = { + "color": "gray", + "alpha": 0.3, + "drawstyle": "default", + "linestyle": "dashed", + } + if curve_type == "pr": + ax.plot([0, 1], [1, 0], **dash_line_args) + elif curve_type == "roc": + ax.plot([0, 1], [0, 1], **dash_line_args) + + if is_binomial: + ax.legend(loc="best") + else: + ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) + pyplot.subplots_adjust(right=0.6, bottom=0.25) + + return _Curve( + plot_fn=_do_plot, + plot_fn_args={ + "data_series": data_series, + "xlabel": xlabel, + "ylabel": ylabel, + "line_kwargs": {"drawstyle": "steps-post", "linewidth": 1}, + }, + auc=auc, + ) + + +_matplotlib_config = { + "figure.dpi": 288, + "figure.figsize": [6.0, 4.0], +} + + +# pylint: disable=attribute-defined-outside-init +class DefaultEvaluator(ModelEvaluator): + # pylint: disable=unused-argument + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): + return model_type in ["classifier", "regressor"] + + def _log_metrics(self): + """ + Helper method to log metrics into specified run. 
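+
+        Each metric key is namespaced per evaluation dataset via `_gen_log_key`, e.g. an
+        "accuracy" value computed on a dataset named "iris_dataset" is logged under the key
+        "accuracy_on_data_iris_dataset".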
+ """ + timestamp = int(time.time() * 1000) + self.client.log_batch( + self.run_id, + metrics=[ + Metric( + key=_gen_log_key(key, self.dataset_name), + value=value, + timestamp=timestamp, + step=0, + ) + for key, value in self.metrics.items() + ], + ) + + def _log_image_artifact( + self, + do_plot, + artifact_name, + ): + import matplotlib.pyplot as pyplot + + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".png" + artifact_file_local_path = self.temp_dir.path(artifact_file_name) + + try: + pyplot.clf() + do_plot() + pyplot.savefig(artifact_file_local_path) + finally: + pyplot.close(pyplot.gcf()) + + mlflow.log_artifact(artifact_file_local_path) + artifact = ImageEvaluationArtifact(uri=mlflow.get_artifact_uri(artifact_file_name)) + artifact.load(artifact_file_local_path) + self.artifacts[artifact_name] = artifact + + def _log_pandas_df_artifact(self, pandas_df, artifact_name): + artifact_file_name = _gen_log_key(artifact_name, self.dataset_name) + ".csv" + artifact_file_local_path = self.temp_dir.path(artifact_file_name) + pandas_df.to_csv(artifact_file_local_path, index=False) + mlflow.log_artifact(artifact_file_local_path) + artifact = CsvEvaluationArtifact( + uri=mlflow.get_artifact_uri(artifact_file_name), + content=pandas_df, + ) + artifact.load(artifact_file_local_path) + self.artifacts[artifact_name] = artifact + + def _log_model_explainability(self): + if not self.evaluator_config.get("log_model_explainability", True): + return + + if self.model_loader_module == "mlflow.spark": + # TODO: Shap explainer need to manipulate on each feature values, + # but spark model input dataframe contains Vector type feature column + # which shap explainer does not support. + # To support this, we need expand the Vector type feature column into + # multiple scaler feature columns and pass it to shap explainer. + _logger.warning( + "Logging model explainability insights is not currently supported for PySpark " + "models." + ) + return + + if self.model_type == "classifier" and not all( + [isinstance(label, (numbers.Number, np.bool_)) for label in self.label_list] + ): + _logger.warning( + "Skip logging model explainability insights because it requires all label " + "values to be Number type." + ) + return + + try: + import shap + import matplotlib.pyplot as pyplot + except ImportError: + _logger.warning( + "SHAP or matplotlib package is not installed, so model explainability insights " + "will not be logged." + ) + return + + if Version(shap.__version__) < Version("0.40"): + _logger.warning( + "Shap package version is lower than 0.40, Skip log model explainability." 
+ ) + return + + is_multinomial_classifier = self.model_type == "classifier" and self.num_classes > 2 + + sample_rows = self.evaluator_config.get( + "explainability_nsamples", _DEFAULT_SAMPLE_ROWS_FOR_SHAP + ) + algorithm = self.evaluator_config.get("explainability_algorithm", None) + + truncated_feature_names = [truncate_str_from_middle(f, 20) for f in self.feature_names] + for i, truncated_name in enumerate(truncated_feature_names): + if truncated_name != self.feature_names[i]: + # For duplicated truncated name, attach "(f_{feature_index})" at the end + truncated_feature_names[i] = f"{truncated_name}(f_{i + 1})" + + truncated_feature_name_map = { + f: f2 for f, f2 in zip(self.feature_names, truncated_feature_names) + } + + sampled_X = shap.sample(self.X, sample_rows) + + if isinstance(sampled_X, pd.DataFrame): + # For some shap explainer, the plot will use the DataFrame column names instead of + # using feature_names argument value. So rename the dataframe column names. + sampled_X = sampled_X.rename(columns=truncated_feature_name_map, copy=False) + + if algorithm: + supported_algos = ["exact", "permutation", "partition"] + if algorithm not in supported_algos: + raise ValueError( + f"Specified explainer algorithm {algorithm} is unsupported. Currently only " + f"support {','.join(supported_algos)} algorithms." + ) + explainer = shap.Explainer( + self.predict_fn, + sampled_X, + feature_names=truncated_feature_names, + algorithm=algorithm, + ) + else: + if self.raw_model and not is_multinomial_classifier: + # For mulitnomial classifier, shap.Explainer may choose Tree/Linear explainer for + # raw model, this case shap plot doesn't support it well, so exclude the + # multinomial_classifier case here. + explainer = shap.Explainer( + self.raw_model, sampled_X, feature_names=truncated_feature_names + ) + else: + # fallback to default explainer + explainer = shap.Explainer( + self.predict_fn, sampled_X, feature_names=truncated_feature_names + ) + + _logger.info(f"Shap explainer {explainer.__class__.__name__} is used.") + + shap_values = explainer(sampled_X) + + try: + mlflow.shap.log_explainer( + explainer, artifact_path=_gen_log_key("explainer", self.dataset_name) + ) + except Exception as e: + # TODO: The explainer saver is buggy, if `get_underlying_model_flavor` return "unknown", + # then fallback to shap explainer saver, and shap explainer will call `model.save` + # for sklearn model, there is no `.save` method, so error will happen. + _logger.warning(f"Log explainer failed. 
Reason: {str(e)}") + + def plot_beeswarm(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.plots.beeswarm(shap_values, show=False) + + self._log_image_artifact( + plot_beeswarm, + "shap_beeswarm_plot", + ) + + def plot_summary(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.summary_plot(shap_values, show=False) + + self._log_image_artifact( + plot_summary, + "shap_summary_plot", + ) + + def plot_feature_importance(): + pyplot.subplots_adjust(bottom=0.2, left=0.4) + shap.plots.bar(shap_values, show=False) + + self._log_image_artifact( + plot_feature_importance, + "shap_feature_importance_plot", + ) + + def _log_binary_classifier(self): + self.metrics.update(_get_classifier_per_class_metrics(self.y, self.y_pred)) + + if self.y_probs is not None: + roc_curve = _gen_classifier_curve( + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="roc", + ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + self.metrics["roc_auc"] = roc_curve.auc + + pr_curve = _gen_classifier_curve( + is_binomial=True, + y=self.y, + y_probs=self.y_prob, + labels=self.label_list, + curve_type="pr", + ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") + self.metrics["precision_recall_auc"] = pr_curve.auc + + def _log_multiclass_classifier(self): + per_class_metrics_collection_df = _get_classifier_per_class_metrics_collection_df( + self.y, self.y_pred, self.label_list + ) + + log_roc_pr_curve = False + if self.y_probs is not None: + max_num_classes_for_logging_curve = self.evaluator_config.get( + "max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier", 10 + ) + if self.num_classes <= max_num_classes_for_logging_curve: + log_roc_pr_curve = True + else: + _logger.warning( + f"The classifier num_classes > {max_num_classes_for_logging_curve}, skip " + f"logging ROC curve and Precision-Recall curve. You can add evaluator config " + f"'max_num_classes_threshold_logging_roc_pr_curve_for_multiclass_classifier' " + f"to increase the threshold." 
+ ) + + if log_roc_pr_curve: + roc_curve = _gen_classifier_curve( + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="roc", + ) + + def plot_roc_curve(): + roc_curve.plot_fn(**roc_curve.plot_fn_args) + + self._log_image_artifact(plot_roc_curve, "roc_curve_plot") + per_class_metrics_collection_df["roc_auc"] = roc_curve.auc + + pr_curve = _gen_classifier_curve( + is_binomial=False, + y=self.y, + y_probs=self.y_probs, + labels=self.label_list, + curve_type="pr", + ) + + def plot_pr_curve(): + pr_curve.plot_fn(**pr_curve.plot_fn_args) + + self._log_image_artifact(plot_pr_curve, "precision_recall_curve_plot") + per_class_metrics_collection_df["precision_recall_auc"] = pr_curve.auc + + self._log_pandas_df_artifact(per_class_metrics_collection_df, "per_class_metrics") + + def _evaluate_classifier(self): + from mlflow.models.evaluation.lift_curve import plot_lift_curve + + self.label_list = np.unique(self.y) + self.num_classes = len(self.label_list) + + self.y_pred = self.predict_fn(self.X) + self.is_binomial = self.num_classes <= 2 + + if self.is_binomial: + if list(self.label_list) not in [[0, 1], [-1, 1]]: + raise ValueError( + "Binary classifier evaluation dataset positive class label must be 1 or True, " + "negative class label must be 0 or -1 or False, and dataset must contains " + "both positive and negative examples." + ) + _logger.info( + "The evaluation dataset is inferred as binary dataset, positive label is " + f"{self.label_list[1]}, negative label is {self.label_list[0]}." + ) + else: + _logger.info( + "The evaluation dataset is inferred as multiclass dataset, number of classes " + f"is inferred as {self.num_classes}" + ) + + if self.predict_proba_fn is not None: + self.y_probs = self.predict_proba_fn(self.X) + if self.is_binomial: + self.y_prob = self.y_probs[:, 1] + else: + self.y_prob = None + else: + self.y_probs = None + self.y_prob = None + + self.metrics.update( + _get_classifier_global_metrics( + self.is_binomial, self.y, self.y_pred, self.y_probs, self.label_list + ) + ) + + if self.is_binomial: + self._log_binary_classifier() + else: + self._log_multiclass_classifier() + + if self.is_binomial and self.y_probs is not None: + self._log_image_artifact( + lambda: plot_lift_curve(self.y, self.y_probs), + "lift_curve_plot", + ) + + # normalize the confusion matrix, keep consistent with sklearn autologging. 
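+        # With normalize="true", each row of the matrix (i.e. each true class) sums to 1,
+        # so the plotted cells show per-class rates rather than raw counts.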
+ confusion_matrix = sk_metrics.confusion_matrix( + self.y, self.y_pred, labels=self.label_list, normalize="true" + ) + + def plot_confusion_matrix(): + sk_metrics.ConfusionMatrixDisplay( + confusion_matrix=confusion_matrix, + display_labels=self.label_list, + ).plot(cmap="Blues") + + if hasattr(sk_metrics, "ConfusionMatrixDisplay"): + self._log_image_artifact( + plot_confusion_matrix, + "confusion_matrix", + ) + + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) + + def _evaluate_regressor(self): + self.y_pred = self.model.predict(self.X) + self.metrics.update(_get_regressor_metrics(self.y, self.y_pred)) + + self._log_metrics() + self._log_model_explainability() + return EvaluationResult(self.metrics, self.artifacts) + + def evaluate( + self, + *, + model: "mlflow.pyfunc.PyFuncModel", + model_type, + dataset, + run_id, + evaluator_config, + **kwargs, + ): + import matplotlib + + with TempDir() as temp_dir, matplotlib.rc_context(_matplotlib_config): + self.client = mlflow.tracking.MlflowClient() + + self.temp_dir = temp_dir + self.model = model + self.model_type = model_type + self.dataset = dataset + self.run_id = run_id + self.evaluator_config = evaluator_config + self.dataset_name = dataset.name + self.feature_names = dataset.feature_names + + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) + self.model_loader_module = model_loader_module + self.raw_model = raw_model + self.predict_fn = predict_fn + self.predict_proba_fn = predict_proba_fn + + self.X = dataset.features_data + self.y = dataset.labels_data + self.metrics = EvaluationMetrics() + self.artifacts = {} + + infered_model_type = _infer_model_type_by_labels(self.y) + + if model_type != infered_model_type: + _logger.warning( + f"According to the evaluation dataset label values, the model type looks like " + f"{infered_model_type}, but you specified model type {model_type}. Please " + f"verify that you set the `model_type` and `dataset` arguments correctly." + ) + + if model_type == "classifier": + return self._evaluate_classifier() + elif model_type == "regressor": + return self._evaluate_regressor() + else: + raise ValueError(f"Unsupported model type {model_type}") diff --git a/mlflow/models/evaluation/evaluator_registry.py b/mlflow/models/evaluation/evaluator_registry.py index 9e7f027f0496e..1822e313460ea 100644 --- a/mlflow/models/evaluation/evaluator_registry.py +++ b/mlflow/models/evaluation/evaluator_registry.py @@ -48,6 +48,9 @@ def get_evaluator(self, evaluator_name): def register_evaluators(module): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + + module._model_evaluation_registry.register("default", DefaultEvaluator) module._model_evaluation_registry.register_entrypoints() diff --git a/mlflow/models/evaluation/lift_curve.py b/mlflow/models/evaluation/lift_curve.py new file mode 100644 index 0000000000000..cbcba712a9329 --- /dev/null +++ b/mlflow/models/evaluation/lift_curve.py @@ -0,0 +1,165 @@ +import matplotlib.pyplot as plt + +import numpy as np + + +def _cumulative_gain_curve(y_true, y_score, pos_label=None): + """ + This method is copied from scikit-plot package. + + This function generates the points necessary to plot the Cumulative Gain + + Note: This implementation is restricted to the binary classification task. + + Args: + y_true (array-like, shape (n_samples)): True labels of the data. 
+
+        y_score (array-like, shape (n_samples)): Target scores, can either be
+            probability estimates of the positive class, confidence values, or
+            non-thresholded measure of decisions (as returned by
+            decision_function on some classifiers).
+
+        pos_label (int or str, default=None): Label considered as positive and
+            others are considered negative
+
+    Returns:
+        percentages (numpy.ndarray): An array containing the X-axis values for
+            plotting the Cumulative Gains chart.
+
+        gains (numpy.ndarray): An array containing the Y-axis values for one
+            curve of the Cumulative Gains chart.
+
+    Raises:
+        ValueError: If `y_true` is not composed of 2 classes. The Cumulative
+            Gain Chart is only relevant in binary classification.
+    """
+    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+    # ensure binary classification if pos_label is not specified
+    classes = np.unique(y_true)
+    if pos_label is None and not (
+        np.array_equal(classes, [0, 1])
+        or np.array_equal(classes, [-1, 1])
+        or np.array_equal(classes, [0])
+        or np.array_equal(classes, [-1])
+        or np.array_equal(classes, [1])
+    ):
+        raise ValueError("Data is not binary and pos_label is not specified")
+    elif pos_label is None:
+        pos_label = 1.0
+
+    # make y_true a boolean vector
+    y_true = y_true == pos_label
+
+    sorted_indices = np.argsort(y_score)[::-1]
+    y_true = y_true[sorted_indices]
+    gains = np.cumsum(y_true)
+
+    percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+    gains = gains / float(np.sum(y_true))
+    percentages = percentages / float(len(y_true))
+
+    gains = np.insert(gains, 0, [0])
+    percentages = np.insert(percentages, 0, [0])
+
+    return percentages, gains
+
+
+def plot_lift_curve(
+    y_true,
+    y_probas,
+    title="Lift Curve",
+    ax=None,
+    figsize=None,
+    title_fontsize="large",
+    text_fontsize="medium",
+):
+    """
+    This method is copied from scikit-plot package.
+
+    Generates the Lift Curve from labels and scores/probabilities
+
+    The lift curve is used to determine the effectiveness of a
+    binary classifier. A detailed explanation can be found at
+    http://www2.cs.uregina.ca/~dbd/cs831/notes/lift_chart/lift_chart.html.
+    The implementation here works only for binary classification.
+
+    Args:
+        y_true (array-like, shape (n_samples)):
+            Ground truth (correct) target values.
+
+        y_probas (array-like, shape (n_samples, n_classes)):
+            Prediction probabilities for each class returned by a classifier.
+
+        title (string, optional): Title of the generated plot. Defaults to
+            "Lift Curve".
+
+        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to
+            plot the learning curve. If None, the plot is drawn on a new set of
+            axes.
+
+        figsize (2-tuple, optional): Tuple denoting figure size of the plot
+            e.g. (6, 6). Defaults to ``None``.
+
+        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "large".
+
+        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
+            Use e.g. "small", "medium", "large" or integer-values. Defaults to
+            "medium".
+
+    Returns:
+        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was
+            drawn.
+
+    Example:
+        >>> lr = LogisticRegression()
+        >>> lr = lr.fit(X_train, y_train)
+        >>> y_probas = lr.predict_proba(X_test)
+        >>> plot_lift_curve(y_test, y_probas)
+
+        >>> plt.show()
+
+        .. image:: _static/examples/plot_lift_curve.png
+           :align: center
+           :alt: Lift Curve
+    """
+    y_true = np.array(y_true)
+    y_probas = np.array(y_probas)
+
+    classes = np.unique(y_true)
+    if len(classes) != 2:
+        raise ValueError(
+            "Cannot calculate Lift Curve for data with " "{} category/ies".format(len(classes))
+        )
+
+    # Compute Cumulative Gain Curves
+    percentages, gains1 = _cumulative_gain_curve(y_true, y_probas[:, 0], classes[0])
+    percentages, gains2 = _cumulative_gain_curve(y_true, y_probas[:, 1], classes[1])
+
+    percentages = percentages[1:]
+    gains1 = gains1[1:]
+    gains2 = gains2[1:]
+
+    gains1 = gains1 / percentages
+    gains2 = gains2 / percentages
+
+    if ax is None:
+        _, ax = plt.subplots(1, 1, figsize=figsize)
+
+    ax.set_title(title, fontsize=title_fontsize)
+
+    ax.plot(percentages, gains1, lw=3, label="Class {}".format(classes[0]))
+    ax.plot(percentages, gains2, lw=3, label="Class {}".format(classes[1]))
+
+    ax.plot([0, 1], [1, 1], "k--", lw=2, label="Baseline")
+
+    ax.set_xlabel("Percentage of sample", fontsize=text_fontsize)
+    ax.set_ylabel("Lift", fontsize=text_fontsize)
+    ax.tick_params(labelsize=text_fontsize)
+    ax.grid("on")
+    ax.legend(loc="best", fontsize=text_fontsize)
+
+    return ax
diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py
index bb862c4c923fe..dc9d9c1f5bede 100644
--- a/mlflow/models/utils.py
+++ b/mlflow/models/utils.py
@@ -234,3 +234,22 @@ def _read_sparse_matrix_from_json(path, example_type):
         return csc_matrix((data, indices, indptr), shape=shape)
     else:
         return csr_matrix((data, indices, indptr), shape=shape)
+
+
+def plot_lines(data_series, xlabel, ylabel, legend_loc=None, line_kwargs=None):
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots()
+
+    if line_kwargs is None:
+        line_kwargs = {}
+
+    for label, data_x, data_y in data_series:
+        ax.plot(data_x, data_y, label=label, **line_kwargs)
+
+    if legend_loc:
+        ax.legend(loc=legend_loc)
+
+    ax.set(xlabel=xlabel, ylabel=ylabel)
+
+    return fig, ax
diff --git a/mlflow/utils/string_utils.py b/mlflow/utils/string_utils.py
index e3b1d0b911301..4cce65499b7cb 100644
--- a/mlflow/utils/string_utils.py
+++ b/mlflow/utils/string_utils.py
@@ -12,3 +12,13 @@ def strip_suffix(original, suffix):
 
 def is_string_type(item):
     return isinstance(item, str)
+
+
+def truncate_str_from_middle(s, max_length):
+    assert max_length > 5
+    if len(s) <= max_length:
+        return s
+    else:
+        left_part_len = (max_length - 3) // 2
+        right_part_len = max_length - 3 - left_part_len
+        return f"{s[:left_part_len]}...{s[-right_part_len:]}"
diff --git a/requirements/small-requirements.txt b/requirements/small-requirements.txt
index 855d6a3052473..18424b787c6fc 100644
--- a/requirements/small-requirements.txt
+++ b/requirements/small-requirements.txt
@@ -13,3 +13,6 @@ moto!=2.0.7
 azure-storage-blob>=12.0.0
 azure-identity>=1.6.1
 databricks-cli@git+https://github.com/databricks/databricks-cli.git
+pillow
+matplotlib
+shap>=0.40
diff --git a/tests/models/test_default_evaluator.py b/tests/models/test_default_evaluator.py
new file mode 100644
index 0000000000000..872f599ecf717
--- /dev/null
+++ b/tests/models/test_default_evaluator.py
@@ -0,0 +1,491 @@
+import numpy as np
+import json
+
+
+from mlflow.models.evaluation import evaluate
+from mlflow.models.evaluation.default_evaluator import (
+    _get_classifier_global_metrics,
+    _infer_model_type_by_labels,
+    _extract_raw_model_and_predict_fn,
+    _get_regressor_metrics,
+    _get_binary_sum_up_label_pred_prob,
+    _get_classifier_per_class_metrics,
+    _gen_classifier_curve,
+)
+import mlflow
+from
sklearn.linear_model import LogisticRegression + +# pylint: disable=unused-import +from tests.models.test_evaluation import ( + get_run_data, + linear_regressor_model_uri, + diabetes_dataset, + multiclass_logistic_regressor_model_uri, + iris_dataset, + binary_logistic_regressor_model_uri, + breast_cancer_dataset, + spark_linear_regressor_model_uri, + diabetes_spark_dataset, + svm_model_uri, +) +from mlflow.models.utils import plot_lines + + +def assert_dict_equal(d1, d2, rtol): + for k in d1: + assert k in d2 + assert np.isclose(d1[k], d2[k], rtol=rtol) + + +def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=linear_regressor_model_uri, + model_type="regressor", + dataset=diabetes_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(linear_regressor_model_uri) + + y = diabetes_dataset.labels_data + y_pred = model.predict(diabetes_dataset.features_data) + + expected_metrics = _get_regressor_metrics(y, y_pred) + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_diabetes_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_beeswarm_plot_on_data_diabetes_dataset.png", + "shap_feature_importance_plot_on_data_diabetes_dataset.png", + "shap_summary_plot_on_data_diabetes_dataset.png", + } + assert result.artifacts.keys() == { + "shap_beeswarm_plot", + "shap_feature_importance_plot", + "shap_summary_plot", + } + + +def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=multiclass_logistic_regressor_model_uri, + model_type="classifier", + dataset=iris_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) + + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = iris_dataset.labels_data + y_pred = predict_fn(iris_dataset.features_data) + y_probs = predict_proba_fn(iris_dataset.features_data) + + expected_metrics = _get_classifier_global_metrics(False, y, y_pred, y_probs, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], metrics[metric_key + "_on_data_iris_dataset"], rtol=1e-3 + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**iris_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_beeswarm_plot_on_data_iris_dataset.png", + "per_class_metrics_on_data_iris_dataset.csv", + "roc_curve_plot_on_data_iris_dataset.png", + "precision_recall_curve_plot_on_data_iris_dataset.png", + "shap_feature_importance_plot_on_data_iris_dataset.png", + "explainer_on_data_iris_dataset", + "confusion_matrix_on_data_iris_dataset.png", + "shap_summary_plot_on_data_iris_dataset.png", + } + assert result.artifacts.keys() == { + "per_class_metrics", + "roc_curve_plot", + "precision_recall_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def 
test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=binary_logistic_regressor_model_uri, + model_type="classifier", + dataset=breast_cancer_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + + _, _, predict_fn, predict_proba_fn = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels_data + y_pred = predict_fn(breast_cancer_dataset.features_data) + y_probs = predict_proba_fn(breast_cancer_dataset.features_data) + + expected_metrics = _get_classifier_global_metrics(True, y, y_pred, y_probs, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + "lift_curve_plot_on_data_breast_cancer_dataset.png", + "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "precision_recall_curve_plot_on_data_breast_cancer_dataset.png", + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + "roc_curve_plot_on_data_breast_cancer_dataset.png", + } + assert result.artifacts.keys() == { + "roc_curve_plot", + "precision_recall_curve_plot", + "lift_curve_plot", + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=spark_linear_regressor_model_uri, + model_type="regressor", + dataset=diabetes_spark_dataset, + evaluators="default", + evaluator_config={"log_model_explainability": True}, + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) + + X = diabetes_spark_dataset.features_data + y = diabetes_spark_dataset.labels_data + y_pred = model.predict(X) + + expected_metrics = _get_regressor_metrics(y, y_pred) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_diabetes_spark_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + model = mlflow.pyfunc.load_model(spark_linear_regressor_model_uri) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**diabetes_spark_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == set() + assert result.artifacts == {} + + +def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset): + with mlflow.start_run() as run: + result = evaluate( + model=svm_model_uri, + model_type="classifier", + dataset=breast_cancer_dataset, + evaluators="default", + ) + + _, metrics, tags, artifacts = get_run_data(run.info.run_id) + + model = mlflow.pyfunc.load_model(svm_model_uri) + + _, _, predict_fn, _ = _extract_raw_model_and_predict_fn(model) + y = breast_cancer_dataset.labels_data + y_pred = predict_fn(breast_cancer_dataset.features_data) + + expected_metrics = 
_get_classifier_global_metrics(True, y, y_pred, None, labels=None) + + for metric_key in expected_metrics: + assert np.isclose( + expected_metrics[metric_key], + metrics[metric_key + "_on_data_breast_cancer_dataset"], + rtol=1e-3, + ) + assert np.isclose(expected_metrics[metric_key], result.metrics[metric_key], rtol=1e-3) + + assert json.loads(tags["mlflow.datasets"]) == [ + {**breast_cancer_dataset._metadata, "model": model.metadata.model_uuid} + ] + + assert set(artifacts) == { + "confusion_matrix_on_data_breast_cancer_dataset.png", + "shap_feature_importance_plot_on_data_breast_cancer_dataset.png", + "shap_beeswarm_plot_on_data_breast_cancer_dataset.png", + "shap_summary_plot_on_data_breast_cancer_dataset.png", + } + assert result.artifacts.keys() == { + "confusion_matrix", + "shap_beeswarm_plot", + "shap_summary_plot", + "shap_feature_importance_plot", + } + + +def test_infer_model_type_by_labels(): + assert _infer_model_type_by_labels(["a", "b"]) == "classifier" + assert _infer_model_type_by_labels([1, 2.5]) == "regressor" + assert _infer_model_type_by_labels(list(range(2000))) == "regressor" + assert _infer_model_type_by_labels([1, 2, 3]) == "classifier" + + +def test_extract_raw_model_and_predict_fn(binary_logistic_regressor_model_uri): + model = mlflow.pyfunc.load_model(binary_logistic_regressor_model_uri) + ( + model_loader_module, + raw_model, + predict_fn, + predict_proba_fn, + ) = _extract_raw_model_and_predict_fn(model) + assert model_loader_module == "mlflow.sklearn" + assert isinstance(raw_model, LogisticRegression) + assert predict_fn == raw_model.predict + assert predict_proba_fn == raw_model.predict_proba + + +def test_get_regressor_metrics(): + y = [1.1, 2.1, -3.5] + y_pred = [1.5, 2.0, -3.0] + + metrics = _get_regressor_metrics(y, y_pred) + expected_metrics = { + "example_count": 3, + "mean_absolute_error": 0.3333333333333333, + "mean_squared_error": 0.13999999999999999, + "root_mean_squared_error": 0.3741657386773941, + "sum_on_label": -0.2999999999999998, + "mean_on_label": -0.09999999999999994, + "r2_score": 0.976457399103139, + "max_error": 0.5, + "mean_absolute_percentage_error": 0.18470418470418468, + } + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_get_binary_sum_up_label_pred_prob(): + y = [0, 1, 2] + y_pred = [0, 2, 1] + y_probs = [[0.7, 0.1, 0.2], [0.2, 0.3, 0.5], [0.25, 0.4, 0.35]] + + results = [] + for idx, label in enumerate([0, 1, 2]): + y_bin, y_pred_bin, y_prob_bin = _get_binary_sum_up_label_pred_prob( + idx, label, y, y_pred, y_probs + ) + results.append((list(y_bin), list(y_pred_bin), list(y_prob_bin))) + + print(results) + assert results == [ + ([1, 0, 0], [1, 0, 0], [0.7, 0.2, 0.25]), + ([0, 1, 0], [0, 0, 1], [0.1, 0.3, 0.4]), + ([0, 0, 1], [0, 1, 0], [0.2, 0.5, 0.35]), + ] + + +def test_get_classifier_per_class_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + + expected_metrics = { + "true_negatives": 3, + "false_positives": 2, + "false_negatives": 1, + "true_positives": 4, + "recall": 0.8, + "precision": 0.6666666666666666, + "f1_score": 0.7272727272727272, + } + metrics = _get_classifier_per_class_metrics(y, y_pred) + assert_dict_equal(metrics, expected_metrics, rtol=1e-3) + + +def test_multiclass_get_classifier_global_metrics(): + y = [0, 1, 2, 1, 2] + y_pred = [0, 2, 1, 1, 0] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + metrics = _get_classifier_global_metrics( + is_binomial=False, y=y, y_pred=y_pred, 
y_probs=y_probs, labels=[0, 1, 2] + ) + expected_metrics = { + "accuracy": 0.4, + "example_count": 5, + "f1_score_micro": 0.4, + "f1_score_macro": 0.38888888888888884, + "log_loss": 1.1658691395263094, + } + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_binary_get_classifier_global_metrics(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + y_probs = [[1 - p, p] for p in y_prob] + metrics = _get_classifier_global_metrics( + is_binomial=True, y=y, y_pred=y_pred, y_probs=y_probs, labels=[0, 1] + ) + expected_metrics = {"accuracy": 0.7, "example_count": 10, "log_loss": 0.6665822319387167} + assert_dict_equal(metrics, expected_metrics, 1e-3) + + +def test_gen_binary_precision_recall_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="pr" + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([1.0, 0.8, 0.8, 0.8, 0.6, 0.4, 0.4, 0.2, 0.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.55555556, 0.5, 0.57142857, 0.66666667, 0.6, 0.5, 0.66666667, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + assert np.isclose(results.auc, 0.7088888888888889, rtol=1e-3) + + +def test_gen_binary_roc_curve(): + y = [0, 1, 0, 1, 0, 1, 0, 1, 1, 0] + y_prob = [0.1, 0.9, 0.8, 0.2, 0.7, 0.8, 0.3, 0.6, 0.65, 0.4] + + results = _gen_classifier_curve( + is_binomial=True, y=y, y_probs=y_prob, labels=[0, 1], curve_type="roc" + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][1], + np.array([0.0, 0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0]), + rtol=1e-3, + ) + assert np.allclose( + results.plot_fn_args["data_series"][0][2], + np.array([0.0, 0.2, 0.4, 0.4, 0.8, 0.8, 1.0, 1.0]), + rtol=1e-3, + ) + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + assert np.isclose(results.auc, 0.66, rtol=1e-3) + + +def test_gen_multiclass_precision_recall_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="pr" + ) + expected_x_data_list = [[1.0, 0.0, 0.0], [1.0, 0.5, 0.0], [1.0, 0.5, 0.5, 0.5, 0.0, 0.0]] + expected_y_data_list = [ + [0.5, 0.0, 1.0], + [0.66666667, 0.5, 1.0], + [0.4, 0.25, 0.33333333, 0.5, 0.0, 1.0], + ] + line_labels = ["label=0,AP=0.500", "label=1,AP=0.722", "label=2,AP=0.414"] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == line_labels[index] + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args["xlabel"] == "recall" + assert results.plot_fn_args["ylabel"] == "precision" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + + expected_auc = [0.25, 0.6666666666666666, 0.2875] + assert np.allclose(results.auc, 
expected_auc, rtol=1e-3) + + +def test_gen_multiclass_roc_curve(): + y = [0, 1, 2, 1, 2] + y_probs = [ + [0.7, 0.1, 0.2], + [0.2, 0.3, 0.5], + [0.25, 0.4, 0.35], + [0.3, 0.4, 0.3], + [0.8, 0.1, 0.1], + ] + + results = _gen_classifier_curve( + is_binomial=False, y=y, y_probs=y_probs, labels=[0, 1, 2], curve_type="roc" + ) + print(results) + + expected_x_data_list = [ + [0.0, 0.25, 0.25, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0], + [0.0, 0.33333333, 0.33333333, 1.0, 1.0], + ] + expected_y_data_list = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5, 1.0]] + line_labels = ["label=0,AUC=0.750", "label=1,AUC=0.750", "label=2,AUC=0.333"] + for index, (name, x_data, y_data) in enumerate(results.plot_fn_args["data_series"]): + assert name == line_labels[index] + assert np.allclose(x_data, expected_x_data_list[index], rtol=1e-3) + assert np.allclose(y_data, expected_y_data_list[index], rtol=1e-3) + + assert results.plot_fn_args["xlabel"] == "False Positive Rate" + assert results.plot_fn_args["ylabel"] == "True Positive Rate" + assert results.plot_fn_args["line_kwargs"] == {"drawstyle": "steps-post", "linewidth": 1} + + expected_auc = [0.75, 0.75, 0.3333] + assert np.allclose(results.auc, expected_auc, rtol=1e-3) diff --git a/tests/models/test_evaluation.py b/tests/models/test_evaluation.py index 6892e6d8227b7..0555ca82a9a47 100644 --- a/tests/models/test_evaluation.py +++ b/tests/models/test_evaluation.py @@ -1,4 +1,5 @@ import mlflow +from collections import namedtuple from mlflow.models.evaluation import ( evaluate, @@ -8,6 +9,9 @@ EvaluationArtifact, EvaluationMetrics, ) +from mlflow.models.evaluation.base import ( + _normalize_evaluators_and_evaluator_config_args as _normalize_config, +) import hashlib from mlflow.models.evaluation.base import _start_run_or_reuse_active_run import sklearn @@ -21,6 +25,7 @@ from mlflow.utils.file_utils import TempDir from mlflow_test_plugin.dummy_evaluator import Array2DEvaluationArtifact from mlflow.models.evaluation.evaluator_registry import _model_evaluation_registry +from mlflow.models.evaluation.base import _logger as _base_logger, _gen_md5_for_arraylike_obj from sklearn.metrics import ( accuracy_score, @@ -30,19 +35,39 @@ ) from pyspark.sql import SparkSession +from pyspark.ml.linalg import Vectors +from pyspark.ml.regression import LinearRegression as SparkLinearRegression from mlflow.tracking.artifact_utils import get_artifact_uri import json +import uuid def get_iris(): iris = sklearn.datasets.load_iris() - return iris.data[:, :2], iris.target + return iris.data, iris.target def get_diabetes_dataset(): data = sklearn.datasets.load_diabetes() - return data.data[:, :2], data.target + return data.data, data.target + + +def get_diabetes_spark_dataset(): + data = sklearn.datasets.load_diabetes() + spark = SparkSession.builder.master("local[*]").getOrCreate() + rows = [ + (Vectors.dense(features), float(label)) for features, label in zip(data.data, data.target) + ] + return spark.createDataFrame(spark.sparkContext.parallelize(rows, 1), ["features", "label"]) + + +def get_breast_cancer_dataset(): + data = sklearn.datasets.load_breast_cancer() + return data.data, data.target + + +RunData = namedtuple("RunData", ["params", "metrics", "tags", "artifacts"]) def get_run_data(run_id): @@ -50,7 +75,7 @@ def get_run_data(run_id): data = client.get_run(run_id).data tags = {k: v for k, v in data.tags.items()} artifacts = [f.path for f in client.list_artifacts(run_id)] - return data.params, data.metrics, tags, artifacts + return 
RunData(params=data.params, metrics=data.metrics, tags=tags, artifacts=artifacts) def get_raw_tag(run_id, tag_name): @@ -71,51 +96,118 @@ def spark_session(): @pytest.fixture(scope="module") -def regressor_model_uri(): +def iris_dataset(): + X, y = get_iris() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") + + +@pytest.fixture(scope="module") +def diabetes_dataset(): + X, y = get_diabetes_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="diabetes_dataset") + + +@pytest.fixture(scope="module") +def diabetes_spark_dataset(): + spark_df = get_diabetes_spark_dataset().sample(fraction=0.3, seed=1) + return EvaluationDataset(data=spark_df, labels="label", name="diabetes_spark_dataset") + + +@pytest.fixture(scope="module") +def breast_cancer_dataset(): + X, y = get_breast_cancer_dataset() + eval_X, eval_y = X[0::3], y[0::3] + return EvaluationDataset(data=eval_X, labels=eval_y, name="breast_cancer_dataset") + + +@pytest.fixture +def linear_regressor_model_uri(): X, y = get_diabetes_dataset() reg = sklearn.linear_model.LinearRegression() reg.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(reg, "reg_model") - regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") + linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "reg_model") - return regressor_model_uri + return linear_regressor_model_uri -@pytest.fixture(scope="module") -def classifier_model_uri(): +@pytest.fixture +def spark_linear_regressor_model_uri(): + spark_df = get_diabetes_spark_dataset() + reg = SparkLinearRegression() + spark_reg_model = reg.fit(spark_df) + + with mlflow.start_run() as run: + mlflow.spark.log_model(spark_reg_model, "spark_reg_model") + spark_linear_regressor_model_uri = get_artifact_uri(run.info.run_id, "spark_reg_model") + + return spark_linear_regressor_model_uri + + +@pytest.fixture +def multiclass_logistic_regressor_model_uri(): X, y = get_iris() - clf = sklearn.linear_model.LogisticRegression() + clf = sklearn.linear_model.LogisticRegression(max_iter=2) clf.fit(X, y) with mlflow.start_run() as run: mlflow.sklearn.log_model(clf, "clf_model") - classifier_model_uri = get_artifact_uri(run.info.run_id, "clf_model") + multiclass_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, "clf_model") - return classifier_model_uri + return multiclass_logistic_regressor_model_uri -@pytest.fixture(scope="module") -def iris_dataset(): - X, y = get_iris() - eval_X, eval_y = X[0::3], y[0::3] - return EvaluationDataset(data=eval_X, labels=eval_y, name="iris_dataset") +@pytest.fixture +def binary_logistic_regressor_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.linear_model.LogisticRegression() + clf.fit(X, y) + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "bin_clf_model") + binary_logistic_regressor_model_uri = get_artifact_uri(run.info.run_id, "bin_clf_model") -@pytest.fixture(scope="module") + return binary_logistic_regressor_model_uri + + +@pytest.fixture +def svm_model_uri(): + X, y = get_breast_cancer_dataset() + clf = sklearn.svm.LinearSVC() + clf.fit(X, y) + + with mlflow.start_run() as run: + mlflow.sklearn.log_model(clf, "svm_model") + svm_model_uri = get_artifact_uri(run.info.run_id, "svm_model") + + return svm_model_uri + + +@pytest.fixture def iris_pandas_df_dataset(): X, y = get_iris() eval_X, eval_y = X[0::3], y[0::3] - data = pd.DataFrame({"f1": eval_X[:, 0], "f2": eval_X[:, 1], "y": eval_y}) + data = 
pd.DataFrame( + { + "f1": eval_X[:, 0], + "f2": eval_X[:, 1], + "f3": eval_X[:, 2], + "f4": eval_X[:, 3], + "y": eval_y, + } + ) labels = "y" return EvaluationDataset(data=data, labels=labels, name="iris_pandas_df_dataset") -def test_classifier_evaluate(classifier_model_uri, iris_dataset): - y_true = iris_dataset.labels - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) - y_pred = classifier_model.predict(iris_dataset.data) +def test_classifier_evaluate(multiclass_logistic_regressor_model_uri, iris_dataset): + y_true = iris_dataset.labels_data + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) + y_pred = classifier_model.predict(iris_dataset.features_data) expected_accuracy_score = accuracy_score(y_true, y_pred) expected_metrics = { "accuracy_score": expected_accuracy_score, @@ -128,9 +220,9 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -182,10 +274,10 @@ def test_classifier_evaluate(classifier_model_uri, iris_dataset): ) -def test_regressor_evaluate(regressor_model_uri, iris_dataset): - y_true = iris_dataset.labels - regressor_model = mlflow.pyfunc.load_model(regressor_model_uri) - y_pred = regressor_model.predict(iris_dataset.data) +def test_regressor_evaluate(linear_regressor_model_uri, diabetes_dataset): + y_true = diabetes_dataset.labels_data + regressor_model = mlflow.pyfunc.load_model(linear_regressor_model_uri) + y_pred = regressor_model.predict(diabetes_dataset.features_data) expected_mae = mean_absolute_error(y_true, y_pred) expected_mse = mean_squared_error(y_true, y_pred) expected_metrics = { @@ -193,16 +285,16 @@ def test_regressor_evaluate(regressor_model_uri, iris_dataset): "mean_squared_error": expected_mse, } expected_saved_metrics = { - "mean_absolute_error_on_iris_dataset": expected_mae, - "mean_squared_error_on_iris_dataset": expected_mse, + "mean_absolute_error_on_diabetes_dataset": expected_mae, + "mean_squared_error_on_diabetes_dataset": expected_mse, } - for model in [regressor_model, regressor_model_uri]: + for model in [regressor_model, linear_regressor_model_uri]: with mlflow.start_run() as run: eval_result = evaluate( - model, - "regressor", - iris_dataset, + model=model, + model_type="regressor", + dataset=diabetes_dataset, run_id=None, evaluators="dummy_evaluator", ) @@ -222,7 +314,7 @@ def test_dataset_name(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): md5_gen = hashlib.md5() - EvaluationDataset._gen_md5_for_arraylike_obj(md5_gen, data) + _gen_md5_for_arraylike_obj(md5_gen, data) return md5_gen.hexdigest() list0 = list(range(20)) @@ -236,73 +328,111 @@ def get_md5(data): assert get_md5(list3) == get_md5(list4) -def test_dataset_hash(iris_dataset, iris_pandas_df_dataset): - assert iris_dataset.hash == "c7417e63a9ce038a32f37ecd7fb829f6" - assert iris_pandas_df_dataset.hash == "d06cfb6352dba29afe514d9be87021aa" +def test_dataset_hash(iris_dataset, iris_pandas_df_dataset, diabetes_spark_dataset): + assert iris_dataset.hash == "99329a790dc483e7382c0d1d27aac3f3" + assert iris_pandas_df_dataset.hash == "799d4f50e2e353127f94a0e5300add06" + assert diabetes_spark_dataset.hash == "e646b03e976240bd0c79c6bcc1ae0bda" + + +def test_datasset_with_pandas_dataframe(): + data = pd.DataFrame({"f1": [1, 2], "f2": [3, 4], "label": [0, 1]}) + eval_dataset = 
EvaluationDataset(data=data, labels="label") + + assert list(eval_dataset.features_data.columns) == ["f1", "f2"] + assert np.array_equal(eval_dataset.features_data.f1.to_numpy(), [1, 2]) + assert np.array_equal(eval_dataset.features_data.f2.to_numpy(), [3, 4]) + assert np.array_equal(eval_dataset.labels_data, [0, 1]) -def test_datasset_extract_features_label(iris_dataset, iris_pandas_df_dataset): - X1, y1 = iris_dataset._extract_features_and_labels() - assert np.array_equal(X1, iris_dataset.data) - assert np.array_equal(y1, iris_dataset.labels) +def test_datasset_with_array_data(): + features = [[1, 2], [3, 4]] + labels = [0, 1] - X2, y2 = iris_pandas_df_dataset._extract_features_and_labels() - assert list(X2.columns) == ["f1", "f2"] - assert np.array_equal(X2["f1"], X1[:, 0]) - assert np.array_equal(X2["f2"], X1[:, 1]) - assert np.array_equal(y2, y1) + for input_data in [features, np.array(features)]: + eval_dataset1 = EvaluationDataset(data=input_data, labels=labels) + assert np.array_equal(eval_dataset1.features_data, features) + assert np.array_equal(eval_dataset1.labels_data, labels) + assert list(eval_dataset1.feature_names) == ["feature_1", "feature_2"] + + with pytest.raises(ValueError, match="all element must has the same length"): + EvaluationDataset(data=[[1, 2], [3, 4, 5]], labels=labels) + + +def test_autogen_feature_names(): + labels = [0] + eval_dataset2 = EvaluationDataset(data=[list(range(9))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1}" for i in range(9)] + + eval_dataset2 = EvaluationDataset(data=[list(range(10))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(10)] + + eval_dataset2 = EvaluationDataset(data=[list(range(99))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:02d}" for i in range(99)] + + eval_dataset2 = EvaluationDataset(data=[list(range(100))], labels=labels) + assert eval_dataset2.feature_names == [f"feature_{i + 1:03d}" for i in range(100)] + + with pytest.raises( + ValueError, match="features example rows must be the same length with labels array" + ): + EvaluationDataset(data=[[1, 2], [3, 4]], labels=[1, 2, 3]) def test_spark_df_dataset(spark_session): spark_df = spark_session.createDataFrame([(1.0, 2.0, 3.0)] * 10, ["f1", "f2", "y"]) with mock.patch.object(EvaluationDataset, "SPARK_DATAFRAME_LIMIT", 5): dataset = EvaluationDataset(spark_df, "y") - assert list(dataset.data.columns) == ["f1", "f2", "y"] - assert list(dataset.data["f1"]) == [1.0] * 5 - assert list(dataset.data["f2"]) == [2.0] * 5 - assert list(dataset.data["y"]) == [3.0] * 5 + assert list(dataset.features_data.columns) == ["f1", "f2"] + assert list(dataset.features_data["f1"]) == [1.0] * 5 + assert list(dataset.features_data["f2"]) == [2.0] * 5 + assert list(dataset.labels_data) == [3.0] * 5 def test_log_dataset_tag(iris_dataset, iris_pandas_df_dataset): + model_uuid = uuid.uuid4().hex with mlflow.start_run() as run: client = mlflow.tracking.MlflowClient() - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) - assert json.loads(tags["mlflow.datasets"]) == [iris_dataset._metadata] + + logged_meta1 = {**iris_dataset._metadata, "model": model_uuid} + logged_meta2 = {**iris_pandas_df_dataset._metadata, "model": model_uuid} + + assert json.loads(tags["mlflow.datasets"]) == [logged_meta1] raw_tag = get_raw_tag(run.info.run_id, "mlflow.datasets") assert " " not in 
raw_tag # assert the tag string remove all whitespace chars. # Test appending dataset tag - iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id) + iris_pandas_df_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] # Test log repetitive dataset - iris_dataset._log_dataset_tag(client, run.info.run_id) + iris_dataset._log_dataset_tag(client, run.info.run_id, model_uuid=model_uuid) _, _, tags, _ = get_run_data(run.info.run_id) assert json.loads(tags["mlflow.datasets"]) == [ - iris_dataset._metadata, - iris_pandas_df_dataset._metadata, + logged_meta1, + logged_meta2, ] class FakeEvauator1(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() class FakeEvauator2(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): raise RuntimeError() - def evaluate(self, model, model_type, dataset, run_id, evaluator_config, **kwargs): + def evaluate(self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs): raise RuntimeError() @@ -322,7 +452,7 @@ def _load_content_from_file(self, local_artifact_path): raise RuntimeError() -def test_evaluator_interface(classifier_model_uri, iris_dataset): +def test_evaluator_interface(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", {"test_evaluator1": FakeEvauator1} ): @@ -337,27 +467,33 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: with mlflow.start_run(): - evaluate( - classifier_model_uri, - "classifier", - iris_dataset, - run_id=None, - evaluators="test_evaluator1", - evaluator_config=evaluator1_config, + with pytest.raises( + ValueError, + match="The model could not be evaluated by any of the registered evaluators", + ): + evaluate( + model=multiclass_logistic_regressor_model_uri, + model_type="classifier", + dataset=iris_dataset, + run_id=None, + evaluators="test_evaluator1", + evaluator_config=evaluator1_config, + ) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config ) - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) mock_evaluate.assert_not_called() with mock.patch.object( FakeEvauator1, "can_evaluate", return_value=True ) as mock_can_evaluate, mock.patch.object( FakeEvauator1, "evaluate", return_value=evaluator1_return_value ) as mock_evaluate: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval1_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators="test_evaluator1", evaluator_config=evaluator1_config, @@ -365,13 +501,19 @@ def test_evaluator_interface(classifier_model_uri, iris_dataset): assert 
eval1_result.metrics == evaluator1_return_value.metrics assert eval1_result.artifacts == evaluator1_return_value.artifacts - mock_can_evaluate.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate.assert_called_once_with( - classifier_model, "classifier", iris_dataset, run.info.run_id, evaluator1_config + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, ) -def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): +def test_evaluate_with_multi_evaluators(multiclass_logistic_regressor_model_uri, iris_dataset): with mock.patch.object( _model_evaluation_registry, "_registry", @@ -399,12 +541,12 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): ) as mock_can_evaluate2, mock.patch.object( FakeEvauator2, "evaluate", return_value=evaluator2_return_value ) as mock_evaluate2: - classifier_model = mlflow.pyfunc.load_model(classifier_model_uri) + classifier_model = mlflow.pyfunc.load_model(multiclass_logistic_regressor_model_uri) with mlflow.start_run() as run: eval_result = evaluate( - classifier_model, - "classifier", - iris_dataset, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, run_id=None, evaluators=evaluators, evaluator_config={ @@ -420,21 +562,26 @@ def test_evaluate_with_multi_evaluators(classifier_model_uri, iris_dataset): **evaluator1_return_value.artifacts, **evaluator2_return_value.artifacts, } - mock_can_evaluate1.assert_called_once_with("classifier", evaluator1_config) + mock_can_evaluate1.assert_called_once_with( + model_type="classifier", evaluator_config=evaluator1_config + ) mock_evaluate1.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator1_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator1_config, + ) + mock_can_evaluate2.assert_called_once_with( + model_type="classifier", + evaluator_config=evaluator2_config, ) - mock_can_evaluate2.assert_called_once_with("classifier", evaluator2_config) mock_evaluate2.assert_called_once_with( - classifier_model, - "classifier", - iris_dataset, - run.info.run_id, - evaluator2_config, + model=classifier_model, + model_type="classifier", + dataset=iris_dataset, + run_id=run.info.run_id, + evaluator_config=evaluator2_config, ) @@ -462,3 +609,50 @@ def test_start_run_or_reuse_active_run(): with pytest.raises(ValueError, match="An active run exists"): with _start_run_or_reuse_active_run(run_id=previous_run_id): pass + + +def test_normalize_evaluators_and_evaluator_config_args(): + from mlflow.models.evaluation.default_evaluator import DefaultEvaluator + + with mock.patch.object( + _model_evaluation_registry, + "_registry", + {"default": DefaultEvaluator}, + ): + assert _normalize_config(None, None) == (["default"], {}) + assert _normalize_config(None, {"a": 3}) == (["default"], {"default": {"a": 3}}) + assert _normalize_config(None, {"default": {"a": 3}}) == ( + ["default"], + {"default": {"a": 3}}, + ) + + assert _normalize_config(None, None) == (["default", "dummy_evaluator"], {}) + with pytest.raises( + ValueError, match="`evaluator_config` argument must be a dictionary mapping each evaluator" + ): + assert _normalize_config(None, {"a": 3}) == (["default", "dummy_evaluator"], {}) + + assert _normalize_config(None, 
{"default": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"default": {"a": 3}}, + ) + + with mock.patch.object(_base_logger, "warning") as patched_warning_fn: + _normalize_config(None, None) + patched_warning_fn.assert_called_once() + assert "Multiple registered evaluators are found" in patched_warning_fn.call_args[0][0] + + assert _normalize_config("dummy_evaluator", {"a": 3}) == ( + ["dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) + + assert _normalize_config(["default", "dummy_evaluator"], {"dummy_evaluator": {"a": 3}}) == ( + ["default", "dummy_evaluator"], + {"dummy_evaluator": {"a": 3}}, + ) + + with pytest.raises( + ValueError, match="evaluator_config must be a dict contains mapping from evaluator name to" + ): + _normalize_config(["default", "dummy_evaluator"], {"abc": {"a": 3}}) diff --git a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py index bb3c8a8875849..c88bd21d09321 100644 --- a/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py +++ b/tests/resources/mlflow-test-plugin/mlflow_test_plugin/dummy_evaluator.py @@ -23,7 +23,8 @@ def _load_content_from_file(self, local_artifact_path): class DummyEvaluator(ModelEvaluator): - def can_evaluate(self, model_type, evaluator_config=None, **kwargs): + # pylint: disable=unused-argument + def can_evaluate(self, *, model_type, evaluator_config, **kwargs): return model_type in ["classifier", "regressor"] def _log_metrics(self, run_id, metrics, dataset_name): @@ -40,11 +41,13 @@ def _log_metrics(self, run_id, metrics, dataset_name): ], ) + # pylint: disable=unused-argument def evaluate( - self, model, model_type, dataset, run_id, evaluator_config=None, **kwargs + self, *, model, model_type, dataset, run_id, evaluator_config, **kwargs ) -> EvaluationResult: client = mlflow.tracking.MlflowClient() - X, y = dataset._extract_features_and_labels() + X = dataset.features_data + y = dataset.labels_data y_pred = model.predict(X) if model_type == "classifier": accuracy_score = sk_metrics.accuracy_score(y, y_pred)