Improve evaluation api #5256

Merged (14 commits) on Jan 14, 2022
6 changes: 4 additions & 2 deletions mlflow/models/__init__.py
@@ -24,14 +24,16 @@
from .model import Model
from .flavor_backend import FlavorBackend
from ..utils.environment import infer_pip_requirements
from .evaluation import evaluate, EvaluationDataset
from .evaluation import evaluate, EvaluationArtifact, EvaluationResult, list_evaluators

__all__ = [
"Model",
"FlavorBackend",
"infer_pip_requirements",
"evaluate",
"EvaluationDataset",
"EvaluationArtifact",
"EvaluationResult",
"list_evaluators",
]
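
A minimal sketch of importing the reworked public surface after this change, assuming the re-exports above resolve as shown (note that EvaluationDataset is no longer exported from mlflow.models):

from mlflow.models import (
    evaluate,
    EvaluationArtifact,
    EvaluationResult,
    list_evaluators,
)

# The evaluator registry should include at least the built-in "default" evaluator.
print(list_evaluators())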


5 changes: 1 addition & 4 deletions mlflow/models/evaluation/__init__.py
@@ -1,18 +1,15 @@
from mlflow.models.evaluation.base import (
ModelEvaluator,
EvaluationDataset,
EvaluationResult,
EvaluationMetrics,
EvaluationArtifact,
evaluate,
list_evaluators,
_get_last_failed_evaluator,
)

__all__ = [
"ModelEvaluator",
"EvaluationDataset",
"EvaluationResult",
"EvaluationMetrics",
"EvaluationArtifact",
"evaluate",
"list_evaluators",
239 changes: 134 additions & 105 deletions mlflow/models/evaluation/base.py

Large diffs are not rendered by default.
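
Since the base.py diff is not rendered, the shape of the reworked evaluate() call below is reconstructed from the test updates further down: the model URI and the raw evaluation data become positional arguments, and targets/dataset_name replace the removed EvaluationDataset object. This is a hedged sketch, not the definitive signature; the sklearn model, URIs, and dataset name here are illustrative assumptions.

import mlflow
import mlflow.sklearn
from mlflow.models import evaluate
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

X, y = load_diabetes(return_X_y=True)
model = LinearRegression().fit(X, y)

with mlflow.start_run() as run:
    mlflow.sklearn.log_model(model, artifact_path="model")
    model_uri = f"runs:/{run.info.run_id}/model"
    result = evaluate(
        model_uri,                    # model URI is now the first positional argument
        X,                            # raw evaluation data instead of an EvaluationDataset
        model_type="regressor",
        targets=y,                    # ground-truth labels passed directly
        dataset_name="diabetes-eval",
        evaluators="default",
    )
    print(result.metrics)             # EvaluationResult.metrics is now a plain dict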

20 changes: 14 additions & 6 deletions mlflow/models/evaluation/default_evaluator.py
@@ -1,7 +1,6 @@
import mlflow
from mlflow.models.evaluation.base import (
ModelEvaluator,
EvaluationMetrics,
EvaluationResult,
)
from mlflow.entities.metric import Metric
@@ -333,15 +332,24 @@ def _log_model_explainability(self):
)
return

if self.model_type == "classifier" and not all(
[isinstance(label, (numbers.Number, np.bool_)) for label in self.label_list]
):
if not (np.issubdtype(self.y.dtype, np.number) or self.y.dtype == np.bool_):
# Note: Python's built-in bool counts as a Number, but np.bool_ is not a subclass of np.number.
_logger.warning(
"Skip logging model explainability insights because it requires all label "
"values to be Number type."
"values to be number type or bool type."
)
return

feature_dtypes = list(self.X.dtypes) if isinstance(self.X, pd.DataFrame) else [self.X.dtype]
for feature_dtype in feature_dtypes:
if not np.issubdtype(feature_dtype, np.number):
_logger.warning(
"Skip logging model explainability insights because it requires all feature "
"values to be number type, and each feature column must only contain scaler "
"values."
)
return
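
For illustration only (not part of this diff), the two dtype gates above behave roughly as follows on toy inputs; the column names are made up:

import numpy as np
import pandas as pd

X = pd.DataFrame({"age": [30, 41], "city": ["NYC", "SF"]})
# "city" is non-numeric, so the feature check would skip explainability logging.
print([np.issubdtype(dt, np.number) for dt in X.dtypes])          # [True, False]

y = np.array([True, False])
# Bool labels pass the label check even though np.bool_ is not an np.number subtype.
print(np.issubdtype(y.dtype, np.number) or y.dtype == np.bool_)   # True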

try:
import shap
import matplotlib.pyplot as pyplot
@@ -652,7 +660,7 @@ def evaluate(

self.X = dataset.features_data
self.y = dataset.labels_data
self.metrics = EvaluationMetrics()
self.metrics = dict()
self.artifacts = {}

infered_model_type = _infer_model_type_by_labels(self.y)
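With EvaluationMetrics removed, the evaluator now accumulates metrics in a plain dict, so the metrics on the returned EvaluationResult can be treated as an ordinary dictionary. Continuing the regressor sketch shown under base.py above (the metric name here is illustrative, not guaranteed by this diff):

# `result` comes from the evaluate() sketch above; key names depend on the evaluator.
for name, value in result.metrics.items():
    print(f"{name}: {value}")
mse = result.metrics.get("mean_squared_error")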
30 changes: 20 additions & 10 deletions tests/models/test_default_evaluator.py
@@ -40,9 +40,11 @@ def assert_dict_equal(d1, d2, rtol):
def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset):
with mlflow.start_run() as run:
result = evaluate(
model=linear_regressor_model_uri,
linear_regressor_model_uri,
diabetes_dataset._constructor_args["data"],
model_type="regressor",
dataset=diabetes_dataset,
targets=diabetes_dataset._constructor_args["targets"],
dataset_name=diabetes_dataset.name,
evaluators="default",
)

@@ -81,9 +83,11 @@ def test_regressor_evaluation(linear_regressor_model_uri, diabetes_dataset):
def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, iris_dataset):
with mlflow.start_run() as run:
result = evaluate(
model=multiclass_logistic_regressor_model_uri,
multiclass_logistic_regressor_model_uri,
iris_dataset._constructor_args["data"],
model_type="classifier",
dataset=iris_dataset,
targets=iris_dataset._constructor_args["targets"],
dataset_name=iris_dataset.name,
evaluators="default",
)

@@ -132,9 +136,11 @@ def test_multi_classifier_evaluation(multiclass_logistic_regressor_model_uri, ir
def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_cancer_dataset):
with mlflow.start_run() as run:
result = evaluate(
model=binary_logistic_regressor_model_uri,
binary_logistic_regressor_model_uri,
breast_cancer_dataset._constructor_args["data"],
model_type="classifier",
dataset=breast_cancer_dataset,
targets=breast_cancer_dataset._constructor_args["targets"],
dataset_name=breast_cancer_dataset.name,
evaluators="default",
)

@@ -184,9 +190,11 @@ def test_bin_classifier_evaluation(binary_logistic_regressor_model_uri, breast_c
def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diabetes_spark_dataset):
with mlflow.start_run() as run:
result = evaluate(
model=spark_linear_regressor_model_uri,
spark_linear_regressor_model_uri,
diabetes_spark_dataset._constructor_args["data"],
model_type="regressor",
dataset=diabetes_spark_dataset,
targets=diabetes_spark_dataset._constructor_args["targets"],
dataset_name=diabetes_spark_dataset.name,
evaluators="default",
evaluator_config={"log_model_explainability": True},
)
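
The Spark regressor test above also shows that evaluator_config still rides along as a keyword argument of evaluator-specific options; "log_model_explainability" is the only key confirmed by this diff. A hedged sketch of turning it off (for example, to skip SHAP on a large dataset), with model_uri, eval_data, and labels as placeholders:

result = evaluate(
    model_uri,                 # placeholder model URI
    eval_data,                 # placeholder evaluation data
    model_type="regressor",
    targets=labels,            # placeholder label array
    dataset_name="my-eval-set",
    evaluators="default",
    evaluator_config={"log_model_explainability": False},
)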
@@ -222,9 +230,11 @@ def test_spark_regressor_model_evaluation(spark_linear_regressor_model_uri, diab
def test_svm_classifier_evaluation(svm_model_uri, breast_cancer_dataset):
with mlflow.start_run() as run:
result = evaluate(
model=svm_model_uri,
svm_model_uri,
breast_cancer_dataset._constructor_args["data"],
model_type="classifier",
dataset=breast_cancer_dataset,
targets=breast_cancer_dataset._constructor_args["targets"],
dataset_name=breast_cancer_dataset.name,
evaluators="default",
)
