From e0f0c44877e660582fa00802fd68d6dfd313942a Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Sun, 2 Jan 2022 11:57:19 -0800
Subject: [PATCH 1/4] Add method to load model input example

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py     | 16 ++++++++++++++++
 tests/models/test_model.py |  6 ++++++
 2 files changed, 22 insertions(+)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 806357c10ab4a..1e66ce7201844 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -68,6 +68,22 @@ def get_input_schema(self):
     def get_output_schema(self):
         return self.signature.outputs if self.signature is not None else None

+    def load_input_example(self, path: str):
+        """
+        Load the input example saved along a model. Returns None if there is no example metadata (i.e. the
+        model was saved without example). Raises IO Exception if there is model metadata but the example
+        file is missing.
+
+        :param path: Path to the model directory.
+        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix, pandas DataFrame, dict)
+        :        or None if the model has no example.
+        """
+        from mlflow.models.utils import (
+            _read_example,
+        )  # Just-in-time import to avoid loading NumPy/pandas/DataFrame when not needed
+
+        return _read_example(self, path)
+
     def add_flavor(self, name, **params):
         """Add an entry for how to serve the model in a given format."""
         self.flavors[name] = params
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 131a70004b54a..79cedf246cb06 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -104,6 +104,9 @@ def test_model_log():
         assert x.to_dict(orient="records")[0] == input_example
         assert not hasattr(loaded_model, "databricks_runtime")

+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example.to_dict(orient="records")[0] == input_example
+

 def test_model_log_with_databricks_runtime():
     dbr = "8.3.x-snapshot-gpu-ml-scala2.12"
@@ -165,6 +168,9 @@ def test_model_log_with_input_example_succeeds():
         input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
         assert x.equals(input_example)

+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example.equals(input_example)
+

 def _is_valid_uuid(val):
     import uuid

From f17c180bf9f96f86068f4a76f5de202c354514a5 Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Sun, 2 Jan 2022 12:29:01 -0800
Subject: [PATCH 2/4] Try fixing doc and lint failures

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 1e66ce7201844..805ae1e605dc1 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -70,13 +70,14 @@ def get_output_schema(self):

     def load_input_example(self, path: str):
         """
-        Load the input example saved along a model. Returns None if there is no example metadata (i.e. the
-        model was saved without example). Raises IO Exception if there is model metadata but the example
-        file is missing.
+        Load the input example saved along a model. Returns None if there is no example metadata
+        (i.e. the model was saved without example). Raises IO Exception if there is model metadata
+        but the example file is missing.

         :param path: Path to the model directory.
-        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix, pandas DataFrame, dict)
-        :        or None if the model has no example.
+
+        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix,
+                 pandas DataFrame, dict) or None if the model has no example.
         """
         from mlflow.models.utils import (
             _read_example,

From 3f5913d5e6fc28828c8488250390b2c1a4fe34b0 Mon Sep 17 00:00:00 2001
From: Liang Zhang
Date: Thu, 30 Dec 2021 23:06:12 -0800
Subject: [PATCH 3/4] Remove params 'evals_result' and 'early_stopping_rounds'
 in lightgbm versions > 3.3.1 (#5206)

The parameter evals_result was removed on the master branch of lightgbm
(microsoft/LightGBM#4882), and the parameter early_stopping_rounds was
removed in microsoft/LightGBM#4908. We should remove these params from our
tests as well.

This PR also fixes the SageMaker test failure.

Signed-off-by: Liang Zhang
---
 mlflow/lightgbm.py                      |  16 ++-
 tests/lightgbm/test_lightgbm_autolog.py | 175 +++++++++++++++++-------
 tests/sagemaker/mock/__init__.py        |   8 +-
 3 files changed, 134 insertions(+), 65 deletions(-)

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py
index b8ab5df94d88d..5ced88a004ace 100644
--- a/mlflow/lightgbm.py
+++ b/mlflow/lightgbm.py
@@ -25,6 +25,7 @@
 import logging
 import functools
 from copy import deepcopy
+from packaging.version import Version

 import mlflow
 from mlflow import pyfunc
@@ -372,7 +373,8 @@ def autolog(

     - parameters specified in `lightgbm.train`_.
     - metrics on each iteration (if ``valid_sets`` specified).
-    - metrics at the best iteration (if ``early_stopping_rounds`` specified).
+    - metrics at the best iteration (if ``early_stopping_rounds`` specified or ``early_stopping``
+      callback is set).
     - feature importance (both "split" and "gain") as JSON files and plots.
     - trained model, including:
         - an example of valid input.
@@ -496,10 +498,13 @@ def log_feature_importance_plot(features, importance, importance_type):
             "fobj",
             "feval",
             "init_model",
-            "evals_result",
             "learning_rates",
             "callbacks",
         ]
+        if Version(lightgbm.__version__) <= Version("3.3.1"):
+            # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+            # https://github.com/microsoft/LightGBM/pull/4882
+            unlogged_params.append("evals_result")
         params_to_log_for_fn = get_mlflow_run_params_for_fn_args(
             original, args, kwargs, unlogged_params
         )
@@ -531,12 +536,9 @@ def log_feature_importance_plot(features, importance, importance_type):
         # training model
         model = original(*args, **kwargs)

-        # If early_stopping_rounds is present, logging metrics at the best iteration
+        # If early stopping is activated, logging metrics at the best iteration
         # as extra metrics with the max step + 1.
-        early_stopping_index = all_arg_names.index("early_stopping_rounds")
-        early_stopping = (
-            num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs
-        )
+        early_stopping = model.best_iteration > 0
         if early_stopping:
             extra_step = len(eval_results)
             autologging_client.log_metrics(
diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py
index 1347786bfc455..013df418e6605 100644
--- a/tests/lightgbm/test_lightgbm_autolog.py
+++ b/tests/lightgbm/test_lightgbm_autolog.py
@@ -102,10 +102,13 @@ def test_lgb_autolog_logs_default_params(bst_params, train_set):
         "fobj",
         "feval",
         "init_model",
-        "evals_result",
         "learning_rates",
         "callbacks",
     ]
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4882
+        unlogged_params.append("evals_result")

     for param in unlogged_params:
         assert param not in params
@@ -116,12 +119,14 @@ def test_lgb_autolog_logs_specified_params(bst_params, train_set):
     mlflow.lightgbm.autolog()
     expected_params = {
         "num_boost_round": 10,
-        "early_stopping_rounds": 5,
     }
     if Version(lgb.__version__) <= Version("3.3.1"):
         # The parameter `verbose_eval` in `lightgbm.train` is removed in this PR:
         # https://github.com/microsoft/LightGBM/pull/4878
+        # The parameter `early_stopping_rounds` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4908
         expected_params["verbose_eval"] = False
+        expected_params["early_stopping_rounds"] = 5
     lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params)
     run = get_latest_run()
     params = run.data.params
@@ -140,10 +145,13 @@ def test_lgb_autolog_logs_specified_params(bst_params, train_set):
         "fobj",
         "feval",
         "init_model",
-        "evals_result",
         "learning_rates",
         "callbacks",
     ]
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4882
+        unlogged_params.append("evals_result")

     for param in unlogged_params:
         assert param not in params
@@ -153,14 +161,24 @@ def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
     mlflow.lightgbm.autolog()
     evals_result = {}
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=[train_set],
-        valid_names=["train"],
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=[train_set],
+            valid_names=["train"],
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=[train_set],
+            valid_names=["train"],
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()

@@ -179,14 +197,24 @@ def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_s
     # To avoid that, create a new Dataset object.
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -206,14 +234,24 @@ def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
     params.update(bst_params)
     valid_sets = [train_set]
     valid_names = ["train"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -233,14 +271,24 @@ def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_par
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -279,14 +327,24 @@ def record_metrics_side_effect(self, metrics, step=None):
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     original_metrics = run.data.metrics

@@ -307,15 +365,28 @@ def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    model = lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        early_stopping_rounds=5,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        model = lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            early_stopping_rounds=5,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        model = lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[
+                lgb.record_evaluation(evals_result),
+                lgb.early_stopping(5),
+            ],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
diff --git a/tests/sagemaker/mock/__init__.py b/tests/sagemaker/mock/__init__.py
index c04888246a8cf..e15c6e6e2ede4 100644
--- a/tests/sagemaker/mock/__init__.py
+++ b/tests/sagemaker/mock/__init__.py
@@ -5,7 +5,6 @@
 from moto.core import BaseBackend, BaseModel
 from moto.core.responses import BaseResponse
-from moto.ec2 import ec2_backends
 from moto.iam.models import ACCOUNT_ID
 from moto.core.models import base_decorator


@@ -1076,10 +1075,7 @@ def response_object(self):
         return response


-# Create a SageMaker backend for each EC2 region
-sagemaker_backends = {}
-for region, ec2_backend in ec2_backends.items():
-    new_backend = SageMakerBackend()
-    sagemaker_backends[region] = new_backend
+# Create a SageMaker backend for EC2 region: "us-west-2"
+sagemaker_backends = {"us-west-2": SageMakerBackend()}

 mock_sagemaker = base_decorator(sagemaker_backends)

From 549254cbeb0a1e27b1ccf9f9f057e1f0f766aad8 Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Mon, 3 Jan 2022 16:36:02 -0800
Subject: [PATCH 4/4] Address PR feedback

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py     | 11 +++---
 mlflow/models/utils.py     |  4 +--
 tests/models/test_model.py | 74 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 805ae1e605dc1..1e75c95dd19f9 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -71,17 +71,18 @@ def get_output_schema(self):
     def load_input_example(self, path: str):
         """
         Load the input example saved along a model. Returns None if there is no example metadata
-        (i.e. the model was saved without example). Raises IO Exception if there is model metadata
-        but the example file is missing.
+        (i.e. the model was saved without example). Raises FileNotFoundError if there is model
+        metadata but the example file is missing.

         :param path: Path to the model directory.

         :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix,
                  pandas DataFrame, dict) or None if the model has no example.
         """
-        from mlflow.models.utils import (
-            _read_example,
-        )  # Just-in-time import to avoid loading NumPy/pandas/DataFrame when not needed
+
+        # Just-in-time import to only load example-parsing libraries (e.g. numpy, pandas, etc.) if
+        # example is requested.
+        from mlflow.models.utils import _read_example

         return _read_example(self, path)

diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py
index 137a1eac0f0ab..206d0de6a128b 100644
--- a/mlflow/models/utils.py
+++ b/mlflow/models/utils.py
@@ -192,8 +192,8 @@ def _save_example(mlflow_model: Model, input_example: ModelInputExample, path: s
 def _read_example(mlflow_model: Model, path: str):
     """
     Read example from a model directory. Returns None if there is no example metadata (i.e. the
-    model was saved without example). Raises IO Exception if there is model metadata but the example
-    file is missing.
+    model was saved without example). Raises FileNotFoundError if there is model metadata but the
+    example file is missing.

     :param mlflow_model: Model metadata.
     :param path: Path to the model directory.
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 79cedf246cb06..67be4d7fb21c9 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -1,4 +1,5 @@
 import os
+import pytest
 from datetime import date

 import mlflow
@@ -9,11 +10,12 @@
 from mlflow.models import Model
 from mlflow.models.signature import ModelSignature
 from mlflow.models.utils import _save_example
-from mlflow.types.schema import Schema, ColSpec
+from mlflow.types.schema import Schema, ColSpec, TensorSpec
 from mlflow.utils.file_utils import TempDir
 from mlflow.utils.proto_json_utils import _dataframe_from_json

 from unittest import mock
+from scipy.sparse import csc_matrix


 def test_model_save_load():
@@ -104,6 +107,7 @@ def test_model_log():
         assert not hasattr(loaded_model, "databricks_runtime")

         loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
         assert loaded_example.to_dict(orient="records")[0] == input_example


@@ -169,9 +172,78 @@ def test_model_log_with_input_example_succeeds():
         input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
         assert x.equals(input_example)

         loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
         assert loaded_example.equals(input_example)


+def test_model_load_input_example_numpy():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, np.ndarray)
+        assert np.array_equal(input_example, loaded_example)
+
+
+def test_model_load_input_example_scipy():
+    with TempDir(chdr=True) as tmp:
+        input_example = csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8))
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.data.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, csc_matrix)
+        assert np.array_equal(input_example.data, loaded_example.data)
+
+
+def test_model_load_input_example_failures():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is not None
+
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(os.path.join(local_path, "folder_which_does_not_exist"))
+
+        path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"])
+        os.remove(path)
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(local_path)
+
+
+def test_model_load_input_example_no_signature():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example=None)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is None
+
+
 def _is_valid_uuid(val):
     import uuid
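
For reviewers trying the change out, a minimal usage sketch of the `load_input_example` API in its final (PATCH 4/4) form. The sketch assumes a model was already logged with an input example and downloaded to a local directory; the directory path and variable names are illustrative, while `Model.load` and `load_input_example` are exactly the calls exercised in the tests above:

    import os
    from mlflow.models import Model

    local_path = "/tmp/downloaded-model"  # hypothetical local model directory
    model_meta = Model.load(os.path.join(local_path, "MLmodel"))

    try:
        example = model_meta.load_input_example(local_path)
    except FileNotFoundError:
        # The MLmodel metadata records an example, but the example file is missing.
        raise

    if example is None:
        # The model was saved without an input example.
        print("no input example")
    else:
        # One of: pandas DataFrame, NumPy ndarray, SciPy csc/csr matrix, or dict.
        print(type(example))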