From e0f0c44877e660582fa00802fd68d6dfd313942a Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Sun, 2 Jan 2022 11:57:19 -0800
Subject: [PATCH 1/4] Add method to load model input example

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py     | 16 ++++++++++++++++
 tests/models/test_model.py |  6 ++++++
 2 files changed, 22 insertions(+)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 806357c10ab4a..1e66ce7201844 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -68,6 +68,22 @@ def get_input_schema(self):
     def get_output_schema(self):
         return self.signature.outputs if self.signature is not None else None

+    def load_input_example(self, path: str):
+        """
+        Load the input example saved along a model. Returns None if there is no example metadata (i.e. the
+        model was saved without example). Raises IO Exception if there is model metadata but the example
+        file is missing.
+
+        :param path: Path to the model directory.
+        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix, pandas DataFrame, dict)
+        :        or None if the model has no example.
+        """
+        from mlflow.models.utils import (
+            _read_example,
+        )  # Just-in-time import to avoid loading NumPy/pandas/DataFrame when not needed
+
+        return _read_example(self, path)
+
     def add_flavor(self, name, **params):
         """Add an entry for how to serve the model in a given format."""
         self.flavors[name] = params
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 131a70004b54a..79cedf246cb06 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -104,6 +104,9 @@ def test_model_log():
         assert x.to_dict(orient="records")[0] == input_example
         assert not hasattr(loaded_model, "databricks_runtime")

+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example.to_dict(orient="records")[0] == input_example
+

 def test_model_log_with_databricks_runtime():
     dbr = "8.3.x-snapshot-gpu-ml-scala2.12"
@@ -165,6 +168,9 @@ def test_model_log_with_input_example_succeeds():
         input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
         assert x.equals(input_example)

+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example.equals(input_example)
+

 def _is_valid_uuid(val):
     import uuid

From f17c180bf9f96f86068f4a76f5de202c354514a5 Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Sun, 2 Jan 2022 12:29:01 -0800
Subject: [PATCH 2/4] Try fixing doc and lint failures

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 1e66ce7201844..805ae1e605dc1 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -70,13 +70,14 @@ def get_output_schema(self):

     def load_input_example(self, path: str):
         """
-        Load the input example saved along a model. Returns None if there is no example metadata (i.e. the
-        model was saved without example). Raises IO Exception if there is model metadata but the example
-        file is missing.
+        Load the input example saved along a model. Returns None if there is no example metadata
+        (i.e. the model was saved without example). Raises IO Exception if there is model metadata
+        but the example file is missing.

         :param path: Path to the model directory.
-        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix, pandas DataFrame, dict)
-        :        or None if the model has no example.
+
+        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix,
+                 pandas DataFrame, dict) or None if the model has no example.
         """
         from mlflow.models.utils import (
             _read_example,

From 3f5913d5e6fc28828c8488250390b2c1a4fe34b0 Mon Sep 17 00:00:00 2001
From: Liang Zhang
Date: Thu, 30 Dec 2021 23:06:12 -0800
Subject: [PATCH 3/4] Remove params 'evals_result' and 'early_stopping_rounds'
 in lightgbm versions > 3.3.1 (#5206)

The parameter evals_result was removed on the master branch of lightgbm
(microsoft/LightGBM#4882), and the parameter early_stopping_rounds was
removed in microsoft/LightGBM#4908. We should remove these params from our
tests as well.

This PR also fixes the SageMaker test failure.

Signed-off-by: Liang Zhang
---
 mlflow/lightgbm.py                      |  16 ++-
 tests/lightgbm/test_lightgbm_autolog.py | 175 +++++++++++++++++-------
 tests/sagemaker/mock/__init__.py        |   8 +-
 3 files changed, 134 insertions(+), 65 deletions(-)

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py
index b8ab5df94d88d..5ced88a004ace 100644
--- a/mlflow/lightgbm.py
+++ b/mlflow/lightgbm.py
@@ -25,6 +25,7 @@
 import logging
 import functools
 from copy import deepcopy
+from packaging.version import Version

 import mlflow
 from mlflow import pyfunc
@@ -372,7 +373,8 @@ def autolog(

     - parameters specified in `lightgbm.train`_.
     - metrics on each iteration (if ``valid_sets`` specified).
-    - metrics at the best iteration (if ``early_stopping_rounds`` specified).
+    - metrics at the best iteration (if ``early_stopping_rounds`` specified or ``early_stopping``
+      callback is set).
     - feature importance (both "split" and "gain") as JSON files and plots.
     - trained model, including:
         - an example of valid input.
@@ -496,10 +498,13 @@ def log_feature_importance_plot(features, importance, importance_type):
             "fobj",
             "feval",
             "init_model",
-            "evals_result",
             "learning_rates",
             "callbacks",
         ]
+        if Version(lightgbm.__version__) <= Version("3.3.1"):
+            # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+            # https://github.com/microsoft/LightGBM/pull/4882
+            unlogged_params.append("evals_result")
         params_to_log_for_fn = get_mlflow_run_params_for_fn_args(
             original, args, kwargs, unlogged_params
         )
@@ -531,12 +536,9 @@ def log_feature_importance_plot(features, importance, importance_type):
         # training model
         model = original(*args, **kwargs)

-        # If early_stopping_rounds is present, logging metrics at the best iteration
+        # If early stopping is activated, logging metrics at the best iteration
         # as extra metrics with the max step + 1.
-        early_stopping_index = all_arg_names.index("early_stopping_rounds")
-        early_stopping = (
-            num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs
-        )
+        early_stopping = model.best_iteration > 0
         if early_stopping:
             extra_step = len(eval_results)
             autologging_client.log_metrics(
diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py
index 1347786bfc455..013df418e6605 100644
--- a/tests/lightgbm/test_lightgbm_autolog.py
+++ b/tests/lightgbm/test_lightgbm_autolog.py
@@ -102,10 +102,13 @@ def test_lgb_autolog_logs_default_params(bst_params, train_set):
         "fobj",
         "feval",
         "init_model",
-        "evals_result",
         "learning_rates",
         "callbacks",
     ]
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4882
+        unlogged_params.append("evals_result")

     for param in unlogged_params:
         assert param not in params
@@ -116,12 +119,14 @@ def test_lgb_autolog_logs_specified_params(bst_params, train_set):
     mlflow.lightgbm.autolog()
     expected_params = {
         "num_boost_round": 10,
-        "early_stopping_rounds": 5,
     }
     if Version(lgb.__version__) <= Version("3.3.1"):
         # The parameter `verbose_eval` in `lightgbm.train` is removed in this PR:
         # https://github.com/microsoft/LightGBM/pull/4878
+        # The parameter `early_stopping_rounds` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4908
         expected_params["verbose_eval"] = False
+        expected_params["early_stopping_rounds"] = 5
     lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params)
     run = get_latest_run()
     params = run.data.params
@@ -140,10 +145,13 @@ def test_lgb_autolog_logs_specified_params(bst_params, train_set):
         "fobj",
         "feval",
         "init_model",
-        "evals_result",
         "learning_rates",
         "callbacks",
     ]
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        # The parameter `evals_result` in `lightgbm.train` is removed in this PR:
+        # https://github.com/microsoft/LightGBM/pull/4882
+        unlogged_params.append("evals_result")

     for param in unlogged_params:
         assert param not in params
@@ -153,14 +161,24 @@ def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
     mlflow.lightgbm.autolog()
     evals_result = {}
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=[train_set],
-        valid_names=["train"],
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=[train_set],
+            valid_names=["train"],
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=[train_set],
+            valid_names=["train"],
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()

@@ -179,14 +197,24 @@ def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_s
     # To avoid that, create a new Dataset object.
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            bst_params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -206,14 +234,24 @@ def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
     params.update(bst_params)
     valid_sets = [train_set]
     valid_names = ["train"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -233,14 +271,24 @@ def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_par
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
@@ -279,14 +327,24 @@ def record_metrics_side_effect(self, metrics, step=None):
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[lgb.record_evaluation(evals_result)],
+        )
     run = get_latest_run()
     original_metrics = run.data.metrics

@@ -307,15 +365,28 @@ def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
     params.update(bst_params)
     valid_sets = [train_set, lgb.Dataset(train_set.data)]
     valid_names = ["train", "valid"]
-    model = lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        early_stopping_rounds=5,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
+    if Version(lgb.__version__) <= Version("3.3.1"):
+        model = lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            early_stopping_rounds=5,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+    else:
+        model = lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            callbacks=[
+                lgb.record_evaluation(evals_result),
+                lgb.early_stopping(5),
+            ],
+        )
     run = get_latest_run()
     data = run.data
     client = mlflow.tracking.MlflowClient()
diff --git a/tests/sagemaker/mock/__init__.py b/tests/sagemaker/mock/__init__.py
index c04888246a8cf..e15c6e6e2ede4 100644
--- a/tests/sagemaker/mock/__init__.py
+++ b/tests/sagemaker/mock/__init__.py
@@ -5,7 +5,6 @@
 from moto.core import BaseBackend, BaseModel
 from moto.core.responses import BaseResponse
-from moto.ec2 import ec2_backends
 from moto.iam.models import ACCOUNT_ID
 from moto.core.models import base_decorator


@@ -1076,10 +1075,7 @@ def response_object(self):
         return response


-# Create a SageMaker backend for each EC2 region
-sagemaker_backends = {}
-for region, ec2_backend in ec2_backends.items():
-    new_backend = SageMakerBackend()
-    sagemaker_backends[region] = new_backend
+# Create a SageMaker backend for EC2 region: "us-west-2"
+sagemaker_backends = {"us-west-2": SageMakerBackend()}

 mock_sagemaker = base_decorator(sagemaker_backends)

From 549254cbeb0a1e27b1ccf9f9f057e1f0f766aad8 Mon Sep 17 00:00:00 2001
From: Matthieu Maitre
Date: Mon, 3 Jan 2022 16:36:02 -0800
Subject: [PATCH 4/4] Address PR feedback

Signed-off-by: Matthieu Maitre
---
 mlflow/models/model.py     | 11 +++---
 mlflow/models/utils.py     |  4 +--
 tests/models/test_model.py | 74 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 805ae1e605dc1..1e75c95dd19f9 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -71,17 +71,18 @@ def get_output_schema(self):
     def load_input_example(self, path: str):
         """
         Load the input example saved along a model. Returns None if there is no example metadata
-        (i.e. the model was saved without example). Raises IO Exception if there is model metadata
-        but the example file is missing.
+        (i.e. the model was saved without example). Raises FileNotFoundError if there is model
+        metadata but the example file is missing.

         :param path: Path to the model directory.

         :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix,
                  pandas DataFrame, dict) or None if the model has no example.
         """
-        from mlflow.models.utils import (
-            _read_example,
-        )  # Just-in-time import to avoid loading NumPy/pandas/DataFrame when not needed
+
+        # Just-in-time import to only load example-parsing libraries (e.g. numpy, pandas, etc.) if
+        # example is requested.
+        from mlflow.models.utils import _read_example

         return _read_example(self, path)

diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py
index 137a1eac0f0ab..206d0de6a128b 100644
--- a/mlflow/models/utils.py
+++ b/mlflow/models/utils.py
@@ -192,8 +192,8 @@ def _save_example(mlflow_model: Model, input_example: ModelInputExample, path: s
 def _read_example(mlflow_model: Model, path: str):
     """
     Read example from a model directory. Returns None if there is no example metadata (i.e. the
-    model was saved without example). Raises IO Exception if there is model metadata but the example
-    file is missing.
+    model was saved without example). Raises FileNotFoundError if there is model metadata but the
+    example file is missing.

     :param mlflow_model: Model metadata.
     :param path: Path to the model directory.
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 79cedf246cb06..67be4d7fb21c9 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -1,4 +1,5 @@
 import os
+import pytest
 from datetime import date

 import mlflow
@@ -9,11 +10,12 @@
 from mlflow.models import Model
 from mlflow.models.signature import ModelSignature
 from mlflow.models.utils import _save_example
-from mlflow.types.schema import Schema, ColSpec
+from mlflow.types.schema import Schema, ColSpec, TensorSpec
 from mlflow.utils.file_utils import TempDir
 from mlflow.utils.proto_json_utils import _dataframe_from_json

 from unittest import mock
+from scipy.sparse import csc_matrix


 def test_model_save_load():
@@ -104,6 +107,7 @@ def test_model_log():
         assert not hasattr(loaded_model, "databricks_runtime")

         loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
         assert loaded_example.to_dict(orient="records")[0] == input_example


@@ -169,9 +172,78 @@ def test_model_log_with_input_example_succeeds():
         input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
         assert x.equals(input_example)

         loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
         assert loaded_example.equals(input_example)


+def test_model_load_input_example_numpy():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, np.ndarray)
+        assert np.array_equal(input_example, loaded_example)
+
+
+def test_model_load_input_example_scipy():
+    with TempDir(chdr=True) as tmp:
+        input_example = csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8))
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.data.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, csc_matrix)
+        assert np.array_equal(input_example.data, loaded_example.data)
+
+
+def test_model_load_input_example_failures():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is not None
+
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(os.path.join(local_path, "folder_which_does_not_exist"))
+
+        path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"])
+        os.remove(path)
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(local_path)
+
+
+def test_model_load_input_example_no_signature():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example=None)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is None
+
+
 def _is_valid_uuid(val):
     import uuid
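
For reviewers trying the change out, a minimal usage sketch of the `load_input_example` API in its final (PATCH 4/4) form. The sketch assumes a model was already logged with an input example and downloaded to a local directory; the directory path and variable names are illustrative, while `Model.load` and `load_input_example` are exactly the calls exercised in the tests above:

    import os
    from mlflow.models import Model

    local_path = "/tmp/downloaded-model"  # hypothetical local model directory
    model_meta = Model.load(os.path.join(local_path, "MLmodel"))

    try:
        example = model_meta.load_input_example(local_path)
    except FileNotFoundError:
        # The MLmodel metadata records an example, but the example file is missing.
        raise

    if example is None:
        # The model was saved without an input example.
        print("no input example")
    else:
        # One of: pandas DataFrame, NumPy ndarray, SciPy csc/csr matrix, or dict.
        print(type(example))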