From 04e6c821d0655200b09a35c75cce4fde119abb58 Mon Sep 17 00:00:00 2001
From: Matthieu Maitre <mmaitre@microsoft.com>
Date: Tue, 4 Jan 2022 10:09:25 -0800
Subject: [PATCH 1/3] Add a Python API to load model input examples (#5212)

* Add method to load model input example

Signed-off-by: Matthieu Maitre <mmaitre@microsoft.com>

* Try fixing doc and lint failures

Signed-off-by: Matthieu Maitre <mmaitre@microsoft.com>

* Remove param 'evals_result' 'early_stopping_rounds' in lightgbm version > 3.3.1 (#5206)

The parameter evals_result is removed in the master of lightgbm: microsoft/LightGBM#4882.
The parameter early_stopping_rounds is removed in the master of lightgbm: microsoft/LightGBM#4908.

We should also remove these params from our tests.

This PR also fixed the sagemaker test failure.

Signed-off-by: Liang Zhang <liang.zhang@databricks.com>

* Address PR feedback

Signed-off-by: Matthieu Maitre <mmaitre@microsoft.com>

Co-authored-by: Liang Zhang <liang.zhang@databricks.com>
---
 mlflow/models/model.py     | 18 +++++++++
 mlflow/models/utils.py     |  4 +-
 tests/models/test_model.py | 80 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/mlflow/models/model.py b/mlflow/models/model.py
index 806357c10ab4a..1e75c95dd19f9 100644
--- a/mlflow/models/model.py
+++ b/mlflow/models/model.py
@@ -68,6 +68,24 @@ def get_input_schema(self):
     def get_output_schema(self):
         return self.signature.outputs if self.signature is not None else None
 
+    def load_input_example(self, path: str):
+        """
+        Load the input example saved along with a model. Returns None if there is no example
+        metadata (i.e. the model was saved without example). Raises FileNotFoundError if there is
+        model metadata but the example file is missing.
+
+        :param path: Path to the model directory.
+
+        :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix,
+                 pandas DataFrame, dict) or None if the model has no example.
+        """
+
+        # Just-in-time import to only load example-parsing libraries (e.g. numpy, pandas, etc.) if
+        # example is requested.
+        from mlflow.models.utils import _read_example
+
+        return _read_example(self, path)
+
     def add_flavor(self, name, **params):
         """Add an entry for how to serve the model in a given format."""
         self.flavors[name] = params
diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py
index 137a1eac0f0ab..206d0de6a128b 100644
--- a/mlflow/models/utils.py
+++ b/mlflow/models/utils.py
@@ -192,8 +192,8 @@ def _save_example(mlflow_model: Model, input_example: ModelInputExample, path: s
 def _read_example(mlflow_model: Model, path: str):
     """
     Read example from a model directory. Returns None if there is no example metadata (i.e. the
-    model was saved without example). Raises IO Exception if there is model metadata but the example
-    file is missing.
+    model was saved without example). Raises FileNotFoundError if there is model metadata but the
+    example file is missing.
 
     :param mlflow_model: Model metadata.
     :param path: Path to the model directory.
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 131a70004b54a..67be4d7fb21c9 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -1,4 +1,5 @@
 import os
+import pytest
 from datetime import date
 
 import mlflow
@@ -9,11 +10,12 @@
 from mlflow.models import Model
 from mlflow.models.signature import ModelSignature
 from mlflow.models.utils import _save_example
-from mlflow.types.schema import Schema, ColSpec
+from mlflow.types.schema import Schema, ColSpec, TensorSpec
 from mlflow.utils.file_utils import TempDir
 from mlflow.utils.proto_json_utils import _dataframe_from_json
 
 from unittest import mock
+from scipy.sparse import csc_matrix
 
 
 def test_model_save_load():
@@ -104,6 +106,10 @@ def test_model_log():
         assert x.to_dict(orient="records")[0] == input_example
         assert not hasattr(loaded_model, "databricks_runtime")
 
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
+        assert loaded_example.to_dict(orient="records")[0] == input_example
+
 
 def test_model_log_with_databricks_runtime():
     dbr = "8.3.x-snapshot-gpu-ml-scala2.12"
@@ -165,6 +171,78 @@ def test_model_log_with_input_example_succeeds():
         input_example["d"] = input_example["d"].apply(lambda x: x.isoformat())
         assert x.equals(input_example)
 
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert isinstance(loaded_example, pd.DataFrame)
+        assert loaded_example.equals(input_example)
+
+
+def test_model_load_input_example_numpy():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, np.ndarray)
+        assert np.array_equal(input_example, loaded_example)
+
+
+def test_model_load_input_example_scipy():
+    with TempDir(chdr=True) as tmp:
+        input_example = csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8))
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.data.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+
+        assert isinstance(loaded_example, csc_matrix)
+        assert np.array_equal(input_example.data, loaded_example.data)
+
+
+def test_model_load_input_example_failures():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is not None
+
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(os.path.join(local_path, "folder_which_does_not_exist"))
+
+        path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"])
+        os.remove(path)
+        with pytest.raises(FileNotFoundError, match="No such file or directory"):
+            loaded_model.load_input_example(local_path)
+
+
+def test_model_load_input_example_no_signature():
+    with TempDir(chdr=True) as tmp:
+        input_example = np.array([[3, 4, 5]], dtype=np.int32)
+        sig = ModelSignature(
+            inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]),
+            outputs=Schema([ColSpec(name=None, type="double")]),
+        )
+
+        local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example=None)
+        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
+        loaded_example = loaded_model.load_input_example(local_path)
+        assert loaded_example is None
+
 
 def _is_valid_uuid(val):
     import uuid

From d6abb30a96a887c797572b5d969fb2429ae07258 Mon Sep 17 00:00:00 2001
From: Wendy Hu <wendy.hu@databricks.com>
Date: Tue, 4 Jan 2022 14:18:04 -0800
Subject: [PATCH 2/3] Propagate API response error to client for list_artifacts
 (#5189)

* propagate API response error to client for list_artifacts

Signed-off-by: Wendy Hu <wendy.hu@databricks.com>

* Add error message to test

Signed-off-by: Wendy Hu <wendy.hu@databricks.com>

* lint

Signed-off-by: Wendy Hu <wendy.hu@databricks.com>
---
 .../databricks_models_artifact_repo.py        |  3 +-
 .../test_databricks_models_artifact_repo.py   | 29 ++++++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/mlflow/store/artifact/databricks_models_artifact_repo.py b/mlflow/store/artifact/databricks_models_artifact_repo.py
index 4f04e0f7f86cf..6670d8bd02ee9 100644
--- a/mlflow/store/artifact/databricks_models_artifact_repo.py
+++ b/mlflow/store/artifact/databricks_models_artifact_repo.py
@@ -71,8 +71,9 @@ def list_artifacts(self, path=None):
             json_body = self._make_json_body(path, page_token)
             response = self._call_endpoint(json_body, REGISTRY_LIST_ARTIFACTS_ENDPOINT)
             try:
+                response.raise_for_status()
                 json_response = json.loads(response.text)
-            except ValueError:
+            except Exception:
                 raise MlflowException(
                     "API request to list files under path `%s` failed with status code %s. "
                     "Response body: %s" % (path, response.status_code, response.text)
diff --git a/tests/store/artifact/test_databricks_models_artifact_repo.py b/tests/store/artifact/test_databricks_models_artifact_repo.py
index bcce2d170044b..4049e7bbe4838 100644
--- a/tests/store/artifact/test_databricks_models_artifact_repo.py
+++ b/tests/store/artifact/test_databricks_models_artifact_repo.py
@@ -142,8 +142,16 @@ def test_init_with_valid_uri_but_no_profile(self, valid_profileless_artifact_uri
                 DatabricksModelsArtifactRepository(valid_profileless_artifact_uri)
 
     def test_list_artifacts(self, databricks_model_artifact_repo):
+        status_code = 200
+
+        def _raise_for_status():
+            if status_code == 404:
+                raise Exception(
+                    "404 Client Error: Not Found for url: https://shard-uri/api/2.0/mlflow/model-versions/list-artifacts?name=model&version=1"
+                )
+
         list_artifact_dir_response_mock = mock.MagicMock()
-        list_artifact_dir_response_mock.status_code = 200
+        list_artifact_dir_response_mock.status_code = status_code
         list_artifact_dir_json_mock = {
             "files": [
                 {"path": "MLmodel", "is_dir": False, "file_size": 294},
@@ -151,6 +159,7 @@ def test_list_artifacts(self, databricks_model_artifact_repo):
             ]
         }
         list_artifact_dir_response_mock.text = json.dumps(list_artifact_dir_json_mock)
+        list_artifact_dir_response_mock.raise_for_status.side_effect = _raise_for_status
         with mock.patch(
             DATABRICKS_MODEL_ARTIFACT_REPOSITORY + "._call_endpoint"
         ) as call_endpoint_mock:
@@ -166,6 +175,24 @@ def test_list_artifacts(self, databricks_model_artifact_repo):
             assert artifacts[1].file_size is None
             call_endpoint_mock.assert_called_once_with(ANY, REGISTRY_LIST_ARTIFACTS_ENDPOINT)
 
+        # errors from API are propagated through to cli response
+        list_artifact_dir_bad_response_mock = mock.MagicMock()
+        status_code = 404
+        list_artifact_dir_bad_response_mock.status_code = status_code
+        list_artifact_dir_bad_response_mock.text = "An error occurred"
+        list_artifact_dir_bad_response_mock.raise_for_status.side_effect = _raise_for_status
+        with mock.patch(
+            DATABRICKS_MODEL_ARTIFACT_REPOSITORY + "._call_endpoint"
+        ) as call_endpoint_mock:
+            call_endpoint_mock.return_value = list_artifact_dir_bad_response_mock
+            with pytest.raises(
+                MlflowException,
+                match=r"API request to list files under path `` failed with status code 404. "
+                "Response body: An error occurred",
+            ):
+                databricks_model_artifact_repo.list_artifacts("")
+            call_endpoint_mock.assert_called_once_with(ANY, REGISTRY_LIST_ARTIFACTS_ENDPOINT)
+
     def test_list_artifacts_for_single_file(self, databricks_model_artifact_repo):
         list_artifact_file_response_mock = mock.MagicMock()
         list_artifact_file_response_mock.status_code = 200

From 2d8cb0c20c12e9ae5627004a4d11295af59c08ba Mon Sep 17 00:00:00 2001
From: Harutaka Kawamura <hkawamura0130@gmail.com>
Date: Thu, 6 Jan 2022 00:11:43 +0900
Subject: [PATCH 3/3] Remove mleap commands in `.create-test-env.R` (#5217)

* Remove mleap commands in .create-test-env.R

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* remove restore keys

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* Remove mlflow/R/mlflow/.spark-version

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* remove spark cache

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* remove if

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
---
 .github/workflows/master.yml       |  8 --------
 mlflow/R/mlflow/.create-test-env.R | 11 -----------
 mlflow/R/mlflow/.spark-version     |  1 -
 3 files changed, 20 deletions(-)
 delete mode 100644 mlflow/R/mlflow/.spark-version

diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index b62dcbff32229..63493c10991c8 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -84,20 +84,12 @@ jobs:
         os_name=$(lsb_release -ds | sed 's/\s/-/g')
         echo "::set-output name=os-name::$os_name"
     - name: Cache R packages
-      if: runner.os != 'Windows'
       uses: actions/cache@v2
       with:
         path: ${{ env.R_LIBS_USER }}
         # We cache R dependencies based on a tuple of the current OS, the R version, and the list of
         # R dependencies
         key: ${{ steps.os-name.outputs.os-name }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
-        restore-keys: ${{ steps.os-name.outputs.os-name }}-${{ hashFiles('.github/R-version') }}-1-
-    # Cache spark archive downloaded in `mlflow/R/mlflow/.create-test-env.R`
-    - name: Cache spark archive
-      uses: actions/cache@v2
-      with:
-        path: ~/spark/spark-2.4.5-bin-hadoop2.7
-        key: ${{ hashFiles('mlflow/R/mlflow/.spark-version') }}
     - name: Install system dependencies
       run: |
         sudo apt-get install -y libcurl4-openssl-dev
diff --git a/mlflow/R/mlflow/.create-test-env.R b/mlflow/R/mlflow/.create-test-env.R
index c42babf63d0e2..240a39a5e0472 100644
--- a/mlflow/R/mlflow/.create-test-env.R
+++ b/mlflow/R/mlflow/.create-test-env.R
@@ -13,14 +13,3 @@ reticulate::conda_install("'h5py<3.0.0'", envname = mlflow:::mlflow_conda_env_na
 reticulate::conda_install(Sys.getenv("MLFLOW_HOME", "../../../../."), envname = mlflow:::mlflow_conda_env_name(), pip = TRUE)
 reticulate::conda_install("xgboost", envname = mlflow:::mlflow_conda_env_name())
 reticulate::conda_install(paste0("h2o==", packageVersion("h2o")), envname = mlflow:::mlflow_conda_env_name(), pip = TRUE)
-
-# The default timeout value (60 seconds) can be insufficient for `spark_install` to complete
-options(timeout=60 * 60)
-
-spark_version_file <- "../.spark-version"
-spark_version <- trimws(readChar(spark_version_file, file.info(spark_version_file)$size))
-
-# Install MLeap runtime and required dependencies
-sparklyr::spark_install(version = spark_version, verbose = TRUE)
-mleap::install_maven()
-mleap::install_mleap(version = "0.16.0")
diff --git a/mlflow/R/mlflow/.spark-version b/mlflow/R/mlflow/.spark-version
deleted file mode 100644
index 59aa62c1fa4c2..0000000000000
--- a/mlflow/R/mlflow/.spark-version
+++ /dev/null
@@ -1 +0,0 @@
-2.4.5