From 04e6c821d0655200b09a35c75cce4fde119abb58 Mon Sep 17 00:00:00 2001 From: Matthieu Maitre Date: Tue, 4 Jan 2022 10:09:25 -0800 Subject: [PATCH 1/3] Add a Python API to load model input examples (#5212) * Add method to load model input example Signed-off-by: Matthieu Maitre * Try fixing doc and lint failures Signed-off-by: Matthieu Maitre * Remove param 'evals_result' 'early_stopping_rounds' in lightgbm version > 3.3.1 (#5206) The parameter evals_result is removed in the master of lightgbm: microsoft/LightGBM#4882. The parameter early_stopping_rounds is removed in the master of lightgbm: microsoft/LightGBM#4908. We should also remove this param in our test. This PR also fixed the sagemaker test failure. Signed-off-by: Liang Zhang * Address PR feedback Signed-off-by: Matthieu Maitre Co-authored-by: Liang Zhang --- mlflow/models/model.py | 18 +++++++++ mlflow/models/utils.py | 4 +- tests/models/test_model.py | 80 +++++++++++++++++++++++++++++++++++++- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/mlflow/models/model.py b/mlflow/models/model.py index 806357c10ab4a..1e75c95dd19f9 100644 --- a/mlflow/models/model.py +++ b/mlflow/models/model.py @@ -68,6 +68,24 @@ def get_input_schema(self): def get_output_schema(self): return self.signature.outputs if self.signature is not None else None + def load_input_example(self, path: str): + """ + Load the input example saved along a model. Returns None if there is no example metadata + (i.e. the model was saved without example). Raises FileNotFoundError if there is model + metadata but the example file is missing. + + :param path: Path to the model directory. + + :return: Input example (NumPy ndarray, SciPy csc_matrix, SciPy csr_matrix, + pandas DataFrame, dict) or None if the model has no example. + """ + + # Just-in-time import to only load example-parsing libraries (e.g. numpy, pandas, etc.) if + # example is requested. + from mlflow.models.utils import _read_example + + return _read_example(self, path) + def add_flavor(self, name, **params): """Add an entry for how to serve the model in a given format.""" self.flavors[name] = params diff --git a/mlflow/models/utils.py b/mlflow/models/utils.py index 137a1eac0f0ab..206d0de6a128b 100644 --- a/mlflow/models/utils.py +++ b/mlflow/models/utils.py @@ -192,8 +192,8 @@ def _save_example(mlflow_model: Model, input_example: ModelInputExample, path: s def _read_example(mlflow_model: Model, path: str): """ Read example from a model directory. Returns None if there is no example metadata (i.e. the - model was saved without example). Raises IO Exception if there is model metadata but the example - file is missing. + model was saved without example). Raises FileNotFoundError if there is model metadata but the + example file is missing. :param mlflow_model: Model metadata. :param path: Path to the model directory. diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 131a70004b54a..67be4d7fb21c9 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -1,4 +1,5 @@ import os +import pytest from datetime import date import mlflow @@ -9,11 +10,12 @@ from mlflow.models import Model from mlflow.models.signature import ModelSignature from mlflow.models.utils import _save_example -from mlflow.types.schema import Schema, ColSpec +from mlflow.types.schema import Schema, ColSpec, TensorSpec from mlflow.utils.file_utils import TempDir from mlflow.utils.proto_json_utils import _dataframe_from_json from unittest import mock +from scipy.sparse import csc_matrix def test_model_save_load(): @@ -104,6 +106,10 @@ def test_model_log(): assert x.to_dict(orient="records")[0] == input_example assert not hasattr(loaded_model, "databricks_runtime") + loaded_example = loaded_model.load_input_example(local_path) + assert isinstance(loaded_example, pd.DataFrame) + assert loaded_example.to_dict(orient="records")[0] == input_example + def test_model_log_with_databricks_runtime(): dbr = "8.3.x-snapshot-gpu-ml-scala2.12" @@ -165,6 +171,78 @@ def test_model_log_with_input_example_succeeds(): input_example["d"] = input_example["d"].apply(lambda x: x.isoformat()) assert x.equals(input_example) + loaded_example = loaded_model.load_input_example(local_path) + assert isinstance(loaded_example, pd.DataFrame) + assert loaded_example.equals(input_example) + + +def test_model_load_input_example_numpy(): + with TempDir(chdr=True) as tmp: + input_example = np.array([[3, 4, 5]], dtype=np.int32) + sig = ModelSignature( + inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]), + outputs=Schema([ColSpec(name=None, type="double")]), + ) + + local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example) + loaded_model = Model.load(os.path.join(local_path, "MLmodel")) + loaded_example = loaded_model.load_input_example(local_path) + + assert isinstance(loaded_example, np.ndarray) + assert np.array_equal(input_example, loaded_example) + + +def test_model_load_input_example_scipy(): + with TempDir(chdr=True) as tmp: + input_example = csc_matrix(np.arange(0, 12, 0.5).reshape(3, 8)) + sig = ModelSignature( + inputs=Schema([TensorSpec(type=input_example.data.dtype, shape=input_example.shape)]), + outputs=Schema([ColSpec(name=None, type="double")]), + ) + + local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example) + loaded_model = Model.load(os.path.join(local_path, "MLmodel")) + loaded_example = loaded_model.load_input_example(local_path) + + assert isinstance(loaded_example, csc_matrix) + assert np.array_equal(input_example.data, loaded_example.data) + + +def test_model_load_input_example_failures(): + with TempDir(chdr=True) as tmp: + input_example = np.array([[3, 4, 5]], dtype=np.int32) + sig = ModelSignature( + inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]), + outputs=Schema([ColSpec(name=None, type="double")]), + ) + + local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example) + loaded_model = Model.load(os.path.join(local_path, "MLmodel")) + loaded_example = loaded_model.load_input_example(local_path) + assert loaded_example is not None + + with pytest.raises(FileNotFoundError, match="No such file or directory"): + loaded_model.load_input_example(os.path.join(local_path, "folder_which_does_not_exist")) + + path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"]) + os.remove(path) + with pytest.raises(FileNotFoundError, match="No such file or directory"): + loaded_model.load_input_example(local_path) + + +def test_model_load_input_example_no_signature(): + with TempDir(chdr=True) as tmp: + input_example = np.array([[3, 4, 5]], dtype=np.int32) + sig = ModelSignature( + inputs=Schema([TensorSpec(type=input_example.dtype, shape=input_example.shape)]), + outputs=Schema([ColSpec(name=None, type="double")]), + ) + + local_path, _ = _log_model_with_signature_and_example(tmp, sig, input_example=None) + loaded_model = Model.load(os.path.join(local_path, "MLmodel")) + loaded_example = loaded_model.load_input_example(local_path) + assert loaded_example is None + def _is_valid_uuid(val): import uuid From d6abb30a96a887c797572b5d969fb2429ae07258 Mon Sep 17 00:00:00 2001 From: Wendy Hu Date: Tue, 4 Jan 2022 14:18:04 -0800 Subject: [PATCH 2/3] Propagate API response error to client for list_artifacts (#5189) * propagate API response error to client for list_artifacts Signed-off-by: Wendy Hu * Add error message to test Signed-off-by: Wendy Hu * lint Signed-off-by: Wendy Hu --- .../databricks_models_artifact_repo.py | 3 +- .../test_databricks_models_artifact_repo.py | 29 ++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/mlflow/store/artifact/databricks_models_artifact_repo.py b/mlflow/store/artifact/databricks_models_artifact_repo.py index 4f04e0f7f86cf..6670d8bd02ee9 100644 --- a/mlflow/store/artifact/databricks_models_artifact_repo.py +++ b/mlflow/store/artifact/databricks_models_artifact_repo.py @@ -71,8 +71,9 @@ def list_artifacts(self, path=None): json_body = self._make_json_body(path, page_token) response = self._call_endpoint(json_body, REGISTRY_LIST_ARTIFACTS_ENDPOINT) try: + response.raise_for_status() json_response = json.loads(response.text) - except ValueError: + except Exception: raise MlflowException( "API request to list files under path `%s` failed with status code %s. " "Response body: %s" % (path, response.status_code, response.text) diff --git a/tests/store/artifact/test_databricks_models_artifact_repo.py b/tests/store/artifact/test_databricks_models_artifact_repo.py index bcce2d170044b..4049e7bbe4838 100644 --- a/tests/store/artifact/test_databricks_models_artifact_repo.py +++ b/tests/store/artifact/test_databricks_models_artifact_repo.py @@ -142,8 +142,16 @@ def test_init_with_valid_uri_but_no_profile(self, valid_profileless_artifact_uri DatabricksModelsArtifactRepository(valid_profileless_artifact_uri) def test_list_artifacts(self, databricks_model_artifact_repo): + status_code = 200 + + def _raise_for_status(): + if status_code == 404: + raise Exception( + "404 Client Error: Not Found for url: https://shard-uri/api/2.0/mlflow/model-versions/list-artifacts?name=model&version=1" + ) + list_artifact_dir_response_mock = mock.MagicMock() - list_artifact_dir_response_mock.status_code = 200 + list_artifact_dir_response_mock.status_code = status_code list_artifact_dir_json_mock = { "files": [ {"path": "MLmodel", "is_dir": False, "file_size": 294}, @@ -151,6 +159,7 @@ def test_list_artifacts(self, databricks_model_artifact_repo): ] } list_artifact_dir_response_mock.text = json.dumps(list_artifact_dir_json_mock) + list_artifact_dir_response_mock.raise_for_status.side_effect = _raise_for_status with mock.patch( DATABRICKS_MODEL_ARTIFACT_REPOSITORY + "._call_endpoint" ) as call_endpoint_mock: @@ -166,6 +175,24 @@ def test_list_artifacts(self, databricks_model_artifact_repo): assert artifacts[1].file_size is None call_endpoint_mock.assert_called_once_with(ANY, REGISTRY_LIST_ARTIFACTS_ENDPOINT) + # errors from API are propagated through to cli response + list_artifact_dir_bad_response_mock = mock.MagicMock() + status_code = 404 + list_artifact_dir_bad_response_mock.status_code = status_code + list_artifact_dir_bad_response_mock.text = "An error occurred" + list_artifact_dir_bad_response_mock.raise_for_status.side_effect = _raise_for_status + with mock.patch( + DATABRICKS_MODEL_ARTIFACT_REPOSITORY + "._call_endpoint" + ) as call_endpoint_mock: + call_endpoint_mock.return_value = list_artifact_dir_bad_response_mock + with pytest.raises( + MlflowException, + match=r"API request to list files under path `` failed with status code 404. " + "Response body: An error occurred", + ): + databricks_model_artifact_repo.list_artifacts("") + call_endpoint_mock.assert_called_once_with(ANY, REGISTRY_LIST_ARTIFACTS_ENDPOINT) + def test_list_artifacts_for_single_file(self, databricks_model_artifact_repo): list_artifact_file_response_mock = mock.MagicMock() list_artifact_file_response_mock.status_code = 200 From 2d8cb0c20c12e9ae5627004a4d11295af59c08ba Mon Sep 17 00:00:00 2001 From: Harutaka Kawamura Date: Thu, 6 Jan 2022 00:11:43 +0900 Subject: [PATCH 3/3] Remove mleap commands in `.create-test-env.R` (#5217) * Remove mleap commands in .create-test-env.R Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> * remove restore keys Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> * Remove mlflow/R/mlflow/.spark-version Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> * remove spark cache Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> * remove if Signed-off-by: harupy <17039389+harupy@users.noreply.github.com> --- .github/workflows/master.yml | 8 -------- mlflow/R/mlflow/.create-test-env.R | 11 ----------- mlflow/R/mlflow/.spark-version | 1 - 3 files changed, 20 deletions(-) delete mode 100644 mlflow/R/mlflow/.spark-version diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index b62dcbff32229..63493c10991c8 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -84,20 +84,12 @@ jobs: os_name=$(lsb_release -ds | sed 's/\s/-/g') echo "::set-output name=os-name::$os_name" - name: Cache R packages - if: runner.os != 'Windows' uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} # We cache R dependencies based on a tuple of the current OS, the R version, and the list of # R dependencies key: ${{ steps.os-name.outputs.os-name }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ steps.os-name.outputs.os-name }}-${{ hashFiles('.github/R-version') }}-1- - # Cache spark archive downloaded in `mlflow/R/mlflow/.create-test-env.R` - - name: Cache spark archive - uses: actions/cache@v2 - with: - path: ~/spark/spark-2.4.5-bin-hadoop2.7 - key: ${{ hashFiles('mlflow/R/mlflow/.spark-version') }} - name: Install system dependencies run: | sudo apt-get install -y libcurl4-openssl-dev diff --git a/mlflow/R/mlflow/.create-test-env.R b/mlflow/R/mlflow/.create-test-env.R index c42babf63d0e2..240a39a5e0472 100644 --- a/mlflow/R/mlflow/.create-test-env.R +++ b/mlflow/R/mlflow/.create-test-env.R @@ -13,14 +13,3 @@ reticulate::conda_install("'h5py<3.0.0'", envname = mlflow:::mlflow_conda_env_na reticulate::conda_install(Sys.getenv("MLFLOW_HOME", "../../../../."), envname = mlflow:::mlflow_conda_env_name(), pip = TRUE) reticulate::conda_install("xgboost", envname = mlflow:::mlflow_conda_env_name()) reticulate::conda_install(paste0("h2o==", packageVersion("h2o")), envname = mlflow:::mlflow_conda_env_name(), pip = TRUE) - -# The default timeout value (60 seconds) can be insufficient for `spark_install` to complete -options(timeout=60 * 60) - -spark_version_file <- "../.spark-version" -spark_version <- trimws(readChar(spark_version_file, file.info(spark_version_file)$size)) - -# Install MLeap runtime and required dependencies -sparklyr::spark_install(version = spark_version, verbose = TRUE) -mleap::install_maven() -mleap::install_mleap(version = "0.16.0") diff --git a/mlflow/R/mlflow/.spark-version b/mlflow/R/mlflow/.spark-version deleted file mode 100644 index 59aa62c1fa4c2..0000000000000 --- a/mlflow/R/mlflow/.spark-version +++ /dev/null @@ -1 +0,0 @@ -2.4.5