From 1fa5433313ba2767fd56a4f80e851406854e8d91 Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Tue, 9 Nov 2021 18:16:57 -0800 Subject: [PATCH] Autologging functionality for scikit-learn integration with XGBoost (Part 1) (#4954) * xgb_sklearn_save_load Signed-off-by: Junwen Yao * address review + fix lint Signed-off-by: Junwen Yao * address review + update doc Signed-off-by: Junwen Yao * address review Signed-off-by: Junwen Yao * address review, keep cmt Signed-off-by: Junwen Yao * address review Signed-off-by: Junwen Yao * address review: add back data in pyfunc Signed-off-by: Junwen Yao * update doc Signed-off-by: Junwen Yao * update cmt Signed-off-by: Junwen Yao * resolve conflict in master Signed-off-by: Junwen Yao * address review: update cmt Signed-off-by: Junwen Yao --- mlflow/xgboost.py | 46 +++++++++++--- tests/xgboost/test_xgboost_model_export.py | 74 ++++++++++++++++++++++ 2 files changed, 110 insertions(+), 10 deletions(-) diff --git a/mlflow/xgboost.py b/mlflow/xgboost.py index 583964665ee6d..a6d67a45f9347 100644 --- a/mlflow/xgboost.py +++ b/mlflow/xgboost.py @@ -32,6 +32,7 @@ from mlflow.models.signature import ModelSignature from mlflow.models.utils import _save_example from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name from mlflow.utils.environment import ( _mlflow_conda_env, _validate_env_arguments, @@ -152,14 +153,19 @@ def save_model( # Save an XGBoost model xgb_model.save_model(model_data_path) - + xgb_model_class = _get_fully_qualified_class_name(xgb_model) pyfunc.add_to_model( mlflow_model, loader_module="mlflow.xgboost", data=model_data_subpath, env=_CONDA_ENV_FILE_NAME, ) - mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, data=model_data_subpath) + mlflow_model.add_flavor( + FLAVOR_NAME, + xgb_version=xgb.__version__, + data=model_data_subpath, + model_class=xgb_model_class, + ) mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) if conda_env 
is None: @@ -255,10 +261,28 @@ def log_model( def _load_model(path): - import xgboost as xgb + """ + Load Model Implementation. - model = xgb.Booster() - model.load_model(os.path.abspath(path)) + :param path: Local filesystem path to + the MLflow Model with the ``xgboost`` flavor (MLflow < 1.22.0) or + the top-level MLflow Model directory (MLflow >= 1.22.0). + """ + import importlib + + model_dir = os.path.dirname(path) if os.path.isfile(path) else path + flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME) + + # XGBoost models saved in MLflow >=1.22.0 have `model_class` + # in the XGBoost flavor configuration to specify its XGBoost model class. + # When loading models, we first get the XGBoost model class from + # its flavor configuration and then create an instance of that class. + model_class = flavor_conf.get("model_class", "xgboost.core.Booster") + xgb_model_path = os.path.join(model_dir, flavor_conf.get("data")) + + module, cls = model_class.rsplit(".", maxsplit=1) + model = getattr(importlib.import_module(module), cls)() + model.load_model(xgb_model_path) return model @@ -289,12 +313,11 @@ def load_model(model_uri, dst_path=None): This directory must already exist. If unspecified, a local output path will be created. - :return: An XGBoost model (an instance of `xgboost.Booster`_) + :return: An XGBoost model. An instance of either `xgboost.Booster`_ or XGBoost scikit-learn + models, depending on the saved model class specification. 
""" local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path) - flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME) - xgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.xgb")) - return _load_model(path=xgb_model_file_path) + return _load_model(path=local_model_path) class _XGBModelWrapper: @@ -304,7 +327,10 @@ def __init__(self, xgb_model): def predict(self, dataframe): import xgboost as xgb - return self.xgb_model.predict(xgb.DMatrix(dataframe)) + if isinstance(self.xgb_model, xgb.Booster): + return self.xgb_model.predict(xgb.DMatrix(dataframe)) + else: + return self.xgb_model.predict(dataframe) @experimental diff --git a/tests/xgboost/test_xgboost_model_export.py b/tests/xgboost/test_xgboost_model_export.py index 8c8996e1ad3dc..6ce08e3cbd08b 100644 --- a/tests/xgboost/test_xgboost_model_export.py +++ b/tests/xgboost/test_xgboost_model_export.py @@ -49,6 +49,16 @@ def xgb_model(): return ModelWithData(model=model, inference_dataframe=X, inference_dmatrix=dtrain) +@pytest.fixture(scope="session") +def xgb_sklearn_model(): + wine = datasets.load_wine() + X = pd.DataFrame(wine.data, columns=wine.feature_names) + y = pd.Series(wine.target) + regressor = xgb.XGBRegressor(n_estimators=10) + regressor.fit(X, y) + return ModelWithData(model=regressor, inference_dataframe=X, inference_dmatrix=None) + + @pytest.fixture def model_path(tmpdir): return os.path.join(str(tmpdir), "model") @@ -80,6 +90,24 @@ def test_model_save_load(xgb_model, model_path): ) +@pytest.mark.large +def test_sklearn_model_save_load(xgb_sklearn_model, model_path): + model = xgb_sklearn_model.model + mlflow.xgboost.save_model(xgb_model=model, path=model_path) + reloaded_model = mlflow.xgboost.load_model(model_uri=model_path) + reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path) + + np.testing.assert_array_almost_equal( + model.predict(xgb_sklearn_model.inference_dataframe), + 
reloaded_model.predict(xgb_sklearn_model.inference_dataframe), + ) + + np.testing.assert_array_almost_equal( + reloaded_model.predict(xgb_sklearn_model.inference_dataframe), + reloaded_pyfunc.predict(xgb_sklearn_model.inference_dataframe), + ) + + @pytest.mark.large def test_signature_and_examples_are_saved_correctly(xgb_model): model = xgb_model.model @@ -452,3 +480,49 @@ def test_pyfunc_serve_and_score_sklearn(model): ) scores = pd.read_json(resp.content, orient="records").values.squeeze() np.testing.assert_array_equal(scores, model.predict(X.head(3))) + + +@pytest.mark.large +def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(xgb_model, model_path): + """ + This test verifies that xgboost models saved in older versions of MLflow are loaded + successfully by ``mlflow.pyfunc.load_model``. These older models specify a pyfunc ``data`` + field referring directly to an XGBoost model file. Newer models also have + ``model_class`` in the XGBoost flavor. + """ + model = xgb_model.model + mlflow.xgboost.save_model(xgb_model=model, path=model_path) + + model_conf_path = os.path.join(model_path, "MLmodel") + model_conf = Model.load(model_conf_path) + pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME) + xgboost_conf = model_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME) + assert xgboost_conf is not None + assert "model_class" in xgboost_conf + assert "data" in xgboost_conf + assert pyfunc_conf is not None + assert "model_class" not in pyfunc_conf + assert pyfunc.DATA in pyfunc_conf + + # test old MLmodel conf + model_conf.flavors["xgboost"] = {"xgb_version": xgb.__version__, "data": "model.xgb"} + model_conf.save(model_conf_path) + model_conf = Model.load(model_conf_path) + xgboost_conf = model_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME) + assert "data" in xgboost_conf + assert xgboost_conf["data"] == "model.xgb" + + reloaded_pyfunc = pyfunc.load_model(model_uri=model_path) + assert isinstance(reloaded_pyfunc._model_impl.xgb_model, xgb.Booster) + 
reloaded_xgb = mlflow.xgboost.load_model(model_uri=model_path) + assert isinstance(reloaded_xgb, xgb.Booster) + + np.testing.assert_array_almost_equal( + xgb_model.model.predict(xgb_model.inference_dmatrix), + reloaded_pyfunc.predict(xgb_model.inference_dataframe), + ) + + np.testing.assert_array_almost_equal( + reloaded_xgb.predict(xgb_model.inference_dmatrix), + reloaded_pyfunc.predict(xgb_model.inference_dataframe), + )