Autologging functionality for scikit-learn integration with XGBoost (Part 1) (#4954)

* xgb_sklearn_save_load
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review + fix lint
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review + update doc
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review, keep cmt
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review: add back data in pyfunc
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* update doc
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* update cmt
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* resolve conflict in master
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
* address review: update cmt
  Signed-off-by: Junwen Yao <jwyiao@gmail.com>
mlflow · Nov 10, 2021 · 1fa5433 · 1fa5433
1 parent 2b5b6b9
commit 1fa5433
Show file tree

Hide file tree

Showing 2 changed files with 110 additions and 10 deletions.
diff --git a/mlflow/xgboost.py b/mlflow/xgboost.py
@@ -32,6 +32,7 @@
 from mlflow.models.signature import ModelSignature
 from mlflow.models.utils import _save_example
 from mlflow.tracking.artifact_utils import _download_artifact_from_uri
+from mlflow.utils import _get_fully_qualified_class_name
 from mlflow.utils.environment import (
     _mlflow_conda_env,
     _validate_env_arguments,
@@ -152,14 +153,19 @@ def save_model(
 
     # Save an XGBoost model
     xgb_model.save_model(model_data_path)
-
+    xgb_model_class = _get_fully_qualified_class_name(xgb_model)
     pyfunc.add_to_model(
         mlflow_model,
         loader_module="mlflow.xgboost",
         data=model_data_subpath,
         env=_CONDA_ENV_FILE_NAME,
     )
-    mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, data=model_data_subpath)
+    mlflow_model.add_flavor(
+        FLAVOR_NAME,
+        xgb_version=xgb.__version__,
+        data=model_data_subpath,
+        model_class=xgb_model_class,
+    )
     mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
 
     if conda_env is None:
@@ -255,10 +261,28 @@ def log_model(
 
 
 def _load_model(path):
-    import xgboost as xgb
+    """
+    Load Model Implementation.
 
-    model = xgb.Booster()
-    model.load_model(os.path.abspath(path))
+    :param path: Local filesystem path to either
+                    the XGBoost model file inside an MLflow Model directory (MLflow < 1.22.0) or
+                    the top-level MLflow Model directory (MLflow >= 1.22.0).
+    """
+    import importlib
+
+    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
+    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)
+
+    # XGBoost models saved in MLflow >=1.22.0 record their XGBoost model class under
+    # `model_class` in the XGBoost flavor configuration; older models lack this field,
+    # so we fall back to `xgboost.core.Booster`. When loading, we read the class name
+    # from the flavor configuration and then create an instance of that class.
+    model_class = flavor_conf.get("model_class", "xgboost.core.Booster")
+    xgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))
+
+    module, cls = model_class.rsplit(".", maxsplit=1)
+    model = getattr(importlib.import_module(module), cls)()
+    model.load_model(xgb_model_path)
     return model
 
 
@@ -289,12 +313,11 @@ def load_model(model_uri, dst_path=None):
                      This directory must already exist. If unspecified, a local output
                      path will be created.
 
-    :return: An XGBoost model (an instance of `xgboost.Booster`_)
+    :return: An XGBoost model: an instance of either `xgboost.Booster`_ or an XGBoost
+             scikit-learn model class, depending on the model class recorded when it was saved.
     """
     local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
-    flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
-    xgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.xgb"))
-    return _load_model(path=xgb_model_file_path)
+    return _load_model(path=local_model_path)
 
 
 class _XGBModelWrapper:
@@ -304,7 +327,10 @@ def __init__(self, xgb_model):
     def predict(self, dataframe):
         import xgboost as xgb
 
-        return self.xgb_model.predict(xgb.DMatrix(dataframe))
+        if isinstance(self.xgb_model, xgb.Booster):
+            return self.xgb_model.predict(xgb.DMatrix(dataframe))
+        else:
+            return self.xgb_model.predict(dataframe)
 
 
 @experimental

diff --git a/tests/xgboost/test_xgboost_model_export.py b/tests/xgboost/test_xgboost_model_export.py
@@ -49,6 +49,16 @@ def xgb_model():
     return ModelWithData(model=model, inference_dataframe=X, inference_dmatrix=dtrain)
 
 
+@pytest.fixture(scope="session")
+def xgb_sklearn_model():
+    wine = datasets.load_wine()
+    X = pd.DataFrame(wine.data, columns=wine.feature_names)
+    y = pd.Series(wine.target)
+    regressor = xgb.XGBRegressor(n_estimators=10)
+    regressor.fit(X, y)
+    return ModelWithData(model=regressor, inference_dataframe=X, inference_dmatrix=None)
+
+
 @pytest.fixture
 def model_path(tmpdir):
     return os.path.join(str(tmpdir), "model")
@@ -80,6 +90,24 @@ def test_model_save_load(xgb_model, model_path):
     )
 
 
+@pytest.mark.large
+def test_sklearn_model_save_load(xgb_sklearn_model, model_path):
+    model = xgb_sklearn_model.model
+    mlflow.xgboost.save_model(xgb_model=model, path=model_path)
+    reloaded_model = mlflow.xgboost.load_model(model_uri=model_path)
+    reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
+
+    np.testing.assert_array_almost_equal(
+        model.predict(xgb_sklearn_model.inference_dataframe),
+        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
+    )
+
+    np.testing.assert_array_almost_equal(
+        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
+        reloaded_pyfunc.predict(xgb_sklearn_model.inference_dataframe),
+    )
+
+
 @pytest.mark.large
 def test_signature_and_examples_are_saved_correctly(xgb_model):
     model = xgb_model.model
@@ -452,3 +480,49 @@ def test_pyfunc_serve_and_score_sklearn(model):
     )
     scores = pd.read_json(resp.content, orient="records").values.squeeze()
     np.testing.assert_array_equal(scores, model.predict(X.head(3)))
+
+
+@pytest.mark.large
+def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(xgb_model, model_path):
+    """
+    This test verifies that XGBoost models saved in older versions of MLflow are loaded
+    successfully by ``mlflow.pyfunc.load_model``. These older models specify a pyfunc ``data``
+    field referring directly to an XGBoost model file. Newer models additionally record
+    ``model_class`` in the XGBoost flavor configuration.
+    """
+    model = xgb_model.model
+    mlflow.xgboost.save_model(xgb_model=model, path=model_path)
+
+    model_conf_path = os.path.join(model_path, "MLmodel")
+    model_conf = Model.load(model_conf_path)
+    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
+    xgboost_conf = model_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME)
+    assert xgboost_conf is not None
+    assert "model_class" in xgboost_conf
+    assert "data" in xgboost_conf
+    assert pyfunc_conf is not None
+    assert "model_class" not in pyfunc_conf
+    assert pyfunc.DATA in pyfunc_conf
+
+    # Rewrite the flavor configuration as an older MLflow would have (no `model_class` field)
+    model_conf.flavors["xgboost"] = {"xgb_version": xgb.__version__, "data": "model.xgb"}
+    model_conf.save(model_conf_path)
+    model_conf = Model.load(model_conf_path)
+    xgboost_conf = model_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME)
+    assert "data" in xgboost_conf
+    assert xgboost_conf["data"] == "model.xgb"
+
+    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
+    assert isinstance(reloaded_pyfunc._model_impl.xgb_model, xgb.Booster)
+    reloaded_xgb = mlflow.xgboost.load_model(model_uri=model_path)
+    assert isinstance(reloaded_xgb, xgb.Booster)
+
+    np.testing.assert_array_almost_equal(
+        xgb_model.model.predict(xgb_model.inference_dmatrix),
+        reloaded_pyfunc.predict(xgb_model.inference_dataframe),
+    )
+
+    np.testing.assert_array_almost_equal(
+        reloaded_xgb.predict(xgb_model.inference_dmatrix),
+        reloaded_pyfunc.predict(xgb_model.inference_dataframe),
+    )