mlflow · harupy · Nov 10, 2021 · Oct 28, 2021 · Oct 28, 2021 · Oct 30, 2021
diff --git a/mlflow/xgboost.py b/mlflow/xgboost.py
@@ -152,14 +152,14 @@ def save_model(
 
     # Save an XGBoost model
     xgb_model.save_model(model_data_path)
-
+    xgb_model_class = xgb_model.__class__.__name__
     pyfunc.add_to_model(
         mlflow_model,
         loader_module="mlflow.xgboost",
-        data=model_data_subpath,
+        model_class=xgb_model_class,
         env=_CONDA_ENV_FILE_NAME,
     )
-    mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, data=model_data_subpath)
+    mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, model_class=xgb_model_class)
     mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
 
     if conda_env is None:
@@ -257,8 +257,27 @@ def log_model(
 def _load_model(path):
     import xgboost as xgb
 
-    model = xgb.Booster()
-    model.load_model(os.path.abspath(path))
+    if os.path.isfile(path):
+        # xgboost Booster models saved by older MLflow versions (<x.x.x) specify
+        # the ``data`` field within their flavor configuration.
+        # For these models, the ``path`` parameter of ``_load_pyfunc()``
+        # refers directly to the serialized Booster model file.
+        # In this case, we create a Booster() instance and load the model from that file.
+        model = xgb.Booster()
+        model.load_model(os.path.abspath(path))
+    else:
+        # In contrast, xgboost models saved by newer MLflow versions (>=x.x.x) do not
+        # specify the ``data`` field within their flavor configuration.
+        # Instead, ``model_class`` records the name of the xgboost class that was saved.
+        # For these models, the ``path`` parameter of ``_load_pyfunc()``
+        # refers to the top-level MLflow Model directory.
+        # In this case, we first read the class name from the flavor configuration
+        # (defaulting to ``Booster``) and then create an instance of that class.
+        flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME)
+        model_class = flavor_conf.get("model_class", "Booster")
+        model_path = os.path.join(path, "model.xgb")
+        model = getattr(xgb, model_class)()
+        model.load_model(model_path)
     return model
 
 
@@ -290,7 +309,11 @@ def load_model(model_uri):
     """
     local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
     flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
-    xgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.xgb"))
+    xgb_model_file_path = (
+        os.path.join(local_model_path, flavor_conf["data"])
+        if "data" in flavor_conf
+        else local_model_path
+    )
     return _load_model(path=xgb_model_file_path)
 
 
@@ -301,7 +324,10 @@ def __init__(self, xgb_model):
     def predict(self, dataframe):
         import xgboost as xgb
 
-        return self.xgb_model.predict(xgb.DMatrix(dataframe))
+        if isinstance(self.xgb_model, xgb.Booster):
+            return self.xgb_model.predict(xgb.DMatrix(dataframe))
+        else:
+            return self.xgb_model.predict(dataframe)
 
 
 @experimental

diff --git a/tests/xgboost/test_xgboost_model_export.py b/tests/xgboost/test_xgboost_model_export.py
@@ -49,6 +49,16 @@ def xgb_model():
     return ModelWithData(model=model, inference_dataframe=X, inference_dmatrix=dtrain)
 
 
+@pytest.fixture(scope="session")
+def xgb_sklearn_model():
+    boston = datasets.load_boston()
+    X = pd.DataFrame(boston.data, columns=boston.feature_names)
+    y = pd.Series(boston.target)
+    regressor = xgb.XGBRegressor(n_estimators=10)
+    regressor.fit(X, y)
+    return ModelWithData(model=regressor, inference_dataframe=X, inference_dmatrix=None)
+
+
 @pytest.fixture
 def model_path(tmpdir):
     return os.path.join(str(tmpdir), "model")
@@ -80,6 +90,24 @@ def test_model_save_load(xgb_model, model_path):
     )
 
 
+@pytest.mark.large
+def test_sklearn_model_save_load(xgb_sklearn_model, model_path):
+    model = xgb_sklearn_model.model
+    mlflow.xgboost.save_model(xgb_model=model, path=model_path)
+    reloaded_model = mlflow.xgboost.load_model(model_uri=model_path)
+    reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
+
+    np.testing.assert_array_almost_equal(
+        model.predict(xgb_sklearn_model.inference_dataframe),
+        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
+    )
+
+    np.testing.assert_array_almost_equal(
+        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
+        reloaded_pyfunc.predict(xgb_sklearn_model.inference_dataframe),
+    )
+
+
 @pytest.mark.large
 def test_signature_and_examples_are_saved_correctly(xgb_model):
     model = xgb_model.model
@@ -452,3 +480,50 @@ def test_pyfunc_serve_and_score_sklearn(model):
     )
     scores = pd.read_json(resp.content, orient="records").values.squeeze()
     np.testing.assert_array_equal(scores, model.predict(X.head(3)))
+
+
+@pytest.mark.large
+def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(xgb_model, model_path):
+    """
+    This test verifies that xgboost models saved in older versions of MLflow are loaded
+    successfully by ``mlflow.pyfunc.load_pyfunc`` and ``mlflow.xgboost.load_model``. These older
+    models specify a ``data`` field, in both the pyfunc and the xgboost flavor configuration,
+    that refers directly to a serialized XGBoost Booster file, and carry no ``model_class``
+    field. In contrast, newer models omit the ``data`` field and record ``model_class`` instead.
+    """
+    model = xgb_model.model
+    mlflow.xgboost.save_model(xgb_model=model, path=model_path)
+
+    model_conf_path = os.path.join(model_path, "MLmodel")
+    model_conf = Model.load(model_conf_path)
+    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
+    xgboost_conf = model_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME)
+    assert xgboost_conf is not None
+    assert "model_class" in xgboost_conf
+    assert "data" not in xgboost_conf
+    assert pyfunc_conf is not None
+    assert "model_class" in pyfunc_conf
+    assert pyfunc.DATA not in pyfunc_conf
+
+    # Rewrite both flavor configurations into the older layout and persist the result.
+    # The loaders below are handed only ``model_path``, so an edit that stays in memory
+    # would never reach them and the legacy code path would go untested.
+    pyfunc_conf.pop("model_class")
+    pyfunc_conf[pyfunc.DATA] = "model.xgb"
+    xgboost_conf.pop("model_class")
+    xgboost_conf["data"] = "model.xgb"
+    model_conf.save(model_conf_path)
+
+    reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
+    assert isinstance(reloaded_pyfunc._model_impl.xgb_model, xgb.Booster)
+    reloaded_xgb = mlflow.xgboost.load_model(model_uri=model_path)
+    assert isinstance(reloaded_xgb, xgb.Booster)
+
+    np.testing.assert_array_almost_equal(
+        xgb_model.model.predict(xgb_model.inference_dmatrix),
+        reloaded_pyfunc.predict(xgb_model.inference_dataframe),
+    )
+
+    np.testing.assert_array_almost_equal(
+        reloaded_xgb.predict(xgb_model.inference_dmatrix),
+        reloaded_pyfunc.predict(xgb_model.inference_dataframe),
+    )