Skip to content

Commit

Permalink
Autologging functionality for scikit-learn integration with XGBoost (…
Browse files Browse the repository at this point in the history
…Part 1) (#4954)

* xgb_sklearn_save_load

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review + fix lint

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review + update doc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review, keep cmt

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review: add back data in pyfunc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* update doc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* update cmt

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* resolve conflict in master

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review: update cmt

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
  • Loading branch information
jwyyy committed Nov 10, 2021
1 parent 2b5b6b9 commit 1fa5433
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 10 deletions.
46 changes: 36 additions & 10 deletions mlflow/xgboost.py
Expand Up @@ -32,6 +32,7 @@
from mlflow.models.signature import ModelSignature
from mlflow.models.utils import _save_example
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils import _get_fully_qualified_class_name
from mlflow.utils.environment import (
_mlflow_conda_env,
_validate_env_arguments,
Expand Down Expand Up @@ -152,14 +153,19 @@ def save_model(

# Save an XGBoost model
xgb_model.save_model(model_data_path)

xgb_model_class = _get_fully_qualified_class_name(xgb_model)
pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.xgboost",
data=model_data_subpath,
env=_CONDA_ENV_FILE_NAME,
)
mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, data=model_data_subpath)
mlflow_model.add_flavor(
FLAVOR_NAME,
xgb_version=xgb.__version__,
data=model_data_subpath,
model_class=xgb_model_class,
)
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

if conda_env is None:
Expand Down Expand Up @@ -255,10 +261,28 @@ def log_model(


def _load_model(path):
    """
    Load Model Implementation.

    :param path: Local filesystem path to
                 the MLflow Model with the ``xgboost`` flavor (MLflow < 1.22.0) or
                 the top-level MLflow Model directory (MLflow >= 1.22.0).
    :return: An instance of the class named by ``model_class`` in the XGBoost flavor
             configuration (``xgboost.core.Booster`` when the field is absent), with the
             serialized model loaded into it.
    """
    import importlib

    # When `path` points at a file, the MLflow Model directory holding the flavor
    # configuration is taken to be that file's parent directory.
    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)

    # XGBoost models saved in MLflow >=1.22.0 have `model_class`
    # in the XGBoost flavor configuration to specify its XGBoost model class.
    # When loading models, we first get the XGBoost model class from
    # its flavor configuration and then create an instance based on that class.
    model_class = flavor_conf.get("model_class", "xgboost.core.Booster")

    # Fall back to the historical default file name so that a flavor configuration
    # without a `data` entry does not make `os.path.join` fail on `None`.
    xgb_model_path = os.path.join(model_dir, flavor_conf.get("data", "model.xgb"))

    # NOTE(review): the class is imported by the name stored in the MLmodel file, so
    # this should only be used on model directories from a trusted source.
    module, cls = model_class.rsplit(".", maxsplit=1)
    model = getattr(importlib.import_module(module), cls)()
    model.load_model(xgb_model_path)
    return model


Expand Down Expand Up @@ -289,12 +313,11 @@ def load_model(model_uri, dst_path=None):
This directory must already exist. If unspecified, a local output
path will be created.
:return: An XGBoost model (an instance of `xgboost.Booster`_)
:return: An XGBoost model. An instance of either `xgboost.Booster`_ or XGBoost scikit-learn
models, depending on the saved model class specification.
"""
local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
xgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.xgb"))
return _load_model(path=xgb_model_file_path)
return _load_model(path=local_model_path)


class _XGBModelWrapper:
Expand All @@ -304,7 +327,10 @@ def __init__(self, xgb_model):
def predict(self, dataframe):
    """
    Run prediction with the wrapped XGBoost model.

    :param dataframe: Input data to score.
    :return: The result of the wrapped model's ``predict`` call.
    """
    import xgboost as xgb

    # A native Booster is given the input wrapped in a DMatrix; any other wrapped
    # model (e.g. an XGBoost scikit-learn estimator) receives the input as-is.
    if isinstance(self.xgb_model, xgb.Booster):
        return self.xgb_model.predict(xgb.DMatrix(dataframe))
    return self.xgb_model.predict(dataframe)


@experimental
Expand Down
74 changes: 74 additions & 0 deletions tests/xgboost/test_xgboost_model_export.py
Expand Up @@ -49,6 +49,16 @@ def xgb_model():
return ModelWithData(model=model, inference_dataframe=X, inference_dmatrix=dtrain)


@pytest.fixture(scope="session")
def xgb_sklearn_model():
    """
    Session-scoped fixture: an ``XGBRegressor`` (10 estimators) fitted on the wine dataset.

    The returned ``ModelWithData`` carries the fitted estimator and the feature dataframe it
    was trained on; no DMatrix is provided for this fixture.
    """
    wine_bunch = datasets.load_wine()
    features = pd.DataFrame(wine_bunch.data, columns=wine_bunch.feature_names)
    targets = pd.Series(wine_bunch.target)

    estimator = xgb.XGBRegressor(n_estimators=10)
    estimator.fit(features, targets)

    return ModelWithData(
        model=estimator,
        inference_dataframe=features,
        inference_dmatrix=None,
    )


@pytest.fixture
def model_path(tmpdir):
    """Return the path of a ``model`` entry inside the per-test temporary directory."""
    tmp_root = str(tmpdir)
    return os.path.join(tmp_root, "model")
Expand Down Expand Up @@ -80,6 +90,24 @@ def test_model_save_load(xgb_model, model_path):
)


@pytest.mark.large
def test_sklearn_model_save_load(xgb_sklearn_model, model_path):
    """
    Verify that an XGBoost scikit-learn model survives a save/load round trip.

    The model is saved with ``mlflow.xgboost.save_model`` and reloaded both through the
    native flavor and through pyfunc; all three must produce matching predictions on the
    fixture's inference dataframe.
    """
    model = xgb_sklearn_model.model
    mlflow.xgboost.save_model(xgb_model=model, path=model_path)
    reloaded_model = mlflow.xgboost.load_model(model_uri=model_path)
    # `pyfunc.load_model` replaces the deprecated `pyfunc.load_pyfunc` and matches the
    # loader used by the other pyfunc round-trip test in this module.
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

    np.testing.assert_array_almost_equal(
        model.predict(xgb_sklearn_model.inference_dataframe),
        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
    )

    np.testing.assert_array_almost_equal(
        reloaded_model.predict(xgb_sklearn_model.inference_dataframe),
        reloaded_pyfunc.predict(xgb_sklearn_model.inference_dataframe),
    )


@pytest.mark.large
def test_signature_and_examples_are_saved_correctly(xgb_model):
model = xgb_model.model
Expand Down Expand Up @@ -452,3 +480,49 @@ def test_pyfunc_serve_and_score_sklearn(model):
)
scores = pd.read_json(resp.content, orient="records").values.squeeze()
np.testing.assert_array_equal(scores, model.predict(X.head(3)))


@pytest.mark.large
def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(xgb_model, model_path):
    """
    Check that XGBoost models written by older MLflow versions still load, both through
    ``mlflow.pyfunc.load_model`` and through ``mlflow.xgboost.load_model``.

    Older models specify a pyfunc ``data`` field referring directly to an XGBoost model file
    and have no ``model_class`` entry in the XGBoost flavor. The test saves a model with the
    current code, rewrites the XGBoost flavor to the old layout, and reloads it.
    """
    mlflow.xgboost.save_model(xgb_model=xgb_model.model, path=model_path)

    mlmodel_file = os.path.join(model_path, "MLmodel")
    saved_conf = Model.load(mlmodel_file)
    pyfunc_flavor = saved_conf.flavors.get(pyfunc.FLAVOR_NAME)
    xgb_flavor = saved_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME)

    # A freshly saved model carries the new `model_class` field in the XGBoost flavor
    # only, while the pyfunc flavor keeps its `data` field.
    assert xgb_flavor is not None
    assert "model_class" in xgb_flavor
    assert "data" in xgb_flavor
    assert pyfunc_flavor is not None
    assert "model_class" not in pyfunc_flavor
    assert pyfunc.DATA in pyfunc_flavor

    # test old MLmodel conf
    saved_conf.flavors["xgboost"] = {"xgb_version": xgb.__version__, "data": "model.xgb"}
    saved_conf.save(mlmodel_file)
    legacy_flavor = Model.load(mlmodel_file).flavors.get(mlflow.xgboost.FLAVOR_NAME)
    assert "data" in legacy_flavor
    assert legacy_flavor["data"] == "model.xgb"

    # Without `model_class`, both loaders must fall back to a native Booster.
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
    assert isinstance(reloaded_pyfunc._model_impl.xgb_model, xgb.Booster)
    reloaded_xgb = mlflow.xgboost.load_model(model_uri=model_path)
    assert isinstance(reloaded_xgb, xgb.Booster)

    # The pyfunc predictions must agree with the original model first, then with the
    # natively reloaded one.
    for native_model in (xgb_model.model, reloaded_xgb):
        np.testing.assert_array_almost_equal(
            native_model.predict(xgb_model.inference_dmatrix),
            reloaded_pyfunc.predict(xgb_model.inference_dataframe),
        )

0 comments on commit 1fa5433

Please sign in to comment.