Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autologging functionality for scikit-learn integration with XGBoost (Part 1) #4954

Merged
merged 13 commits on Nov 10, 2021
57 changes: 42 additions & 15 deletions mlflow/xgboost.py
Expand Up @@ -33,6 +33,7 @@
from mlflow.models.signature import ModelSignature
from mlflow.models.utils import _save_example
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils import _get_fully_qualified_class_name
from mlflow.utils.environment import (
_mlflow_conda_env,
_validate_env_arguments,
Expand Down Expand Up @@ -152,14 +153,14 @@ def save_model(

# Save an XGBoost model
xgb_model.save_model(model_data_path)

pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.xgboost",
xgb_model_class = _get_fully_qualified_class_name(xgb_model)
pyfunc.add_to_model(mlflow_model, loader_module="mlflow.xgboost", env=_CONDA_ENV_FILE_NAME)
mlflow_model.add_flavor(
FLAVOR_NAME,
xgb_version=xgb.__version__,
model_class=xgb_model_class,
data=model_data_subpath,
jwyyy marked this conversation as resolved.
Show resolved Hide resolved
env=_CONDA_ENV_FILE_NAME,
)
mlflow_model.add_flavor(FLAVOR_NAME, xgb_version=xgb.__version__, data=model_data_subpath)
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

if conda_env is None:
Expand Down Expand Up @@ -255,19 +256,43 @@ def log_model(


def _load_model(path):
    """
    Load Model Implementation.

    :param path: Local filesystem path to
                 the MLflow Model with the ``xgboost`` flavor (MLflow < x.x.x) or
                 the top-level MLflow Model directory (MLflow >= x.x.x).
    :return: An instance of the class named by the flavor's ``model_class`` entry
             (``xgboost.core.Booster`` when the entry is absent), with the saved
             model file loaded into it.
    """
    import importlib

    # ``path`` may point at the model file itself (older layout) or at the model
    # directory; either way the flavor configuration is read from the directory.
    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)

    # XGBoost Booster models saved in MLflow (<x.x.x) specify
    # the ``data`` field within both their pyfunc and XGBoost flavor configurations.
    # In contrast, XGBoost models saved in new MLflow (>=x.x.x) do not
    # specify the ``data`` field within their pyfunc flavor configuration.
    # New models also record ``model_class`` in the XGBoost flavor configuration.
    # When loading, we read the class name from the flavor configuration and
    # create an instance of that class before loading the model file into it.
    model_class = flavor_conf.get("model_class", "xgboost.core.Booster")
    # Fall back to the conventional file name so a flavor configuration without
    # ``data`` does not hand ``None`` to ``os.path.join``.
    xgb_model_path = os.path.join(model_dir, flavor_conf.get("data", "model.xgb"))

    module, cls = model_class.rsplit(".", maxsplit=1)
    model = getattr(importlib.import_module(module), cls)()
    model.load_model(xgb_model_path)
    return model


def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to
                 the MLflow Model with the ``xgboost`` flavor (MLflow < x.x.x) or
                 the top-level MLflow Model directory (MLflow >= x.x.x).
    :return: An ``_XGBModelWrapper`` around the model returned by ``_load_model``.
    """
    return _XGBModelWrapper(_load_model(path))


Expand All @@ -286,12 +311,11 @@ def load_model(model_uri):
`Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
artifact-locations>`_.

:return: An XGBoost model (an instance of `xgboost.Booster`_)
:return: An XGBoost model. An instance of either `xgboost.Booster`_ or XGBoost scikit-learn
models, depending on the saved model class specification.
"""
local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
xgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.xgb"))
return _load_model(path=xgb_model_file_path)
return _load_model(path=local_model_path)
jwyyy marked this conversation as resolved.
Show resolved Hide resolved


class _XGBModelWrapper:
Expand All @@ -301,7 +325,10 @@ def __init__(self, xgb_model):
def predict(self, dataframe):
    """
    Run prediction with the wrapped XGBoost model.

    :param dataframe: Input data to score.
    :return: The result of the wrapped model's ``predict`` call.
    """
    import xgboost as xgb

    # A native Booster needs its input wrapped in a DMatrix; any other model
    # class (e.g. the scikit-learn style estimators) receives the data as-is.
    if isinstance(self.xgb_model, xgb.Booster):
        return self.xgb_model.predict(xgb.DMatrix(dataframe))
    return self.xgb_model.predict(dataframe)


@experimental
Expand Down
76 changes: 76 additions & 0 deletions tests/xgboost/test_xgboost_model_export.py
Expand Up @@ -49,6 +49,16 @@ def xgb_model():
return ModelWithData(model=model, inference_dataframe=X, inference_dmatrix=dtrain)


@pytest.fixture(scope="session")
def xgb_sklearn_model():
    """
    Session-scoped fixture: an ``xgb.XGBRegressor`` fitted on the wine dataset.

    The returned ``ModelWithData`` carries the fitted regressor and the feature
    frame used for inference; no DMatrix is provided for this model.
    """
    wine_data = datasets.load_wine()
    features = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
    targets = pd.Series(wine_data.target)

    fitted_regressor = xgb.XGBRegressor(n_estimators=10)
    fitted_regressor.fit(features, targets)

    return ModelWithData(
        model=fitted_regressor,
        inference_dataframe=features,
        inference_dmatrix=None,
    )


@pytest.fixture
def model_path(tmpdir):
    """Return the path ``<tmpdir>/model`` built from the ``tmpdir`` fixture."""
    base_dir = str(tmpdir)
    return os.path.join(base_dir, "model")
Expand Down Expand Up @@ -80,6 +90,24 @@ def test_model_save_load(xgb_model, model_path):
)


@pytest.mark.large
def test_sklearn_model_save_load(xgb_sklearn_model, model_path):
    """
    Save a scikit-learn style XGBoost model, reload it both natively and as a
    pyfunc, and check that all three produce matching predictions.
    """
    original = xgb_sklearn_model.model
    features = xgb_sklearn_model.inference_dataframe

    mlflow.xgboost.save_model(xgb_model=original, path=model_path)
    native_reloaded = mlflow.xgboost.load_model(model_uri=model_path)
    pyfunc_reloaded = pyfunc.load_pyfunc(model_uri=model_path)

    # Original model vs. natively reloaded model.
    expected = original.predict(features)
    np.testing.assert_array_almost_equal(expected, native_reloaded.predict(features))

    # Natively reloaded model vs. pyfunc wrapper.
    np.testing.assert_array_almost_equal(
        native_reloaded.predict(features),
        pyfunc_reloaded.predict(features),
    )


@pytest.mark.large
def test_signature_and_examples_are_saved_correctly(xgb_model):
model = xgb_model.model
Expand Down Expand Up @@ -452,3 +480,51 @@ def test_pyfunc_serve_and_score_sklearn(model):
)
scores = pd.read_json(resp.content, orient="records").values.squeeze()
np.testing.assert_array_equal(scores, model.predict(X.head(3)))


@pytest.mark.large
def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(xgb_model, model_path):
    """
    Verify that xgboost models saved in older versions of MLflow are loaded
    successfully by ``mlflow.pyfunc.load_model``. Older models specify a pyfunc
    ``data`` field that refers directly to an XGBoost model file, whereas newer
    models add ``model_class`` to the XGBoost flavor and omit the ``data`` field
    from the pyfunc flavor.
    """
    booster = xgb_model.model
    mlflow.xgboost.save_model(xgb_model=booster, path=model_path)

    mlmodel_file = os.path.join(model_path, "MLmodel")
    saved_conf = Model.load(mlmodel_file)
    flavors = saved_conf.flavors
    pyfunc_flavor = flavors.get(pyfunc.FLAVOR_NAME)
    xgb_flavor = flavors.get(mlflow.xgboost.FLAVOR_NAME)

    # A freshly saved model uses the new layout.
    assert xgb_flavor is not None
    assert "model_class" in xgb_flavor
    assert "data" in xgb_flavor
    assert pyfunc_flavor is not None
    assert "model_class" not in pyfunc_flavor
    assert pyfunc.DATA not in pyfunc_flavor

    # Rewrite the MLmodel file into the old layout: ``data`` appears in both the
    # pyfunc flavor and the xgboost flavor, and ``model_class`` is absent.
    pyfunc.add_to_model(saved_conf, loader_module="mlflow.xgboost", data="model.xgb")
    saved_conf.flavors["xgboost"] = {"xgb_version": xgb.__version__, "data": "model.xgb"}
    saved_conf.save(mlmodel_file)

    legacy_conf = Model.load(mlmodel_file)
    legacy_xgb_flavor = legacy_conf.flavors.get(mlflow.xgboost.FLAVOR_NAME)
    assert "data" in legacy_xgb_flavor
    assert legacy_xgb_flavor["data"] == "model.xgb"

    # Both loaders must still yield a native Booster for the old layout.
    pyfunc_reloaded = pyfunc.load_model(model_uri=model_path)
    assert isinstance(pyfunc_reloaded._model_impl.xgb_model, xgb.Booster)
    native_reloaded = mlflow.xgboost.load_model(model_uri=model_path)
    assert isinstance(native_reloaded, xgb.Booster)

    dmatrix = xgb_model.inference_dmatrix
    frame = xgb_model.inference_dataframe

    np.testing.assert_array_almost_equal(
        xgb_model.model.predict(dmatrix),
        pyfunc_reloaded.predict(frame),
    )

    np.testing.assert_array_almost_equal(
        native_reloaded.predict(dmatrix),
        pyfunc_reloaded.predict(frame),
    )