Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autologging functionality for scikit-learn integration with LightGBM (Part 1) #5130

Merged
merged 7 commits into from Dec 24, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 24 additions & 8 deletions mlflow/lightgbm.py → mlflow/lightgbm/__init__.py
Expand Up @@ -32,6 +32,7 @@
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.models.signature import ModelSignature
from mlflow.models.utils import ModelInputExample, _save_example
from mlflow.utils import _get_fully_qualified_class_name
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils.environment import (
_mlflow_conda_env,
Expand Down Expand Up @@ -61,6 +62,8 @@
MlflowAutologgingQueueingClient,
)
from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model


FLAVOR_NAME = "lightgbm"

Expand Down Expand Up @@ -143,15 +146,20 @@ def save_model(
_save_example(mlflow_model, input_example, path)

# Save a LightGBM model
lgb_model.save_model(model_data_path)
_save_lgb_model(lgb_model, model_data_path)

pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.lightgbm",
data=model_data_subpath,
env=_CONDA_ENV_FILE_NAME,
)
mlflow_model.add_flavor(FLAVOR_NAME, lgb_version=lgb.__version__, data=model_data_subpath)
mlflow_model.add_flavor(
FLAVOR_NAME,
lgb_version=lgb.__version__,
data=model_data_subpath,
model_class=_get_fully_qualified_class_name(lgb_model),
)
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

if conda_env is None:
Expand Down Expand Up @@ -251,9 +259,18 @@ def log_model(


def _load_model(path):
    """
    Load a LightGBM model saved by :py:func:`save_model`.

    :param path: Local filesystem path to
                 the LightGBM model file itself (older MLflow models, whose
                 pyfunc ``data`` field points directly at the file) or
                 the top-level MLflow Model directory (newer MLflow models).
    :return: The deserialized model: a ``lightgbm.Booster``, or a LightGBM
             scikit-learn model when the flavor configuration records a
             scikit-learn ``model_class``.
    """
    # Normalize to the directory that contains the MLmodel configuration,
    # whether we were given the model file or the model directory.
    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)

    # Models saved before ``model_class`` was introduced are always native
    # Boosters, hence the default.
    model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster")
    lgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))

    return _load_lgb_model(model_class, lgb_model_path)


def _load_pyfunc(path):
Expand Down Expand Up @@ -283,12 +300,11 @@ def load_model(model_uri, dst_path=None):
This directory must already exist. If unspecified, a local output
path will be created.

:return: A LightGBM model (an instance of `lightgbm.Booster`_).
    :return: A LightGBM model (an instance of `lightgbm.Booster`_) or a LightGBM
             scikit-learn model, depending on the saved model class specification.
jwyyy marked this conversation as resolved.
Show resolved Hide resolved
"""
local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
lgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.lgb"))
return _load_model(path=lgb_model_file_path)
return _load_model(path=local_model_path)


class _LGBModelWrapper:
Expand Down
110 changes: 110 additions & 0 deletions mlflow/lightgbm/utils.py
@@ -0,0 +1,110 @@
import importlib
import json
import os.path
import warnings
import numpy as np


def _label_encoder_to_json(le):
    """Convert a fitted label encoder's attributes into a JSON-compatible dict.

    numpy arrays are converted to plain lists; every other attribute is
    copied through unchanged.
    """
    return {
        key: value.tolist() if isinstance(value, np.ndarray) else value
        for key, value in le.__dict__.items()
    }


def _label_encoder_from_json(doc):
    """Rebuild a label encoder from a dict produced by ``_label_encoder_to_json``."""
    from lightgbm.compat import _LGBMLabelEncoder

    le = _LGBMLabelEncoder()
    attrs = {}
    for key, value in doc.items():
        if key == "classes_":
            # ``classes_`` was serialized from an ndarray; restore the array
            # (or keep None when the encoder was never fitted).
            le.classes_ = None if value is None else np.array(value)
        else:
            attrs[key] = value
    le.__dict__.update(attrs)
    return le


def _save_lgb_attr(model_dir, fname, attr_dict):
    """Write ``attr_dict`` as JSON to ``<model_dir>/<fname>.json``."""
    target = os.path.join(model_dir, "{}.json".format(fname))
    with open(target, "w") as out:
        json.dump(attr_dict, out)


def _load_lgb_attr(model_dir, fname):
    """Read ``<model_dir>/<fname>.json`` and return the parsed value.

    Returns None when the file cannot be opened (e.g. it does not exist,
    which is the case for models saved without this attribute).
    """
    attr_path = os.path.join(model_dir, "{}.json".format(fname))
    try:
        with open(attr_path) as src:
            return json.load(src)
    except IOError:
        return None


def _save_lgb_model(lgb_model, model_path) -> None:
    """
    Serialize ``lgb_model`` (a native ``lgb.Booster`` or a LightGBM
    scikit-learn model) to ``model_path``.

    For scikit-learn models, the JSON-serializable estimator attributes are
    written to ``scikit-learn.json`` next to the model file and the wrapped
    Booster is then saved natively. The Booster params are always written to
    ``params.json`` so they can be restored at load time.
    """
    import lightgbm as lgb

    model_dir = os.path.dirname(model_path)

    if not isinstance(lgb_model, lgb.Booster):
        # Scikit-learn wrapper: persist its attributes separately, then fall
        # through to saving the underlying Booster.
        meta = {}
        for k, v in lgb_model.__dict__.items():
            if k == "_le":
                # The label encoder is not JSON-serializable as-is.
                meta["_le"] = _label_encoder_to_json(v) if v else None
                continue
            if k == "_Booster":
                # The Booster itself is saved natively below, not in the meta.
                continue
            if k == "_classes" and v is not None:
                meta["_classes"] = v.tolist()
                continue
            if k == "_class_map" and v:
                # Convert numpy integer keys/values to plain ints so the
                # mapping survives json.dumps below.
                py_dict = {}
                for clazz, encoded in v.items():
                    py_dict[int(clazz)] = int(encoded)
                v = py_dict
            try:
                # Probe serializability; attributes that cannot be encoded
                # are skipped with a warning rather than failing the save.
                json.dumps({k: v})
                meta[k] = v
            except TypeError:
                warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning)
        _save_lgb_attr(model_dir, "scikit-learn", meta)
        lgb_model = lgb_model._Booster

    lgb_model.save_model(model_path)
    _save_lgb_attr(model_dir, "params", lgb_model.params)


def _load_lgb_model(lgb_model_class, model_path):
    """
    Load a LightGBM model from ``model_path``.

    :param lgb_model_class: Fully qualified class name of the saved model,
        e.g. ``lightgbm.basic.Booster`` or ``lightgbm.sklearn.LGBMClassifier``.
    :param model_path: Path to the saved LightGBM model file.
    :return: The restored Booster, or a scikit-learn model wrapping it when
        scikit-learn metadata was saved alongside the model file.
    """
    import lightgbm as lgb

    module_name, class_name = lgb_model_class.rsplit(".", maxsplit=1)
    model_dir = os.path.dirname(model_path)
    sk_attr = _load_lgb_attr(model_dir, "scikit-learn")
    bst_params = _load_lgb_attr(model_dir, "params")

    booster = lgb.Booster(model_file=model_path, params=bst_params)

    if sk_attr is None:
        # No scikit-learn metadata: this is a plain native model.
        warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.")
        return booster

    # Instantiate the recorded scikit-learn class and restore its attributes.
    sk_model = getattr(importlib.import_module(module_name), class_name)()
    restored = {}
    for key, value in sk_attr.items():
        if key == "_le":
            sk_model._le = _label_encoder_from_json(value)
        elif key == "_classes":
            sk_model._classes = np.array(value)
        else:
            restored[key] = value
    sk_model.__dict__.update(restored)
    # Delete the attribute after load
    booster.set_attr(scikit_learn=None)
    sk_model._Booster = booster

    return sk_model
78 changes: 77 additions & 1 deletion tests/lightgbm/test_lightgbm_model_export.py
Expand Up @@ -50,6 +50,18 @@ def lgb_model():
return ModelWithData(model=model, inference_dataframe=X)


@pytest.fixture(scope="session")
def lgb_sklearn_model():
    """Session-scoped fixture: an LGBMClassifier fit on iris data."""
    iris = datasets.load_iris()
    # we only take the first two features.
    features = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    labels = iris.target
    classifier = lgb.LGBMClassifier(n_estimators=10)
    classifier.fit(features, labels)
    return ModelWithData(model=classifier, inference_dataframe=features)


@pytest.fixture
def model_path(tmpdir):
    """Per-test path (under the test's tmpdir) at which to save a model."""
    base_dir = str(tmpdir)
    return os.path.join(base_dir, "model")
Expand All @@ -68,7 +80,7 @@ def test_model_save_load(lgb_model, model_path):

mlflow.lightgbm.save_model(lgb_model=model, path=model_path)
reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path)
reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

np.testing.assert_array_almost_equal(
model.predict(lgb_model.inference_dataframe),
Expand All @@ -81,6 +93,24 @@ def test_model_save_load(lgb_model, model_path):
)


@pytest.mark.large
def test_sklearn_model_save_load(lgb_sklearn_model, model_path):
    """Round-trip a LightGBM scikit-learn model through save_model/load_model."""
    sk_model = lgb_sklearn_model.model
    mlflow.lightgbm.save_model(lgb_model=sk_model, path=model_path)
    reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path)
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

    X = lgb_sklearn_model.inference_dataframe

    # The natively reloaded model must reproduce the original predictions ...
    np.testing.assert_array_almost_equal(
        sk_model.predict(X),
        reloaded_model.predict(X),
    )

    # ... and the pyfunc flavor must agree with the native reload.
    np.testing.assert_array_almost_equal(
        reloaded_model.predict(X),
        reloaded_pyfunc.predict(X),
    )


def test_signature_and_examples_are_saved_correctly(lgb_model):
model = lgb_model.model
X = lgb_model.inference_dataframe
Expand Down Expand Up @@ -398,3 +428,49 @@ def test_pyfunc_serve_and_score_sklearn(model):
)
scores = pd.read_json(resp.content, orient="records").values.squeeze()
np.testing.assert_array_equal(scores, model.predict(X.head(3)))


@pytest.mark.large
def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(lgb_model, model_path):
    """
    This test verifies that LightGBM models saved in older versions of MLflow are loaded
    successfully by ``mlflow.pyfunc.load_model``. These older models specify a pyfunc ``data``
    field referring directly to a LightGBM model file. Newer models also have the
    ``model_class`` in LightGBM flavor.
    """
    model = lgb_model.model
    mlflow.lightgbm.save_model(lgb_model=model, path=model_path)

    # A freshly saved model records ``model_class`` and ``data`` in the
    # lightgbm flavor, while the pyfunc flavor only carries ``data``.
    model_conf_path = os.path.join(model_path, "MLmodel")
    model_conf = Model.load(model_conf_path)
    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
    assert lgb_conf is not None
    assert "model_class" in lgb_conf
    assert "data" in lgb_conf
    assert pyfunc_conf is not None
    assert "model_class" not in pyfunc_conf
    assert pyfunc.DATA in pyfunc_conf

    # test old MLmodel conf
    # Overwrite the flavor config to mimic a model saved by an older MLflow
    # version: no ``model_class``, ``data`` pointing at the model file.
    model_conf.flavors["lightgbm"] = {"lgb_version": lgb.__version__, "data": "model.lgb"}
    model_conf.save(model_conf_path)
    model_conf = Model.load(model_conf_path)
    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
    assert "data" in lgb_conf
    assert lgb_conf["data"] == "model.lgb"

    # Both loaders must fall back to treating the model as a native Booster
    # when ``model_class`` is absent.
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
    assert isinstance(reloaded_pyfunc._model_impl.lgb_model, lgb.Booster)
    reloaded_lgb = mlflow.lightgbm.load_model(model_uri=model_path)
    assert isinstance(reloaded_lgb, lgb.Booster)

    # Predictions from the original model, the pyfunc reload, and the native
    # reload must all agree.
    np.testing.assert_array_almost_equal(
        lgb_model.model.predict(lgb_model.inference_dataframe),
        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
    )

    np.testing.assert_array_almost_equal(
        reloaded_lgb.predict(lgb_model.inference_dataframe),
        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
    )