Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Autologging functionality for scikit-learn integration with LightGBM (Part 1) #5130

Merged
merged 7 commits into from Dec 24, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 24 additions & 8 deletions mlflow/lightgbm.py → mlflow/lightgbm/__init__.py
Expand Up @@ -32,6 +32,7 @@
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.models.signature import ModelSignature
from mlflow.models.utils import ModelInputExample, _save_example
from mlflow.utils import _get_fully_qualified_class_name
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils.environment import (
_mlflow_conda_env,
Expand Down Expand Up @@ -61,6 +62,8 @@
MlflowAutologgingQueueingClient,
)
from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model


FLAVOR_NAME = "lightgbm"

Expand Down Expand Up @@ -143,15 +146,20 @@ def save_model(
_save_example(mlflow_model, input_example, path)

# Save a LightGBM model
lgb_model.save_model(model_data_path)
_save_lgb_model(lgb_model, model_data_path)

pyfunc.add_to_model(
mlflow_model,
loader_module="mlflow.lightgbm",
data=model_data_subpath,
env=_CONDA_ENV_FILE_NAME,
)
mlflow_model.add_flavor(FLAVOR_NAME, lgb_version=lgb.__version__, data=model_data_subpath)
mlflow_model.add_flavor(
FLAVOR_NAME,
lgb_version=lgb.__version__,
data=model_data_subpath,
model_class=_get_fully_qualified_class_name(lgb_model),
)
mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

if conda_env is None:
Expand Down Expand Up @@ -251,9 +259,18 @@ def log_model(


def _load_model(path):
    """
    Load a LightGBM model saved by :py:func:`save_model`.

    :param path: Local filesystem path to
                 the LightGBM model file itself (older MLflow models, whose
                 pyfunc ``data`` field points directly at the file) or
                 the top-level MLflow Model directory (newer MLflow models).
    :return: The deserialized model: a ``lightgbm.Booster``, or a LightGBM
             scikit-learn model when the flavor configuration records a
             scikit-learn ``model_class``.
    """
    # Normalize to the directory that contains the MLmodel configuration,
    # whether we were given the model file or the model directory.
    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)

    # Models saved before ``model_class`` was introduced are always native
    # Boosters, hence the default.
    model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster")
    lgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))

    return _load_lgb_model(model_class, lgb_model_path)


def _load_pyfunc(path):
Expand Down Expand Up @@ -283,12 +300,11 @@ def load_model(model_uri, dst_path=None):
This directory must already exist. If unspecified, a local output
path will be created.

:return: A LightGBM model (an instance of `lightgbm.Booster`_).
    :return: A LightGBM model (an instance of `lightgbm.Booster`_) or a LightGBM
             scikit-learn model, depending on the saved model class specification.
jwyyy marked this conversation as resolved.
Show resolved Hide resolved
"""
local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
lgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.lgb"))
return _load_model(path=lgb_model_file_path)
return _load_model(path=local_model_path)


class _LGBModelWrapper:
Expand Down
110 changes: 110 additions & 0 deletions mlflow/lightgbm/utils.py
@@ -0,0 +1,110 @@
import importlib
import json
import os.path
import warnings
import numpy as np


def _label_encoder_to_json(le):
    """Convert a fitted label encoder's attributes into a JSON-compatible dict.

    numpy arrays are converted to plain lists; every other attribute is
    copied through unchanged.
    """
    return {
        key: value.tolist() if isinstance(value, np.ndarray) else value
        for key, value in le.__dict__.items()
    }


def _label_encoder_from_json(doc):
    """Rebuild a label encoder from a dict produced by ``_label_encoder_to_json``."""
    from lightgbm.compat import _LGBMLabelEncoder

    le = _LGBMLabelEncoder()
    attrs = {}
    for key, value in doc.items():
        if key == "classes_":
            # ``classes_`` was serialized from an ndarray; restore the array
            # (or keep None when the encoder was never fitted).
            le.classes_ = None if value is None else np.array(value)
        else:
            attrs[key] = value
    le.__dict__.update(attrs)
    return le


def _save_lgb_attr(model_dir, fname, attr_dict):
    """Write ``attr_dict`` as JSON to ``<model_dir>/<fname>.json``."""
    target = os.path.join(model_dir, "{}.json".format(fname))
    with open(target, "w") as out:
        json.dump(attr_dict, out)


def _load_lgb_attr(model_dir, fname):
    """Read ``<model_dir>/<fname>.json`` and return the parsed value.

    Returns None when the file cannot be opened (e.g. it does not exist,
    which is the case for models saved without this attribute).
    """
    attr_path = os.path.join(model_dir, "{}.json".format(fname))
    try:
        with open(attr_path) as src:
            return json.load(src)
    except IOError:
        return None


def _save_lgb_model(lgb_model, model_path) -> None:
    """
    Serialize ``lgb_model`` (a native ``lgb.Booster`` or a LightGBM
    scikit-learn model) to ``model_path``.

    For scikit-learn models, the JSON-serializable estimator attributes are
    written to ``scikit-learn.json`` next to the model file and the wrapped
    Booster is then saved natively. The Booster params are always written to
    ``params.json`` so they can be restored at load time.
    """
    import lightgbm as lgb

    model_dir = os.path.dirname(model_path)

    if not isinstance(lgb_model, lgb.Booster):
        # Scikit-learn wrapper: persist its attributes separately, then fall
        # through to saving the underlying Booster.
        meta = {}
        for k, v in lgb_model.__dict__.items():
            if k == "_le":
                # The label encoder is not JSON-serializable as-is.
                meta["_le"] = _label_encoder_to_json(v) if v else None
                continue
            if k == "_Booster":
                # The Booster itself is saved natively below, not in the meta.
                continue
            if k == "_classes" and v is not None:
                meta["_classes"] = v.tolist()
                continue
            if k == "_class_map" and v:
                # Convert numpy integer keys/values to plain ints so the
                # mapping survives json.dumps below.
                py_dict = {}
                for clazz, encoded in v.items():
                    py_dict[int(clazz)] = int(encoded)
                v = py_dict
            try:
                # Probe serializability; attributes that cannot be encoded
                # are skipped with a warning rather than failing the save.
                json.dumps({k: v})
                meta[k] = v
            except TypeError:
                warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning)
        _save_lgb_attr(model_dir, "scikit-learn", meta)
        lgb_model = lgb_model._Booster

    lgb_model.save_model(model_path)
    _save_lgb_attr(model_dir, "params", lgb_model.params)


def _load_lgb_model(lgb_model_class, model_path):
    """
    Load a LightGBM model from ``model_path``.

    :param lgb_model_class: Fully qualified class name of the saved model,
        e.g. ``lightgbm.basic.Booster`` or ``lightgbm.sklearn.LGBMClassifier``.
    :param model_path: Path to the saved LightGBM model file.
    :return: The restored Booster, or a scikit-learn model wrapping it when
        scikit-learn metadata was saved alongside the model file.
    """
    import lightgbm as lgb

    module_name, class_name = lgb_model_class.rsplit(".", maxsplit=1)
    model_dir = os.path.dirname(model_path)
    sk_attr = _load_lgb_attr(model_dir, "scikit-learn")
    bst_params = _load_lgb_attr(model_dir, "params")

    booster = lgb.Booster(model_file=model_path, params=bst_params)

    if sk_attr is None:
        # No scikit-learn metadata: this is a plain native model.
        warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.")
        return booster

    # Instantiate the recorded scikit-learn class and restore its attributes.
    sk_model = getattr(importlib.import_module(module_name), class_name)()
    restored = {}
    for key, value in sk_attr.items():
        if key == "_le":
            sk_model._le = _label_encoder_from_json(value)
        elif key == "_classes":
            sk_model._classes = np.array(value)
        else:
            restored[key] = value
    sk_model.__dict__.update(restored)
    # Delete the attribute after load
    booster.set_attr(scikit_learn=None)
    sk_model._Booster = booster

    return sk_model
78 changes: 77 additions & 1 deletion tests/lightgbm/test_lightgbm_model_export.py
Expand Up @@ -50,6 +50,18 @@ def lgb_model():
return ModelWithData(model=model, inference_dataframe=X)


@pytest.fixture(scope="session")
def lgb_sklearn_model():
    """Session-scoped fixture: an LGBMClassifier fit on iris data."""
    iris = datasets.load_iris()
    # we only take the first two features.
    features = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
    labels = iris.target
    classifier = lgb.LGBMClassifier(n_estimators=10)
    classifier.fit(features, labels)
    return ModelWithData(model=classifier, inference_dataframe=features)


@pytest.fixture
def model_path(tmpdir):
    """Per-test path (under the test's tmpdir) at which to save a model."""
    base_dir = str(tmpdir)
    return os.path.join(base_dir, "model")
Expand All @@ -68,7 +80,7 @@ def test_model_save_load(lgb_model, model_path):

mlflow.lightgbm.save_model(lgb_model=model, path=model_path)
reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path)
reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

np.testing.assert_array_almost_equal(
model.predict(lgb_model.inference_dataframe),
Expand All @@ -81,6 +93,24 @@ def test_model_save_load(lgb_model, model_path):
)


@pytest.mark.large
def test_sklearn_model_save_load(lgb_sklearn_model, model_path):
    """Round-trip a LightGBM scikit-learn model through save_model/load_model."""
    sk_model = lgb_sklearn_model.model
    mlflow.lightgbm.save_model(lgb_model=sk_model, path=model_path)
    reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path)
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)

    X = lgb_sklearn_model.inference_dataframe

    # The natively reloaded model must reproduce the original predictions ...
    np.testing.assert_array_almost_equal(
        sk_model.predict(X),
        reloaded_model.predict(X),
    )

    # ... and the pyfunc flavor must agree with the native reload.
    np.testing.assert_array_almost_equal(
        reloaded_model.predict(X),
        reloaded_pyfunc.predict(X),
    )


def test_signature_and_examples_are_saved_correctly(lgb_model):
model = lgb_model.model
X = lgb_model.inference_dataframe
Expand Down Expand Up @@ -398,3 +428,49 @@ def test_pyfunc_serve_and_score_sklearn(model):
)
scores = pd.read_json(resp.content, orient="records").values.squeeze()
np.testing.assert_array_equal(scores, model.predict(X.head(3)))


@pytest.mark.large
def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(lgb_model, model_path):
    """
    This test verifies that LightGBM models saved in older versions of MLflow are loaded
    successfully by ``mlflow.pyfunc.load_model``. These older models specify a pyfunc ``data``
    field referring directly to a LightGBM model file. Newer models also have the
    ``model_class`` in LightGBM flavor.
    """
    model = lgb_model.model
    mlflow.lightgbm.save_model(lgb_model=model, path=model_path)

    # A freshly saved model records ``model_class`` and ``data`` in the
    # lightgbm flavor, while the pyfunc flavor only carries ``data``.
    model_conf_path = os.path.join(model_path, "MLmodel")
    model_conf = Model.load(model_conf_path)
    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
    assert lgb_conf is not None
    assert "model_class" in lgb_conf
    assert "data" in lgb_conf
    assert pyfunc_conf is not None
    assert "model_class" not in pyfunc_conf
    assert pyfunc.DATA in pyfunc_conf

    # test old MLmodel conf
    # Overwrite the flavor config to mimic a model saved by an older MLflow
    # version: no ``model_class``, ``data`` pointing at the model file.
    model_conf.flavors["lightgbm"] = {"lgb_version": lgb.__version__, "data": "model.lgb"}
    model_conf.save(model_conf_path)
    model_conf = Model.load(model_conf_path)
    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
    assert "data" in lgb_conf
    assert lgb_conf["data"] == "model.lgb"

    # Both loaders must fall back to treating the model as a native Booster
    # when ``model_class`` is absent.
    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
    assert isinstance(reloaded_pyfunc._model_impl.lgb_model, lgb.Booster)
    reloaded_lgb = mlflow.lightgbm.load_model(model_uri=model_path)
    assert isinstance(reloaded_lgb, lgb.Booster)

    # Predictions from the original model, the pyfunc reload, and the native
    # reload must all agree.
    np.testing.assert_array_almost_equal(
        lgb_model.model.predict(lgb_model.inference_dataframe),
        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
    )

    np.testing.assert_array_almost_equal(
        reloaded_lgb.predict(lgb_model.inference_dataframe),
        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
    )