From ce9c5e3261297d20da601f935314c43702f8f096 Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Tue, 30 Nov 2021 10:38:46 -0800
Subject: [PATCH 1/7] init commit

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/{lightgbm.py => lightgbm/__init__.py} |  33 +-
 mlflow/lightgbm/utils.py                     | 106 ++++
 tests/lightgbm/test_lightgbm_autolog.py      | 538 -------------------
 tests/lightgbm/test_lightgbm_model_export.py |  78 ++-
 4 files changed, 208 insertions(+), 547 deletions(-)
 rename mlflow/{lightgbm.py => lightgbm/__init__.py} (95%)
 create mode 100644 mlflow/lightgbm/utils.py
 delete mode 100644 tests/lightgbm/test_lightgbm_autolog.py

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm/__init__.py
similarity index 95%
rename from mlflow/lightgbm.py
rename to mlflow/lightgbm/__init__.py
index 764241f867442..d717c419d0874 100644
--- a/mlflow/lightgbm.py
+++ b/mlflow/lightgbm/__init__.py
@@ -18,6 +18,7 @@
     https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api
 """
 import os
+
 import yaml
 import json
 import tempfile
@@ -32,6 +33,7 @@
 from mlflow.models.model import MLMODEL_FILE_NAME
 from mlflow.models.signature import ModelSignature
 from mlflow.models.utils import ModelInputExample, _save_example
+from mlflow.utils import _get_fully_qualified_class_name
 from mlflow.tracking.artifact_utils import _download_artifact_from_uri
 from mlflow.utils.environment import (
     _mlflow_conda_env,
@@ -61,6 +63,8 @@
     MlflowAutologgingQueueingClient,
 )
 from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
+from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model
+
 
 FLAVOR_NAME = "lightgbm"
 
@@ -143,7 +147,7 @@ def save_model(
         _save_example(mlflow_model, input_example, path)
 
     # Save a LightGBM model
-    lgb_model.save_model(model_data_path)
+    _save_lgb_model(lgb_model, model_data_path)
 
     pyfunc.add_to_model(
         mlflow_model,
@@ -151,7 +155,12 @@ def save_model(
         data=model_data_subpath,
         env=_CONDA_ENV_FILE_NAME,
     )
-    mlflow_model.add_flavor(FLAVOR_NAME, lgb_version=lgb.__version__, data=model_data_subpath)
+    mlflow_model.add_flavor(
+        FLAVOR_NAME,
+        lgb_version=lgb.__version__,
+        data=model_data_subpath,
+        model_class=_get_fully_qualified_class_name(lgb_model),
+    )
     mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
 
     if conda_env is None:
@@ -251,9 +260,18 @@ def log_model(
 
 
 def _load_model(path):
-    import lightgbm as lgb
+    """
+    :param path: Local filesystem path to
+                 the MLflow Model with the ``lightgbm`` flavor (MLflow < x.x.x) or
+                 the top-level MLflow Model directory (MLflow >= x.x.x).
+    """
+    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
+    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)
+
+    model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster")
+    lgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))
 
-    return lgb.Booster(model_file=path)
+    return _load_lgb_model(model_class, lgb_model_path)
 
 
 def _load_pyfunc(path):
@@ -283,12 +301,11 @@ def load_model(model_uri, dst_path=None):
                      This directory must already exist. If unspecified, a local output
                      path will be created.
 
-    :return: A LightGBM model (an instance of `lightgbm.Booster`_).
+    :return: A LightGBM model (an instance of `lightgbm.Booster`_) or a LightGBM scikit-learn
+             model, depending on the saved model class specification.
     """
     local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
-    flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME)
-    lgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.lgb"))
-    return _load_model(path=lgb_model_file_path)
+    return _load_model(path=local_model_path)
 
 
 class _LGBModelWrapper:
diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py
new file mode 100644
index 0000000000000..f2a338c3cac17
--- /dev/null
+++ b/mlflow/lightgbm/utils.py
@@ -0,0 +1,106 @@
+import importlib
+import json
+import os.path
+import warnings
+import numpy as np
+import lightgbm as lgb
+from lightgbm.compat import _LGBMLabelEncoder
+
+
+def _label_encoder_to_json(le):
+    """Return the attributes of ``le`` as a dict, converting NumPy arrays to lists.
+
+    Values of any other type are copied unchanged.
+    """
+    return {
+        name: value.tolist() if isinstance(value, np.ndarray) else value
+        for name, value in le.__dict__.items()
+    }
+
+
+def _label_encoder_from_json(doc):
+    """Rebuild a ``_LGBMLabelEncoder`` from a dict made by ``_label_encoder_to_json``."""
+    le = _LGBMLabelEncoder()
+    for name, value in doc.items():
+        if name == "classes_":
+            # Stored as a plain list in JSON; restore the NumPy array form.
+            le.classes_ = None if value is None else np.array(value)
+        else:
+            le.__dict__[name] = value
+
+    return le
+
+
+def _save_lgb_attr(model_dir, fname, attr_dict):  # writes "<model_dir>/<fname>.json"
+    with open(os.path.join(model_dir, "{}.json".format(fname)), "w") as f:
+        json.dump(attr_dict, f)  # attr_dict must be JSON-serializable
+
+
+def _load_lgb_attr(model_dir, fname):
+    """Return the parsed ``<fname>.json`` from model_dir, or None if an IOError occurs."""
+    try:
+        with open(os.path.join(model_dir, "{}.json".format(fname))) as f:
+            return json.load(f)
+    except IOError:
+        return None
+
+
+def _save_lgb_model(lgb_model, model_path) -> None:
+    """Save the Booster to model_path and JSON metadata (params, sklearn attrs) beside it."""
+    model_dir = os.path.dirname(model_path)
+
+    if not isinstance(lgb_model, lgb.Booster):
+        meta = {}
+        for k, v in lgb_model.__dict__.items():
+            if k == "_le":
+                meta["_le"] = _label_encoder_to_json(v) if v else None
+                continue
+            if k == "_Booster":
+                continue
+            if k == "_classes" and v is not None:
+                meta["_classes"] = v.tolist()
+                continue
+            if k == "_class_map" and v:
+                # Class labels may be non-numeric (e.g. strings), so unwrap NumPy scalars
+                # to plain Python values instead of forcing int(), which would raise.
+                v = {np.asarray(c).item(): int(e) for c, e in v.items()}
+            try:
+                json.dumps({k: v})
+                meta[k] = v
+            except TypeError:
+                warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning)
+        _save_lgb_attr(model_dir, "scikit-learn", meta)
+        lgb_model = lgb_model._Booster
+
+    lgb_model.save_model(model_path)
+    _save_lgb_attr(model_dir, "params", lgb_model.params)
+
+
+def _load_lgb_model(lgb_model_class, model_path):
+    """Return a Booster, or an ``lgb_model_class`` instance if scikit-learn.json exists."""
+    module, cls = lgb_model_class.rsplit(".", maxsplit=1)
+    model_dir = os.path.dirname(model_path)
+    sk_attr = _load_lgb_attr(model_dir, "scikit-learn")
+    bst_params = _load_lgb_attr(model_dir, "params")
+
+    booster = lgb.Booster(model_file=model_path, params=bst_params)
+
+    if sk_attr is None:
+        warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.")
+        return booster
+
+    sk_model = getattr(importlib.import_module(module), cls)()
+    for k, v in sk_attr.items():
+        if k == "_le":
+            # ``_le`` is saved as ``null`` when the estimator's encoder was unset.
+            sk_model._le = _label_encoder_from_json(v) if v is not None else None
+        elif k == "_classes":
+            # Keep ``None`` as ``None``; ``np.array(None)`` would be a 0-d object array.
+            sk_model._classes = np.array(v) if v is not None else None
+        else:
+            sk_model.__dict__[k] = v
+    # Delete the attribute after load
+    booster.set_attr(scikit_learn=None)
+    sk_model._Booster = booster
+
+    return sk_model
diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py
deleted file mode 100644
index 697e0255d0c9c..0000000000000
--- a/tests/lightgbm/test_lightgbm_autolog.py
+++ /dev/null
@@ -1,538 +0,0 @@
-import os
-import json
-import functools
-import pickle
-import pytest
-import yaml
-import numpy as np
-import pandas as pd
-from sklearn import datasets
-import lightgbm as lgb
-import matplotlib as mpl
-from packaging.version import Version
-
-import mlflow
-import mlflow.lightgbm
-from mlflow.lightgbm import _autolog_callback
-from mlflow.models import Model
-from mlflow.models.utils import _read_example
-from mlflow.utils.autologging_utils import picklable_exception_safe_function, BatchMetricsLogger
-from unittest.mock import patch
-
-mpl.use("Agg")
-
-
-def get_latest_run():
-    client = mlflow.tracking.MlflowClient()
-    return client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
-
-
-def get_model_conf(artifact_uri, model_subpath="model"):
-    model_conf_path = os.path.join(artifact_uri, model_subpath, "MLmodel")
-    return Model.load(model_conf_path)
-
-
-@pytest.fixture(scope="session")
-def bst_params():
-    return {
-        "objective": "multiclass",
-        "num_class": 3,
-    }
-
-
-@pytest.fixture(scope="session")
-def train_set():
-    iris = datasets.load_iris()
-    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
-    y = iris.target
-    # set free_raw_data False to use raw data later.
-    return lgb.Dataset(X, y, free_raw_data=False)
-
-
-@pytest.mark.large
-def test_lgb_autolog_ends_auto_created_run(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    lgb.train(bst_params, train_set, num_boost_round=1)
-    assert mlflow.active_run() is None
-
-
-@pytest.mark.large
-def test_lgb_autolog_persists_manually_created_run(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    with mlflow.start_run() as run:
-        lgb.train(bst_params, train_set, num_boost_round=1)
-        assert mlflow.active_run()
-        assert mlflow.active_run().info.run_id == run.info.run_id
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_default_params(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    lgb.train(bst_params, train_set)
-    run = get_latest_run()
-    params = run.data.params
-
-    expected_params = {
-        "num_boost_round": 100,
-        "feature_name": "auto",
-        "categorical_feature": "auto",
-        "verbose_eval": (
-            # The default value of `verbose_eval` in `lightgbm.train` has been changed to 'warn'
-            # in this PR: https://github.com/microsoft/LightGBM/pull/4577
-            "warn"
-            if Version(lgb.__version__) > Version("3.2.1")
-            else True
-        ),
-        "keep_training_booster": False,
-    }
-    expected_params.update(bst_params)
-
-    for key, val in expected_params.items():
-        assert key in params
-        assert params[key] == str(val)
-
-    unlogged_params = [
-        "params",
-        "train_set",
-        "valid_sets",
-        "valid_names",
-        "fobj",
-        "feval",
-        "init_model",
-        "evals_result",
-        "learning_rates",
-        "callbacks",
-    ]
-
-    for param in unlogged_params:
-        assert param not in params
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_specified_params(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    expected_params = {
-        "num_boost_round": 10,
-        "early_stopping_rounds": 5,
-        "verbose_eval": False,
-    }
-    lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params)
-    run = get_latest_run()
-    params = run.data.params
-
-    expected_params.update(bst_params)
-
-    for key, val in expected_params.items():
-        assert key in params
-        assert params[key] == str(val)
-
-    unlogged_params = [
-        "params",
-        "train_set",
-        "valid_sets",
-        "valid_names",
-        "fobj",
-        "feval",
-        "init_model",
-        "evals_result",
-        "learning_rates",
-        "callbacks",
-    ]
-
-    for param in unlogged_params:
-        assert param not in params
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    evals_result = {}
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=[train_set],
-        valid_names=["train"],
-        evals_result=evals_result,
-    )
-    run = get_latest_run()
-    data = run.data
-    client = mlflow.tracking.MlflowClient()
-    metric_key = "train-multi_logloss"
-    metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
-    assert metric_key in data.metrics
-    assert len(metric_history) == 10
-    assert metric_history == evals_result["train"]["multi_logloss"]
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    evals_result = {}
-    # If we use [train_set, train_set] here, LightGBM ignores the first dataset.
-    # To avoid that, create a new Dataset object.
-    valid_sets = [train_set, lgb.Dataset(train_set.data)]
-    valid_names = ["train", "valid"]
-    lgb.train(
-        bst_params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
-    run = get_latest_run()
-    data = run.data
-    client = mlflow.tracking.MlflowClient()
-    for valid_name in valid_names:
-        metric_key = "{}-multi_logloss".format(valid_name)
-        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
-        assert metric_key in data.metrics
-        assert len(metric_history) == 10
-        assert metric_history == evals_result[valid_name]["multi_logloss"]
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    evals_result = {}
-    params = {"metric": ["multi_error", "multi_logloss"]}
-    params.update(bst_params)
-    valid_sets = [train_set]
-    valid_names = ["train"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
-    run = get_latest_run()
-    data = run.data
-    client = mlflow.tracking.MlflowClient()
-    for metric_name in params["metric"]:
-        metric_key = "{}-{}".format(valid_names[0], metric_name)
-        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
-        assert metric_key in data.metrics
-        assert len(metric_history) == 10
-        assert metric_history == evals_result["train"][metric_name]
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    evals_result = {}
-    params = {"metric": ["multi_error", "multi_logloss"]}
-    params.update(bst_params)
-    valid_sets = [train_set, lgb.Dataset(train_set.data)]
-    valid_names = ["train", "valid"]
-    lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
-    run = get_latest_run()
-    data = run.data
-    client = mlflow.tracking.MlflowClient()
-    for valid_name in valid_names:
-        for metric_name in params["metric"]:
-            metric_key = "{}-{}".format(valid_name, metric_name)
-            metric_history = [
-                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
-            ]
-            assert metric_key in data.metrics
-            assert len(metric_history) == 10
-            assert metric_history == evals_result[valid_name][metric_name]
-
-
-@pytest.mark.large
-def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(bst_params, train_set):
-    patched_metrics_data = []
-
-    # Mock patching BatchMetricsLogger.record_metrics()
-    # to ensure that expected metrics are being logged.
-    original = BatchMetricsLogger.record_metrics
-
-    with patch(
-        "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics", autospec=True
-    ) as record_metrics_mock:
-
-        def record_metrics_side_effect(self, metrics, step=None):
-            patched_metrics_data.extend(metrics.items())
-            original(self, metrics, step)
-
-        record_metrics_mock.side_effect = record_metrics_side_effect
-
-        mlflow.lightgbm.autolog()
-        evals_result = {}
-        params = {"metric": ["multi_error", "multi_logloss"]}
-        params.update(bst_params)
-        valid_sets = [train_set, lgb.Dataset(train_set.data)]
-        valid_names = ["train", "valid"]
-        lgb.train(
-            params,
-            train_set,
-            num_boost_round=10,
-            valid_sets=valid_sets,
-            valid_names=valid_names,
-            evals_result=evals_result,
-        )
-
-    run = get_latest_run()
-    original_metrics = run.data.metrics
-    patched_metrics_data = dict(patched_metrics_data)
-    for metric_name in original_metrics:
-        assert metric_name in patched_metrics_data
-        assert original_metrics[metric_name] == patched_metrics_data[metric_name]
-
-    assert "train-multi_logloss" in original_metrics
-    assert "train-multi_logloss" in patched_metrics_data
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    evals_result = {}
-    params = {"metric": ["multi_error", "multi_logloss"]}
-    params.update(bst_params)
-    valid_sets = [train_set, lgb.Dataset(train_set.data)]
-    valid_names = ["train", "valid"]
-    model = lgb.train(
-        params,
-        train_set,
-        num_boost_round=10,
-        early_stopping_rounds=5,
-        valid_sets=valid_sets,
-        valid_names=valid_names,
-        evals_result=evals_result,
-    )
-    run = get_latest_run()
-    data = run.data
-    client = mlflow.tracking.MlflowClient()
-    assert "best_iteration" in data.metrics
-    assert int(data.metrics["best_iteration"]) == model.best_iteration
-    assert "stopped_iteration" in data.metrics
-    assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"])
-
-    for valid_name in valid_names:
-        for metric_name in params["metric"]:
-            metric_key = "{}-{}".format(valid_name, metric_name)
-            metric_history = [
-                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
-            ]
-            assert metric_key in data.metrics
-
-            best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1]
-            assert metric_history == evals_result[valid_name][metric_name] + [best_metrics]
-
-
-@pytest.mark.large
-def test_lgb_autolog_logs_feature_importance(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    model = lgb.train(bst_params, train_set, num_boost_round=10)
-    run = get_latest_run()
-    run_id = run.info.run_id
-    artifacts_dir = run.info.artifact_uri.replace("file://", "")
-    client = mlflow.tracking.MlflowClient()
-    artifacts = [x.path for x in client.list_artifacts(run_id)]
-
-    for imp_type in ["split", "gain"]:
-        plot_name = "feature_importance_{}.png".format(imp_type)
-        assert plot_name in artifacts
-
-        json_name = "feature_importance_{}.json".format(imp_type)
-        assert json_name in artifacts
-
-        json_path = os.path.join(artifacts_dir, json_name)
-        with open(json_path, "r") as f:
-            loaded_imp = json.load(f)
-
-        features = model.feature_name()
-        importance = model.feature_importance(importance_type=imp_type)
-        imp = {ft: imp for ft, imp in zip(features, importance.tolist())}
-
-        assert loaded_imp == imp
-
-
-@pytest.mark.large
-def test_no_figure_is_opened_after_logging(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    lgb.train(bst_params, train_set, num_boost_round=10)
-    assert mpl.pyplot.get_fignums() == []
-
-
-@pytest.mark.large
-def test_lgb_autolog_loads_model_from_artifact(bst_params, train_set):
-    mlflow.lightgbm.autolog()
-    model = lgb.train(bst_params, train_set, num_boost_round=10)
-    run = get_latest_run()
-    run_id = run.info.run_id
-
-    loaded_model = mlflow.lightgbm.load_model("runs:/{}/model".format(run_id))
-    np.testing.assert_array_almost_equal(
-        model.predict(train_set.data), loaded_model.predict(train_set.data)
-    )
-
-
-@pytest.mark.large
-def test_lgb_autolog_gets_input_example(bst_params):
-    # we need to check the example input against the initial input given to train function.
-    # we can't use the train_set fixture for this as it defines free_raw_data=False but this
-    # feature should work even if it is True
-    iris = datasets.load_iris()
-    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
-    y = iris.target
-    dataset = lgb.Dataset(X, y, free_raw_data=True)
-
-    mlflow.lightgbm.autolog(log_input_examples=True)
-    lgb.train(bst_params, dataset)
-    run = get_latest_run()
-
-    model_path = os.path.join(run.info.artifact_uri, "model")
-    model_conf = Model.load(os.path.join(model_path, "MLmodel"))
-
-    input_example = _read_example(model_conf, model_path)
-
-    assert input_example.equals(X[:5])
-
-    pyfunc_model = mlflow.pyfunc.load_model(os.path.join(run.info.artifact_uri, "model"))
-
-    # make sure reloading the input_example and predicting on it does not error
-    pyfunc_model.predict(input_example)
-
-
-@pytest.mark.large
-def test_lgb_autolog_infers_model_signature_correctly(bst_params):
-    iris = datasets.load_iris()
-    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
-    y = iris.target
-    dataset = lgb.Dataset(X, y, free_raw_data=True)
-
-    mlflow.lightgbm.autolog(log_model_signatures=True)
-    lgb.train(bst_params, dataset)
-    run = get_latest_run()
-    run_id = run.info.run_id
-    artifacts_dir = run.info.artifact_uri.replace("file://", "")
-    client = mlflow.tracking.MlflowClient()
-    artifacts = [x.path for x in client.list_artifacts(run_id, "model")]
-
-    ml_model_filename = "MLmodel"
-    assert str(os.path.join("model", ml_model_filename)) in artifacts
-    ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename)
-
-    data = None
-    with open(ml_model_path, "r") as f:
-        data = yaml.load(f, Loader=yaml.FullLoader)
-
-    assert data is not None
-    assert "signature" in data
-    signature = data["signature"]
-    assert signature is not None
-
-    assert "inputs" in signature
-    assert json.loads(signature["inputs"]) == [
-        {"name": "sepal length (cm)", "type": "double"},
-        {"name": "sepal width (cm)", "type": "double"},
-    ]
-
-    assert "outputs" in signature
-    assert json.loads(signature["outputs"]) == [
-        {"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 3]}},
-    ]
-
-
-@pytest.mark.large
-def test_lgb_autolog_continues_logging_even_if_signature_inference_fails(tmpdir):
-    tmp_csv = tmpdir.join("data.csv")
-    tmp_csv.write("2,6.4,2.8,5.6,2.2\n")
-    tmp_csv.write("1,5.0,2.3,3.3,1.0\n")
-    tmp_csv.write("2,4.9,2.5,4.5,1.7\n")
-    tmp_csv.write("0,4.9,3.1,1.5,0.1\n")
-    tmp_csv.write("0,5.7,3.8,1.7,0.3\n")
-
-    # signature and input example inference should fail here since the dataset is given
-    #   as a file path
-    dataset = lgb.Dataset(tmp_csv.strpath)
-
-    bst_params = {
-        "objective": "multiclass",
-        "num_class": 3,
-    }
-
-    mlflow.lightgbm.autolog(log_model_signatures=True)
-    lgb.train(bst_params, dataset)
-    run = get_latest_run()
-    run_id = run.info.run_id
-    artifacts_dir = run.info.artifact_uri.replace("file://", "")
-    client = mlflow.tracking.MlflowClient()
-    artifacts = [x.path for x in client.list_artifacts(run_id, "model")]
-
-    ml_model_filename = "MLmodel"
-    assert os.path.join("model", ml_model_filename) in artifacts
-    ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename)
-
-    data = None
-    with open(ml_model_path, "r") as f:
-        data = yaml.load(f, Loader=yaml.FullLoader)
-
-    assert data is not None
-    assert "run_id" in data
-    assert "signature" not in data
-
-
-@pytest.mark.large
-@pytest.mark.parametrize("log_input_examples", [True, False])
-@pytest.mark.parametrize("log_model_signatures", [True, False])
-def test_lgb_autolog_configuration_options(bst_params, log_input_examples, log_model_signatures):
-    iris = datasets.load_iris()
-    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
-    y = iris.target
-
-    with mlflow.start_run() as run:
-        mlflow.lightgbm.autolog(
-            log_input_examples=log_input_examples, log_model_signatures=log_model_signatures
-        )
-        dataset = lgb.Dataset(X, y)
-        lgb.train(bst_params, dataset)
-    model_conf = get_model_conf(run.info.artifact_uri)
-    assert ("saved_input_example_info" in model_conf.to_dict()) == log_input_examples
-    assert ("signature" in model_conf.to_dict()) == log_model_signatures
-
-
-@pytest.mark.large
-@pytest.mark.parametrize("log_models", [True, False])
-def test_lgb_autolog_log_models_configuration(bst_params, log_models):
-    iris = datasets.load_iris()
-    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
-    y = iris.target
-
-    with mlflow.start_run() as run:
-        mlflow.lightgbm.autolog(log_models=log_models)
-        dataset = lgb.Dataset(X, y)
-        lgb.train(bst_params, dataset)
-
-    run_id = run.info.run_id
-    client = mlflow.tracking.MlflowClient()
-    artifacts = [f.path for f in client.list_artifacts(run_id)]
-    assert ("model" in artifacts) == log_models
-
-
-def test_lgb_autolog_does_not_break_dataset_instantiation_with_data_none():
-    """
-    This test verifies that `lightgbm.Dataset(None)` doesn't fail after patching.
-    LightGBM internally calls `lightgbm.Dataset(None)` to create a subset of `Dataset`:
-    https://github.com/microsoft/LightGBM/blob/v3.0.0/python-package/lightgbm/basic.py#L1381
-    """
-    mlflow.lightgbm.autolog()
-    lgb.Dataset(None)
-
-
-def test_callback_func_is_pickable():
-    cb = picklable_exception_safe_function(
-        functools.partial(_autolog_callback, BatchMetricsLogger(run_id="1234"), eval_results={})
-    )
-    pickle.dumps(cb)
diff --git a/tests/lightgbm/test_lightgbm_model_export.py b/tests/lightgbm/test_lightgbm_model_export.py
index 8b973e1701d4f..b08ece0f67710 100644
--- a/tests/lightgbm/test_lightgbm_model_export.py
+++ b/tests/lightgbm/test_lightgbm_model_export.py
@@ -50,6 +50,18 @@ def lgb_model():
     return ModelWithData(model=model, inference_dataframe=X)
 
 
+@pytest.fixture(scope="session")
+def lgb_sklearn_model():
+    """Fit a 10-estimator ``LGBMClassifier`` on the first two iris features."""
+    iris = datasets.load_iris()
+    # Only the first two features are used.
+    features = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    classifier = lgb.LGBMClassifier(n_estimators=10)
+    classifier.fit(features, iris.target)
+
+    return ModelWithData(model=classifier, inference_dataframe=features)
+
+
 @pytest.fixture
 def model_path(tmpdir):
     return os.path.join(str(tmpdir), "model")
@@ -68,7 +80,7 @@ def test_model_save_load(lgb_model, model_path):
 
     mlflow.lightgbm.save_model(lgb_model=model, path=model_path)
     reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path)
-    reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path)
+    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
 
     np.testing.assert_array_almost_equal(
         model.predict(lgb_model.inference_dataframe),
@@ -81,6 +93,24 @@ def test_model_save_load(lgb_model, model_path):
     )
 
 
+@pytest.mark.large
+def test_sklearn_model_save_load(lgb_sklearn_model, model_path):
+    """A saved scikit-learn LightGBM model predicts the same after native and pyfunc reload."""
+    original = lgb_sklearn_model.model
+    data = lgb_sklearn_model.inference_dataframe
+
+    # Round-trip the model through both loading APIs.
+    mlflow.lightgbm.save_model(lgb_model=original, path=model_path)
+    native = mlflow.lightgbm.load_model(model_uri=model_path)
+    generic = pyfunc.load_model(model_uri=model_path)
+
+    # The natively reloaded model must agree with the original ...
+    np.testing.assert_array_almost_equal(original.predict(data), native.predict(data))
+
+    # ... and the pyfunc wrapper must agree with the natively reloaded model.
+    np.testing.assert_array_almost_equal(native.predict(data), generic.predict(data))
+
+
 def test_signature_and_examples_are_saved_correctly(lgb_model):
     model = lgb_model.model
     X = lgb_model.inference_dataframe
@@ -398,3 +428,49 @@ def test_pyfunc_serve_and_score_sklearn(model):
     )
     scores = pd.read_json(resp.content, orient="records").values.squeeze()
     np.testing.assert_array_equal(scores, model.predict(X.head(3)))
+
+
+@pytest.mark.large
+def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(lgb_model, model_path):
+    """
+    This test verifies that LightGBM models saved in older versions of MLflow are loaded as
+    ``lightgbm.Booster`` by ``mlflow.pyfunc.load_model`` and ``mlflow.lightgbm.load_model``.
+    Older models specify a pyfunc ``data`` field referring directly to a LightGBM model file
+    and, unlike newer models, have no ``model_class`` entry in the ``lightgbm`` flavor.
+    """
+    model = lgb_model.model
+    mlflow.lightgbm.save_model(lgb_model=model, path=model_path)
+
+    model_conf_path = os.path.join(model_path, "MLmodel")
+    model_conf = Model.load(model_conf_path)
+    pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME)
+    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
+    assert lgb_conf is not None
+    assert "model_class" in lgb_conf
+    assert "data" in lgb_conf
+    assert pyfunc_conf is not None
+    assert "model_class" not in pyfunc_conf
+    assert pyfunc.DATA in pyfunc_conf
+
+    # Overwrite the lightgbm flavor with an old-style config that has no ``model_class``.
+    model_conf.flavors["lightgbm"] = {"lgb_version": lgb.__version__, "data": "model.lgb"}
+    model_conf.save(model_conf_path)
+    model_conf = Model.load(model_conf_path)
+    lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME)
+    assert "data" in lgb_conf
+    assert lgb_conf["data"] == "model.lgb"
+
+    reloaded_pyfunc = pyfunc.load_model(model_uri=model_path)
+    assert isinstance(reloaded_pyfunc._model_impl.lgb_model, lgb.Booster)
+    reloaded_lgb = mlflow.lightgbm.load_model(model_uri=model_path)
+    assert isinstance(reloaded_lgb, lgb.Booster)
+
+    np.testing.assert_array_almost_equal(
+        lgb_model.model.predict(lgb_model.inference_dataframe),
+        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
+    )
+
+    np.testing.assert_array_almost_equal(
+        reloaded_lgb.predict(lgb_model.inference_dataframe),
+        reloaded_pyfunc.predict(lgb_model.inference_dataframe),
+    )

From af4bcf12239ebdad2f80981d14190f3b689b00ef Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Tue, 30 Nov 2021 10:40:46 -0800
Subject: [PATCH 2/7] restore test

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 tests/lightgbm/test_lightgbm_autolog.py | 538 ++++++++++++++++++++++++
 1 file changed, 538 insertions(+)
 create mode 100644 tests/lightgbm/test_lightgbm_autolog.py

diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py
new file mode 100644
index 0000000000000..697e0255d0c9c
--- /dev/null
+++ b/tests/lightgbm/test_lightgbm_autolog.py
@@ -0,0 +1,538 @@
+import os
+import json
+import functools
+import pickle
+import pytest
+import yaml
+import numpy as np
+import pandas as pd
+from sklearn import datasets
+import lightgbm as lgb
+import matplotlib as mpl
+from packaging.version import Version
+
+import mlflow
+import mlflow.lightgbm
+from mlflow.lightgbm import _autolog_callback
+from mlflow.models import Model
+from mlflow.models.utils import _read_example
+from mlflow.utils.autologging_utils import picklable_exception_safe_function, BatchMetricsLogger
+from unittest.mock import patch
+
+mpl.use("Agg")
+
+
+def get_latest_run():
+    client = mlflow.tracking.MlflowClient()
+    return client.get_run(client.list_run_infos(experiment_id="0")[0].run_id)
+
+
+def get_model_conf(artifact_uri, model_subpath="model"):
+    """Load the ``MLmodel`` file located at ``<artifact_uri>/<model_subpath>/MLmodel``."""
+    return Model.load(os.path.join(artifact_uri, model_subpath, "MLmodel"))
+
+
+@pytest.fixture(scope="session")
+def bst_params():
+    return {
+        "objective": "multiclass",
+        "num_class": 3,
+    }
+
+
+@pytest.fixture(scope="session")
+def train_set():
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    y = iris.target
+    # set free_raw_data False to use raw data later.
+    return lgb.Dataset(X, y, free_raw_data=False)
+
+
+@pytest.mark.large
+def test_lgb_autolog_ends_auto_created_run(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    lgb.train(bst_params, train_set, num_boost_round=1)
+    assert mlflow.active_run() is None
+
+
+@pytest.mark.large
+def test_lgb_autolog_persists_manually_created_run(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    with mlflow.start_run() as run:
+        lgb.train(bst_params, train_set, num_boost_round=1)
+        assert mlflow.active_run()
+        assert mlflow.active_run().info.run_id == run.info.run_id
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_default_params(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    lgb.train(bst_params, train_set)
+    run = get_latest_run()
+    params = run.data.params
+
+    expected_params = {
+        "num_boost_round": 100,
+        "feature_name": "auto",
+        "categorical_feature": "auto",
+        "verbose_eval": (
+            # The default value of `verbose_eval` in `lightgbm.train` has been changed to 'warn'
+            # in this PR: https://github.com/microsoft/LightGBM/pull/4577
+            "warn"
+            if Version(lgb.__version__) > Version("3.2.1")
+            else True
+        ),
+        "keep_training_booster": False,
+    }
+    expected_params.update(bst_params)
+
+    for key, val in expected_params.items():
+        assert key in params
+        assert params[key] == str(val)
+
+    unlogged_params = [
+        "params",
+        "train_set",
+        "valid_sets",
+        "valid_names",
+        "fobj",
+        "feval",
+        "init_model",
+        "evals_result",
+        "learning_rates",
+        "callbacks",
+    ]
+
+    for param in unlogged_params:
+        assert param not in params
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_specified_params(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    expected_params = {
+        "num_boost_round": 10,
+        "early_stopping_rounds": 5,
+        "verbose_eval": False,
+    }
+    lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params)
+    run = get_latest_run()
+    params = run.data.params
+
+    expected_params.update(bst_params)
+
+    for key, val in expected_params.items():
+        assert key in params
+        assert params[key] == str(val)
+
+    unlogged_params = [
+        "params",
+        "train_set",
+        "valid_sets",
+        "valid_names",
+        "fobj",
+        "feval",
+        "init_model",
+        "evals_result",
+        "learning_rates",
+        "callbacks",
+    ]
+
+    for param in unlogged_params:
+        assert param not in params
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    evals_result = {}
+    lgb.train(
+        bst_params,
+        train_set,
+        num_boost_round=10,
+        valid_sets=[train_set],
+        valid_names=["train"],
+        evals_result=evals_result,
+    )
+    run = get_latest_run()
+    data = run.data
+    client = mlflow.tracking.MlflowClient()
+    metric_key = "train-multi_logloss"
+    metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
+    assert metric_key in data.metrics
+    assert len(metric_history) == 10
+    assert metric_history == evals_result["train"]["multi_logloss"]
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    evals_result = {}
+    # If we use [train_set, train_set] here, LightGBM ignores the first dataset.
+    # To avoid that, create a new Dataset object.
+    valid_sets = [train_set, lgb.Dataset(train_set.data)]
+    valid_names = ["train", "valid"]
+    lgb.train(
+        bst_params,
+        train_set,
+        num_boost_round=10,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        evals_result=evals_result,
+    )
+    run = get_latest_run()
+    data = run.data
+    client = mlflow.tracking.MlflowClient()
+    for valid_name in valid_names:
+        metric_key = "{}-multi_logloss".format(valid_name)
+        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
+        assert metric_key in data.metrics
+        assert len(metric_history) == 10
+        assert metric_history == evals_result[valid_name]["multi_logloss"]
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    evals_result = {}
+    params = {"metric": ["multi_error", "multi_logloss"]}
+    params.update(bst_params)
+    valid_sets = [train_set]
+    valid_names = ["train"]
+    lgb.train(
+        params,
+        train_set,
+        num_boost_round=10,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        evals_result=evals_result,
+    )
+    run = get_latest_run()
+    data = run.data
+    client = mlflow.tracking.MlflowClient()
+    for metric_name in params["metric"]:
+        metric_key = "{}-{}".format(valid_names[0], metric_name)
+        metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)]
+        assert metric_key in data.metrics
+        assert len(metric_history) == 10
+        assert metric_history == evals_result["train"][metric_name]
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    evals_result = {}
+    params = {"metric": ["multi_error", "multi_logloss"]}
+    params.update(bst_params)
+    valid_sets = [train_set, lgb.Dataset(train_set.data)]
+    valid_names = ["train", "valid"]
+    lgb.train(
+        params,
+        train_set,
+        num_boost_round=10,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        evals_result=evals_result,
+    )
+    run = get_latest_run()
+    data = run.data
+    client = mlflow.tracking.MlflowClient()
+    for valid_name in valid_names:
+        for metric_name in params["metric"]:
+            metric_key = "{}-{}".format(valid_name, metric_name)
+            metric_history = [
+                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
+            ]
+            assert metric_key in data.metrics
+            assert len(metric_history) == 10
+            assert metric_history == evals_result[valid_name][metric_name]
+
+
+@pytest.mark.large
+def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(bst_params, train_set):
+    patched_metrics_data = []
+
+    # Mock patching BatchMetricsLogger.record_metrics()
+    # to ensure that expected metrics are being logged.
+    original = BatchMetricsLogger.record_metrics
+
+    with patch(
+        "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics", autospec=True
+    ) as record_metrics_mock:
+
+        def record_metrics_side_effect(self, metrics, step=None):
+            patched_metrics_data.extend(metrics.items())
+            original(self, metrics, step)
+
+        record_metrics_mock.side_effect = record_metrics_side_effect
+
+        mlflow.lightgbm.autolog()
+        evals_result = {}
+        params = {"metric": ["multi_error", "multi_logloss"]}
+        params.update(bst_params)
+        valid_sets = [train_set, lgb.Dataset(train_set.data)]
+        valid_names = ["train", "valid"]
+        lgb.train(
+            params,
+            train_set,
+            num_boost_round=10,
+            valid_sets=valid_sets,
+            valid_names=valid_names,
+            evals_result=evals_result,
+        )
+
+    run = get_latest_run()
+    original_metrics = run.data.metrics
+    patched_metrics_data = dict(patched_metrics_data)
+    for metric_name in original_metrics:
+        assert metric_name in patched_metrics_data
+        assert original_metrics[metric_name] == patched_metrics_data[metric_name]
+
+    assert "train-multi_logloss" in original_metrics
+    assert "train-multi_logloss" in patched_metrics_data
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    evals_result = {}
+    params = {"metric": ["multi_error", "multi_logloss"]}
+    params.update(bst_params)
+    valid_sets = [train_set, lgb.Dataset(train_set.data)]
+    valid_names = ["train", "valid"]
+    model = lgb.train(
+        params,
+        train_set,
+        num_boost_round=10,
+        early_stopping_rounds=5,
+        valid_sets=valid_sets,
+        valid_names=valid_names,
+        evals_result=evals_result,
+    )
+    run = get_latest_run()
+    data = run.data
+    client = mlflow.tracking.MlflowClient()
+    assert "best_iteration" in data.metrics
+    assert int(data.metrics["best_iteration"]) == model.best_iteration
+    assert "stopped_iteration" in data.metrics
+    assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"])
+
+    for valid_name in valid_names:
+        for metric_name in params["metric"]:
+            metric_key = "{}-{}".format(valid_name, metric_name)
+            metric_history = [
+                x.value for x in client.get_metric_history(run.info.run_id, metric_key)
+            ]
+            assert metric_key in data.metrics
+
+            best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1]
+            assert metric_history == evals_result[valid_name][metric_name] + [best_metrics]
+
+
+@pytest.mark.large
+def test_lgb_autolog_logs_feature_importance(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    model = lgb.train(bst_params, train_set, num_boost_round=10)
+    run = get_latest_run()
+    run_id = run.info.run_id
+    artifacts_dir = run.info.artifact_uri.replace("file://", "")
+    client = mlflow.tracking.MlflowClient()
+    artifacts = [x.path for x in client.list_artifacts(run_id)]
+    # Every importance type must be logged both as a plot and as a JSON file.
+    for imp_type in ["split", "gain"]:
+        plot_name = "feature_importance_{}.png".format(imp_type)
+        assert plot_name in artifacts
+
+        json_name = "feature_importance_{}.json".format(imp_type)
+        assert json_name in artifacts
+
+        json_path = os.path.join(artifacts_dir, json_name)
+        with open(json_path, "r") as f:
+            loaded_imp = json.load(f)
+        # The logged JSON must map each feature name to the importance the model reports.
+        features = model.feature_name()
+        importance = model.feature_importance(importance_type=imp_type)
+        expected_imp = dict(zip(features, importance.tolist()))
+
+        assert loaded_imp == expected_imp
+
+
+@pytest.mark.large
+def test_no_figure_is_opened_after_logging(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    lgb.train(bst_params, train_set, num_boost_round=10)
+    assert mpl.pyplot.get_fignums() == []
+
+
+@pytest.mark.large
+def test_lgb_autolog_loads_model_from_artifact(bst_params, train_set):
+    mlflow.lightgbm.autolog()
+    model = lgb.train(bst_params, train_set, num_boost_round=10)
+    run = get_latest_run()
+    run_id = run.info.run_id
+
+    loaded_model = mlflow.lightgbm.load_model("runs:/{}/model".format(run_id))
+    np.testing.assert_array_almost_equal(
+        model.predict(train_set.data), loaded_model.predict(train_set.data)
+    )
+
+
+@pytest.mark.large
+def test_lgb_autolog_gets_input_example(bst_params):
+    # we need to check the example input against the initial input given to train function.
+    # we can't use the train_set fixture for this as it defines free_raw_data=False but this
+    # feature should work even if it is True
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    y = iris.target
+    dataset = lgb.Dataset(X, y, free_raw_data=True)
+
+    mlflow.lightgbm.autolog(log_input_examples=True)
+    lgb.train(bst_params, dataset)
+    run = get_latest_run()
+
+    model_path = os.path.join(run.info.artifact_uri, "model")
+    model_conf = Model.load(os.path.join(model_path, "MLmodel"))
+
+    input_example = _read_example(model_conf, model_path)
+
+    assert input_example.equals(X[:5])
+
+    pyfunc_model = mlflow.pyfunc.load_model(os.path.join(run.info.artifact_uri, "model"))
+
+    # make sure reloading the input_example and predicting on it does not error
+    pyfunc_model.predict(input_example)
+
+
+@pytest.mark.large
+def test_lgb_autolog_infers_model_signature_correctly(bst_params):
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    y = iris.target
+    dataset = lgb.Dataset(X, y, free_raw_data=True)
+
+    mlflow.lightgbm.autolog(log_model_signatures=True)
+    lgb.train(bst_params, dataset)
+    run = get_latest_run()
+    run_id = run.info.run_id
+    artifacts_dir = run.info.artifact_uri.replace("file://", "")
+    client = mlflow.tracking.MlflowClient()
+    artifacts = [x.path for x in client.list_artifacts(run_id, "model")]
+
+    ml_model_filename = "MLmodel"
+    assert str(os.path.join("model", ml_model_filename)) in artifacts
+    ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename)
+
+    data = None
+    with open(ml_model_path, "r") as f:
+        data = yaml.load(f, Loader=yaml.FullLoader)
+
+    assert data is not None
+    assert "signature" in data
+    signature = data["signature"]
+    assert signature is not None
+
+    assert "inputs" in signature
+    assert json.loads(signature["inputs"]) == [
+        {"name": "sepal length (cm)", "type": "double"},
+        {"name": "sepal width (cm)", "type": "double"},
+    ]
+
+    assert "outputs" in signature
+    assert json.loads(signature["outputs"]) == [
+        {"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 3]}},
+    ]
+
+
+@pytest.mark.large
+def test_lgb_autolog_continues_logging_even_if_signature_inference_fails(tmpdir):
+    tmp_csv = tmpdir.join("data.csv")
+    rows = ["2,6.4,2.8,5.6,2.2", "1,5.0,2.3,3.3,1.0", "2,4.9,2.5,4.5,1.7"]
+    rows += ["0,4.9,3.1,1.5,0.1", "0,5.7,3.8,1.7,0.3"]
+    # `tmp_csv.write` opens the file in "w" mode by default, so one call per row would keep
+    # only the last row; write all rows in a single call instead.
+    tmp_csv.write("".join(row + "\n" for row in rows))
+
+    # signature and input example inference should fail here since the dataset is given
+    #   as a file path
+    dataset = lgb.Dataset(tmp_csv.strpath)
+
+    bst_params = {
+        "objective": "multiclass",
+        "num_class": 3,
+    }
+
+    mlflow.lightgbm.autolog(log_model_signatures=True)
+    lgb.train(bst_params, dataset)
+    run = get_latest_run()
+    run_id = run.info.run_id
+    artifacts_dir = run.info.artifact_uri.replace("file://", "")
+    client = mlflow.tracking.MlflowClient()
+    artifacts = [x.path for x in client.list_artifacts(run_id, "model")]
+
+    ml_model_filename = "MLmodel"
+    assert os.path.join("model", ml_model_filename) in artifacts
+    ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename)
+
+    data = None
+    with open(ml_model_path, "r") as f:
+        data = yaml.load(f, Loader=yaml.FullLoader)
+
+    assert data is not None
+    assert "run_id" in data
+    assert "signature" not in data
+
+
+@pytest.mark.large
+@pytest.mark.parametrize("log_input_examples", [True, False])
+@pytest.mark.parametrize("log_model_signatures", [True, False])
+def test_lgb_autolog_configuration_options(bst_params, log_input_examples, log_model_signatures):
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    y = iris.target
+
+    with mlflow.start_run() as run:
+        mlflow.lightgbm.autolog(
+            log_input_examples=log_input_examples, log_model_signatures=log_model_signatures
+        )
+        dataset = lgb.Dataset(X, y)
+        lgb.train(bst_params, dataset)
+    model_conf = get_model_conf(run.info.artifact_uri)
+    assert ("saved_input_example_info" in model_conf.to_dict()) == log_input_examples
+    assert ("signature" in model_conf.to_dict()) == log_model_signatures
+
+
+@pytest.mark.large
+@pytest.mark.parametrize("log_models", [True, False])
+def test_lgb_autolog_log_models_configuration(bst_params, log_models):
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2])
+    y = iris.target
+
+    with mlflow.start_run() as run:
+        mlflow.lightgbm.autolog(log_models=log_models)
+        dataset = lgb.Dataset(X, y)
+        lgb.train(bst_params, dataset)
+
+    run_id = run.info.run_id
+    client = mlflow.tracking.MlflowClient()
+    artifacts = [f.path for f in client.list_artifacts(run_id)]
+    assert ("model" in artifacts) == log_models
+
+
+def test_lgb_autolog_does_not_break_dataset_instantiation_with_data_none():
+    """
+    This test verifies that `lightgbm.Dataset(None)` doesn't fail after patching.
+    LightGBM internally calls `lightgbm.Dataset(None)` to create a subset of `Dataset`:
+    https://github.com/microsoft/LightGBM/blob/v3.0.0/python-package/lightgbm/basic.py#L1381
+    """
+    mlflow.lightgbm.autolog()
+    lgb.Dataset(None)
+
+
+def test_callback_func_is_pickable():
+    cb = picklable_exception_safe_function(
+        functools.partial(_autolog_callback, BatchMetricsLogger(run_id="1234"), eval_results={})
+    )
+    pickle.dumps(cb)

From c804a8195bbd519286fd5fbaa8909fd18ac0b1e3 Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Tue, 30 Nov 2021 11:51:42 -0800
Subject: [PATCH 3/7] fix doc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/lightgbm/__init__.py | 1 -
 mlflow/lightgbm/utils.py    | 8 ++++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlflow/lightgbm/__init__.py b/mlflow/lightgbm/__init__.py
index d717c419d0874..ba8c361ff2f0b 100644
--- a/mlflow/lightgbm/__init__.py
+++ b/mlflow/lightgbm/__init__.py
@@ -18,7 +18,6 @@
     https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api
 """
 import os
-
 import yaml
 import json
 import tempfile
diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py
index f2a338c3cac17..e4364b193afa9 100644
--- a/mlflow/lightgbm/utils.py
+++ b/mlflow/lightgbm/utils.py
@@ -3,8 +3,6 @@
 import os.path
 import warnings
 import numpy as np
-import lightgbm as lgb
-from lightgbm.compat import _LGBMLabelEncoder
 
 
 def _label_encoder_to_json(le):
@@ -20,6 +18,8 @@ def _label_encoder_to_json(le):
 
 def _label_encoder_from_json(doc):
     """Load the encoder back from a JSON compatible dict"""
+    from lightgbm.compat import _LGBMLabelEncoder
+
     le = _LGBMLabelEncoder()
     meta = {}
     for k, v in doc.items():
@@ -46,6 +46,8 @@ def _load_lgb_attr(model_dir, fname):
 
 
 def _save_lgb_model(lgb_model, model_path) -> None:
+    import lightgbm as lgb
+
     model_dir = os.path.dirname(model_path)
 
     if not isinstance(lgb_model, lgb.Booster):
@@ -77,6 +79,8 @@ def _save_lgb_model(lgb_model, model_path) -> None:
 
 
 def _load_lgb_model(lgb_model_class, model_path):
+    import lightgbm as lgb
+
     module, cls = lgb_model_class.rsplit(".", maxsplit=1)
     model_dir = os.path.dirname(model_path)
     sk_attr = _load_lgb_attr(model_dir, "scikit-learn")

From aa4337b4bcc22f65d9e5b5b54b4b0895c3e0112e Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Wed, 15 Dec 2021 16:04:29 -0800
Subject: [PATCH 4/7] address review: use cloudpickle

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/lightgbm.py | 620 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 620 insertions(+)
 create mode 100644 mlflow/lightgbm.py

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py
new file mode 100644
index 0000000000000..397cde1341b22
--- /dev/null
+++ b/mlflow/lightgbm.py
@@ -0,0 +1,620 @@
+"""
+The ``mlflow.lightgbm`` module provides an API for logging and loading LightGBM models.
+This module exports LightGBM models with the following flavors:
+
+LightGBM (native) format
+    This is the main flavor that can be loaded back into LightGBM.
+:py:mod:`mlflow.pyfunc`
+    Produced for use by generic pyfunc-based deployment tools and batch inference.
+
+.. _lightgbm.Booster:
+    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster
+.. _lightgbm.Booster.save_model:
+    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html
+    #lightgbm.Booster.save_model
+.. _lightgbm.train:
+    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html#lightgbm-train
+.. _scikit-learn API:
+    https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api
+"""
+import os
+import yaml
+import json
+import tempfile
+import shutil
+import logging
+import functools
+from copy import deepcopy
+
+import mlflow
+from mlflow import pyfunc
+from mlflow.models import Model, infer_signature
+from mlflow.models.model import MLMODEL_FILE_NAME
+from mlflow.models.signature import ModelSignature
+from mlflow.models.utils import ModelInputExample, _save_example
+from mlflow.tracking.artifact_utils import _download_artifact_from_uri
+from mlflow.utils import _get_fully_qualified_class_name
+from mlflow.utils.environment import (
+    _mlflow_conda_env,
+    _validate_env_arguments,
+    _process_pip_requirements,
+    _process_conda_env,
+    _CONDA_ENV_FILE_NAME,
+    _REQUIREMENTS_FILE_NAME,
+    _CONSTRAINTS_FILE_NAME,
+)
+from mlflow.utils.requirements_utils import _get_pinned_requirement
+from mlflow.utils.file_utils import write_to
+from mlflow.utils.docstring_utils import format_docstring, LOG_MODEL_PARAM_DOCS
+from mlflow.utils.model_utils import _get_flavor_configuration
+from mlflow.exceptions import MlflowException
+from mlflow.utils.arguments_utils import _get_arg_names
+from mlflow.utils.autologging_utils import (
+    autologging_integration,
+    safe_patch,
+    picklable_exception_safe_function,
+    get_mlflow_run_params_for_fn_args,
+    INPUT_EXAMPLE_SAMPLE_ROWS,
+    resolve_input_example_and_signature,
+    InputExampleInfo,
+    ENSURE_AUTOLOGGING_ENABLED_TEXT,
+    batch_metrics_logger,
+    MlflowAutologgingQueueingClient,
+)
+from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
+
+FLAVOR_NAME = "lightgbm"
+
+_logger = logging.getLogger(__name__)
+
+
+def get_default_pip_requirements():
+    """
+    :return: The pinned ``lightgbm`` and ``cloudpickle`` requirements, as a list. These are
+             the minimum contents of the pip environment that :func:`save_model()` and
+             :func:`log_model()` produce for MLflow Models of this flavor.
+    """
+    return [_get_pinned_requirement(package) for package in ("lightgbm", "cloudpickle")]
+
+
+def get_default_conda_env():
+    """
+    :return: The default Conda environment for MLflow Models produced by :func:`save_model()`
+             and :func:`log_model()`, with :func:`get_default_pip_requirements()` as pip deps.
+    """
+    return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements())
+
+
+@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
+def save_model(
+    lgb_model,
+    path,
+    conda_env=None,
+    mlflow_model=None,
+    signature: ModelSignature = None,
+    input_example: ModelInputExample = None,
+    pip_requirements=None,
+    extra_pip_requirements=None,
+):
+    """
+    Save a LightGBM model to a path on the local file system.
+
+    :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) or models that
+                      implement the `scikit-learn API`_ to be saved.
+    :param path: Local path where the model is to be saved.
+    :param conda_env: {{ conda_env }}
+    :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
+
+    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
+                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
+                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
+                      from datasets with valid model input (e.g. the training dataset with target
+                      column omitted) and valid model output (e.g. model predictions generated on
+                      the training dataset), for example:
+
+                      .. code-block:: python
+
+                        from mlflow.models.signature import infer_signature
+                        train = df.drop_column("target_label")
+                        predictions = ... # compute model predictions
+                        signature = infer_signature(train, predictions)
+    :param input_example: Input example provides one or several instances of valid
+                          model input. The example can be used as a hint of what data to feed the
+                          model. The given example will be converted to a Pandas DataFrame and then
+                          serialized to json using the Pandas split-oriented format. Bytes are
+                          base64-encoded.
+    :param pip_requirements: {{ pip_requirements }}
+    :param extra_pip_requirements: {{ extra_pip_requirements }}
+    """
+    import lightgbm as lgb
+
+    _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements)
+
+    path = os.path.abspath(path)
+    if os.path.exists(path):
+        raise MlflowException("Path '{}' already exists".format(path))
+    if isinstance(lgb_model, lgb.Booster):
+        model_data_subpath = "model.lgb"
+    else:
+        model_data_subpath = "model.pkl"
+    model_data_path = os.path.join(path, model_data_subpath)
+    os.makedirs(path)
+    if mlflow_model is None:
+        mlflow_model = Model()
+    if signature is not None:
+        mlflow_model.signature = signature
+    if input_example is not None:
+        _save_example(mlflow_model, input_example, path)
+
+    # Save a LightGBM model
+    _save_model(lgb_model, model_data_path)
+
+    lgb_model_class = _get_fully_qualified_class_name(lgb_model)
+    pyfunc.add_to_model(
+        mlflow_model,
+        loader_module="mlflow.lightgbm",
+        data=model_data_subpath,
+        env=_CONDA_ENV_FILE_NAME,
+    )
+    mlflow_model.add_flavor(
+        FLAVOR_NAME,
+        lgb_version=lgb.__version__,
+        data=model_data_subpath,
+        model_class=lgb_model_class,
+    )
+    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
+
+    if conda_env is None:
+        if pip_requirements is None:
+            default_reqs = get_default_pip_requirements()
+            # To ensure `_load_pyfunc` can successfully load the model during the dependency
+            # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
+            inferred_reqs = mlflow.models.infer_pip_requirements(
+                path,
+                FLAVOR_NAME,
+                fallback=default_reqs,
+            )
+            default_reqs = sorted(set(inferred_reqs).union(default_reqs))
+        else:
+            default_reqs = None
+        conda_env, pip_requirements, pip_constraints = _process_pip_requirements(
+            default_reqs,
+            pip_requirements,
+            extra_pip_requirements,
+        )
+    else:
+        conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env)
+
+    with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f:
+        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)
+
+    # Save `constraints.txt` if necessary
+    if pip_constraints:
+        write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints))
+
+    # Save `requirements.txt`
+    write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements))
+
+
+def _save_model(lgb_model, model_path):
+    """Save Boosters with the native ``save_model()``; pickle anything else with Cloudpickle."""
+    import lightgbm as lgb
+
+    if isinstance(lgb_model, lgb.Booster):
+        lgb_model.save_model(model_path)
+        return
+
+    # Every other model type (e.g. LightGBM scikit-learn models) is serialized here.
+    import cloudpickle
+    with open(model_path, "wb") as pickle_file:
+        cloudpickle.dump(lgb_model, pickle_file)
+
+
+@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
+def log_model(
+    lgb_model,
+    artifact_path,
+    conda_env=None,
+    registered_model_name=None,
+    signature: ModelSignature = None,
+    input_example: ModelInputExample = None,
+    await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
+    pip_requirements=None,
+    extra_pip_requirements=None,
+    **kwargs,
+):
+    """
+    Log a LightGBM model as an MLflow artifact for the current run.
+
+    :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) or models that
+                      implement the `scikit-learn API`_ to be saved.
+    :param artifact_path: Run-relative artifact path.
+    :param conda_env: {{ conda_env }}
+    :param registered_model_name: If given, create a model version under
+                                  ``registered_model_name``, also creating a registered model if one
+                                  with the given name does not exist.
+
+    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
+                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
+                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
+                      from datasets with valid model input (e.g. the training dataset with target
+                      column omitted) and valid model output (e.g. model predictions generated on
+                      the training dataset), for example:
+
+                      .. code-block:: python
+
+                        from mlflow.models.signature import infer_signature
+                        train = df.drop_column("target_label")
+                        predictions = ... # compute model predictions
+                        signature = infer_signature(train, predictions)
+    :param input_example: Input example provides one or several instances of valid
+                          model input. The example can be used as a hint of what data to feed the
+                          model. The given example will be converted to a Pandas DataFrame and then
+                          serialized to json using the Pandas split-oriented format. Bytes are
+                          base64-encoded.
+    :param await_registration_for: Number of seconds to wait for the model version to finish
+                            being created and is in ``READY`` status. By default, the function
+                            waits for five minutes. Specify 0 or None to skip waiting.
+    :param pip_requirements: {{ pip_requirements }}
+    :param extra_pip_requirements: {{ extra_pip_requirements }}
+    :param kwargs: kwargs to pass to `lightgbm.Booster.save_model`_ method.
+    """
+    Model.log(
+        artifact_path=artifact_path,
+        flavor=mlflow.lightgbm,
+        registered_model_name=registered_model_name,
+        lgb_model=lgb_model,
+        conda_env=conda_env,
+        signature=signature,
+        input_example=input_example,
+        await_registration_for=await_registration_for,
+        pip_requirements=pip_requirements,
+        extra_pip_requirements=extra_pip_requirements,
+        **kwargs,
+    )
+
+
+def _load_model(path):
+    """
+    Load a LightGBM model using the ``lightgbm`` flavor configuration of the MLflow Model.
+    :param path: Local filesystem path to
+                    the MLflow Model with the ``lightgbm`` flavor (MLflow < 1.23.0) or
+                    the top-level MLflow Model directory (MLflow >= 1.23.0).
+    """
+
+    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
+    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)
+
+    model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster")
+    lgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))
+
+    if model_class == "lightgbm.basic.Booster":
+        import lightgbm as lgb
+
+        model = lgb.Booster(model_file=lgb_model_path)
+    else:
+        # LightGBM scikit-learn models are unpickled with Cloudpickle: load trusted models only.
+        import cloudpickle
+
+        with open(lgb_model_path, "rb") as f:
+            model = cloudpickle.load(f)
+
+    return model
+
+
+def _load_pyfunc(path):
+    """
+    Load the model wrapped in ``_LGBModelWrapper``. Called by ``pyfunc.load_pyfunc``.
+
+    :param path: Local filesystem path to the MLflow Model with the ``lightgbm`` flavor.
+    """
+    return _LGBModelWrapper(_load_model(path))
+
+
+def load_model(model_uri, dst_path=None):
+    """
+    Load a LightGBM model from a local file or a run.
+
+    :param model_uri: The location, in URI format, of the MLflow model. For example:
+
+                      - ``/Users/me/path/to/local/model``
+                      - ``relative/path/to/local/model``
+                      - ``s3://my_bucket/path/to/model``
+                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
+
+                      For more information about supported URI schemes, see
+                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
+                      artifact-locations>`_.
+    :param dst_path: The local filesystem path to which to download the model artifact.
+                     This directory must already exist. If unspecified, a local output
+                     path will be created.
+
+    :return: A LightGBM model (a `lightgbm.Booster`_ or a LightGBM scikit-learn model).
+    """
+    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
+    return _load_model(path=local_model_path)
+
+
+class _LGBModelWrapper:  # returned by ``_load_pyfunc``; exposes the model via ``predict``
+    def __init__(self, lgb_model):
+        self.lgb_model = lgb_model
+
+    def predict(self, dataframe):
+        return self.lgb_model.predict(dataframe)  # delegates directly to the wrapped model
+
+
+def _autolog_callback(env, metrics_logger, eval_results):
+    res = {}
+    for data_name, eval_name, value, _ in env.evaluation_result_list:
+        key = data_name + "-" + eval_name  # metric key is "<data_name>-<eval_name>"
+        res[key] = value
+    metrics_logger.record_metrics(res, env.iteration)
+    eval_results.append(res)  # one dict per call; train() indexes this list afterwards
+
+
+@autologging_integration(FLAVOR_NAME)
+def autolog(
+    log_input_examples=False,
+    log_model_signatures=True,
+    log_models=True,
+    disable=False,
+    exclusive=False,
+    disable_for_unsupported_versions=False,
+    silent=False,
+):  # pylint: disable=unused-argument
+    """
+    Enables (or disables) and configures autologging from LightGBM to MLflow. Logs the following:
+
+    - parameters specified in `lightgbm.train`_.
+    - metrics on each iteration (if ``valid_sets`` specified).
+    - metrics at the best iteration (if ``early_stopping_rounds`` specified).
+    - feature importance (both "split" and "gain") as JSON files and plots.
+    - trained model, including:
+        - an example of valid input.
+        - inferred signature of the inputs and outputs of the model.
+
+    Note that the `scikit-learn API`_ is not supported.
+
+    :param log_input_examples: If ``True``, input examples from training datasets are collected and
+                               logged along with LightGBM model artifacts during training. If
+                               ``False``, input examples are not logged.
+                               Note: Input examples are MLflow model attributes
+                               and are only collected if ``log_models`` is also ``True``.
+    :param log_model_signatures: If ``True``,
+                                 :py:class:`ModelSignatures <mlflow.models.ModelSignature>`
+                                 describing model inputs and outputs are collected and logged along
+                                 with LightGBM model artifacts during training. If ``False``,
+                                 signatures are not logged.
+                                 Note: Model signatures are MLflow model attributes
+                                 and are only collected if ``log_models`` is also ``True``.
+    :param log_models: If ``True``, trained models are logged as MLflow model artifacts.
+                       If ``False``, trained models are not logged.
+                       Input examples and model signatures, which are attributes of MLflow models,
+                       are also omitted when ``log_models`` is ``False``.
+    :param disable: If ``True``, disables the LightGBM autologging integration. If ``False``,
+                    enables the LightGBM autologging integration.
+    :param exclusive: If ``True``, autologged content is not logged to user-created fluent runs.
+                      If ``False``, autologged content is logged to the active fluent run,
+                      which may be user-created.
+    :param disable_for_unsupported_versions: If ``True``, disable autologging for versions of
+                      lightgbm that have not been tested against this version of the MLflow client
+                      or are incompatible.
+    :param silent: If ``True``, suppress all event logs and warnings from MLflow during LightGBM
+                   autologging. If ``False``, show all events and warnings during LightGBM
+                   autologging.
+    """
+    import lightgbm
+    import numpy as np
+
+    # Patching this function so we can get a copy of the data given to Dataset.__init__
+    #   to use as an input example and for inferring the model signature.
+    #   (there is no way to get the data back from a Dataset object once it is consumed by train)
+    # We store it on the Dataset object so the train function is able to read it.
+    def __init__(original, self, *args, **kwargs):
+        data = args[0] if len(args) > 0 else kwargs.get("data")
+
+        if data is not None:
+            try:
+                if isinstance(data, str):
+                    raise Exception(
+                        "cannot gather example input when dataset is loaded from a file."
+                    )
+
+                input_example_info = InputExampleInfo(
+                    input_example=deepcopy(data[:INPUT_EXAMPLE_SAMPLE_ROWS])
+                )
+            except Exception as e:
+                input_example_info = InputExampleInfo(error_msg=str(e))
+
+            setattr(self, "input_example_info", input_example_info)
+
+        original(self, *args, **kwargs)
+
+    def train(original, *args, **kwargs):
+        def record_eval_results(eval_results, metrics_logger):
+            """
+            Create a callback function that records evaluation results.
+            """
+            return picklable_exception_safe_function(
+                functools.partial(
+                    _autolog_callback, metrics_logger=metrics_logger, eval_results=eval_results
+                )
+            )
+
+        def log_feature_importance_plot(features, importance, importance_type):
+            """
+            Log feature importance plot.
+            """
+            import matplotlib.pyplot as plt
+
+            indices = np.argsort(importance)
+            features = np.array(features)[indices]
+            importance = importance[indices]
+            num_features = len(features)
+
+            # If num_features > 10, increase the figure height to prevent the plot
+            # from being too dense.
+            w, h = [6.4, 4.8]  # matplotlib's default figure size
+            h = h + 0.1 * num_features if num_features > 10 else h
+            fig, ax = plt.subplots(figsize=(w, h))
+
+            yloc = np.arange(num_features)
+            ax.barh(yloc, importance, align="center", height=0.5)
+            ax.set_yticks(yloc)
+            ax.set_yticklabels(features)
+            ax.set_xlabel("Importance")
+            ax.set_title("Feature Importance ({})".format(importance_type))
+            fig.tight_layout()
+
+            tmpdir = tempfile.mkdtemp()
+            try:
+                filename = "feature_importance_{}.png".format(importance_type)
+                filepath = os.path.join(tmpdir, filename)
+                fig.savefig(filepath)
+                mlflow.log_artifact(filepath)
+            finally:
+                plt.close(fig)
+                shutil.rmtree(tmpdir)
+
+        autologging_client = MlflowAutologgingQueueingClient()
+
+        # logging booster params separately via mlflow.log_params to extract key/value pairs
+        # and make it easier to compare them across runs.
+        booster_params = args[0] if len(args) > 0 else kwargs["params"]
+        autologging_client.log_params(run_id=mlflow.active_run().info.run_id, params=booster_params)
+
+        unlogged_params = [
+            "params",
+            "train_set",
+            "valid_sets",
+            "valid_names",
+            "fobj",
+            "feval",
+            "init_model",
+            "evals_result",
+            "learning_rates",
+            "callbacks",
+        ]
+
+        params_to_log_for_fn = get_mlflow_run_params_for_fn_args(
+            original, args, kwargs, unlogged_params
+        )
+        autologging_client.log_params(
+            run_id=mlflow.active_run().info.run_id, params=params_to_log_for_fn
+        )
+
+        param_logging_operations = autologging_client.flush(synchronous=False)
+
+        all_arg_names = _get_arg_names(original)
+        num_pos_args = len(args)
+
+        # adding a callback that records evaluation results.
+        eval_results = []
+        callbacks_index = all_arg_names.index("callbacks")
+        run_id = mlflow.active_run().info.run_id
+        with batch_metrics_logger(run_id) as metrics_logger:
+            callback = record_eval_results(eval_results, metrics_logger)
+            if num_pos_args >= callbacks_index + 1:
+                tmp_list = list(args)
+                tmp_list[callbacks_index] = list(tmp_list[callbacks_index] or []) + [callback]
+                args = tuple(tmp_list)
+            elif "callbacks" in kwargs and kwargs["callbacks"] is not None:
+                kwargs["callbacks"] = list(kwargs["callbacks"]) + [callback]
+            else:
+                kwargs["callbacks"] = [callback]
+
+            # training model
+            model = original(*args, **kwargs)
+
+            # If early_stopping_rounds is present and evaluation results were recorded, log the
+            # metrics at the best iteration as extra metrics with the max step + 1.
+            early_stopping_index = all_arg_names.index("early_stopping_rounds")
+            early_stopping = bool(eval_results) and (
+                num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs
+            )
+            if early_stopping:
+                extra_step = len(eval_results)
+                autologging_client.log_metrics(
+                    run_id=mlflow.active_run().info.run_id,
+                    metrics={
+                        "stopped_iteration": extra_step,
+                        # best_iteration is set even if training does not stop early.
+                        "best_iteration": model.best_iteration,
+                    },
+                )
+                # iteration starts from 1 in LightGBM.
+                last_iter_results = eval_results[model.best_iteration - 1]
+                autologging_client.log_metrics(
+                    run_id=mlflow.active_run().info.run_id,
+                    metrics=last_iter_results,
+                    step=extra_step,
+                )
+                early_stopping_logging_operations = autologging_client.flush(synchronous=False)
+
+        # logging feature importance as artifacts.
+        for imp_type in ["split", "gain"]:
+            features = model.feature_name()
+            importance = model.feature_importance(importance_type=imp_type)
+            try:
+                log_feature_importance_plot(features, importance, imp_type)
+            except Exception:
+                _logger.exception(
+                    "Failed to log feature importance plot. LightGBM autologging "
+                    "will ignore the failure and continue. Exception: "
+                )
+
+            imp = dict(zip(features, importance.tolist()))
+            tmpdir = tempfile.mkdtemp()
+            try:
+                filepath = os.path.join(tmpdir, "feature_importance_{}.json".format(imp_type))
+                with open(filepath, "w") as f:
+                    json.dump(imp, f, indent=2)
+                mlflow.log_artifact(filepath)
+            finally:
+                shutil.rmtree(tmpdir)
+
+        # train_set must exist as the original train function already ran successfully
+        train_set = args[1] if len(args) > 1 else kwargs.get("train_set")
+
+        # it is possible that the dataset was constructed before the patched
+        #   constructor was applied, so we cannot assume the input_example_info exists
+        input_example_info = getattr(train_set, "input_example_info", None)
+
+        def get_input_example():
+            if input_example_info is None:
+                raise Exception(ENSURE_AUTOLOGGING_ENABLED_TEXT)
+            if input_example_info.error_msg is not None:
+                raise Exception(input_example_info.error_msg)
+            return input_example_info.input_example
+
+        def infer_model_signature(input_example):
+            model_output = model.predict(input_example)
+            model_signature = infer_signature(input_example, model_output)
+            return model_signature
+
+        # Whether to automatically log the trained model based on boolean flag.
+        if log_models:
+            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
+            input_example, signature = resolve_input_example_and_signature(
+                get_input_example,
+                infer_model_signature,
+                log_input_examples,
+                log_model_signatures,
+                _logger,
+            )
+
+            log_model(
+                model,
+                artifact_path="model",
+                signature=signature,
+                input_example=input_example,
+            )
+
+        param_logging_operations.await_completion()
+        if early_stopping:
+            early_stopping_logging_operations.await_completion()
+
+        return model
+
+    safe_patch(FLAVOR_NAME, lightgbm, "train", train, manage_run=True)
+    safe_patch(FLAVOR_NAME, lightgbm.Dataset, "__init__", __init__)

From 7397fce69538c3d8555900848c093cef081fae6a Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Wed, 15 Dec 2021 16:07:33 -0800
Subject: [PATCH 5/7] remove prev folders

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/lightgbm/__init__.py | 592 ------------------------------------
 mlflow/lightgbm/utils.py    | 110 -------
 2 files changed, 702 deletions(-)
 delete mode 100644 mlflow/lightgbm/__init__.py
 delete mode 100644 mlflow/lightgbm/utils.py

diff --git a/mlflow/lightgbm/__init__.py b/mlflow/lightgbm/__init__.py
deleted file mode 100644
index ba8c361ff2f0b..0000000000000
--- a/mlflow/lightgbm/__init__.py
+++ /dev/null
@@ -1,592 +0,0 @@
-"""
-The ``mlflow.lightgbm`` module provides an API for logging and loading LightGBM models.
-This module exports LightGBM models with the following flavors:
-
-LightGBM (native) format
-    This is the main flavor that can be loaded back into LightGBM.
-:py:mod:`mlflow.pyfunc`
-    Produced for use by generic pyfunc-based deployment tools and batch inference.
-
-.. _lightgbm.Booster:
-    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster
-.. _lightgbm.Booster.save_model:
-    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html
-    #lightgbm.Booster.save_model
-.. _lightgbm.train:
-    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html#lightgbm-train
-.. _scikit-learn API:
-    https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api
-"""
-import os
-import yaml
-import json
-import tempfile
-import shutil
-import logging
-import functools
-from copy import deepcopy
-
-import mlflow
-from mlflow import pyfunc
-from mlflow.models import Model, infer_signature
-from mlflow.models.model import MLMODEL_FILE_NAME
-from mlflow.models.signature import ModelSignature
-from mlflow.models.utils import ModelInputExample, _save_example
-from mlflow.utils import _get_fully_qualified_class_name
-from mlflow.tracking.artifact_utils import _download_artifact_from_uri
-from mlflow.utils.environment import (
-    _mlflow_conda_env,
-    _validate_env_arguments,
-    _process_pip_requirements,
-    _process_conda_env,
-    _CONDA_ENV_FILE_NAME,
-    _REQUIREMENTS_FILE_NAME,
-    _CONSTRAINTS_FILE_NAME,
-)
-from mlflow.utils.requirements_utils import _get_pinned_requirement
-from mlflow.utils.file_utils import write_to
-from mlflow.utils.docstring_utils import format_docstring, LOG_MODEL_PARAM_DOCS
-from mlflow.utils.model_utils import _get_flavor_configuration
-from mlflow.exceptions import MlflowException
-from mlflow.utils.arguments_utils import _get_arg_names
-from mlflow.utils.autologging_utils import (
-    autologging_integration,
-    safe_patch,
-    picklable_exception_safe_function,
-    get_mlflow_run_params_for_fn_args,
-    INPUT_EXAMPLE_SAMPLE_ROWS,
-    resolve_input_example_and_signature,
-    InputExampleInfo,
-    ENSURE_AUTOLOGGING_ENABLED_TEXT,
-    batch_metrics_logger,
-    MlflowAutologgingQueueingClient,
-)
-from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
-from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model
-
-
-FLAVOR_NAME = "lightgbm"
-
-_logger = logging.getLogger(__name__)
-
-
-def get_default_pip_requirements():
-    """
-    :return: A list of default pip requirements for MLflow Models produced by this flavor.
-             Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment
-             that, at minimum, contains these requirements.
-    """
-    return [_get_pinned_requirement("lightgbm")]
-
-
-def get_default_conda_env():
-    """
-    :return: The default Conda environment for MLflow Models produced by calls to
-             :func:`save_model()` and :func:`log_model()`.
-    """
-    return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements())
-
-
-@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
-def save_model(
-    lgb_model,
-    path,
-    conda_env=None,
-    mlflow_model=None,
-    signature: ModelSignature = None,
-    input_example: ModelInputExample = None,
-    pip_requirements=None,
-    extra_pip_requirements=None,
-):
-    """
-    Save a LightGBM model to a path on the local file system.
-
-    :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved.
-                      Note that models that implement the `scikit-learn API`_  are not supported.
-    :param path: Local path where the model is to be saved.
-    :param conda_env: {{ conda_env }}
-    :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
-
-    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
-                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
-                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
-                      from datasets with valid model input (e.g. the training dataset with target
-                      column omitted) and valid model output (e.g. model predictions generated on
-                      the training dataset), for example:
-
-                      .. code-block:: python
-
-                        from mlflow.models.signature import infer_signature
-                        train = df.drop_column("target_label")
-                        predictions = ... # compute model predictions
-                        signature = infer_signature(train, predictions)
-    :param input_example: Input example provides one or several instances of valid
-                          model input. The example can be used as a hint of what data to feed the
-                          model. The given example will be converted to a Pandas DataFrame and then
-                          serialized to json using the Pandas split-oriented format. Bytes are
-                          base64-encoded.
-    :param pip_requirements: {{ pip_requirements }}
-    :param extra_pip_requirements: {{ extra_pip_requirements }}
-    """
-    import lightgbm as lgb
-
-    _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements)
-
-    path = os.path.abspath(path)
-    if os.path.exists(path):
-        raise MlflowException("Path '{}' already exists".format(path))
-    model_data_subpath = "model.lgb"
-    model_data_path = os.path.join(path, model_data_subpath)
-    os.makedirs(path)
-    if mlflow_model is None:
-        mlflow_model = Model()
-    if signature is not None:
-        mlflow_model.signature = signature
-    if input_example is not None:
-        _save_example(mlflow_model, input_example, path)
-
-    # Save a LightGBM model
-    _save_lgb_model(lgb_model, model_data_path)
-
-    pyfunc.add_to_model(
-        mlflow_model,
-        loader_module="mlflow.lightgbm",
-        data=model_data_subpath,
-        env=_CONDA_ENV_FILE_NAME,
-    )
-    mlflow_model.add_flavor(
-        FLAVOR_NAME,
-        lgb_version=lgb.__version__,
-        data=model_data_subpath,
-        model_class=_get_fully_qualified_class_name(lgb_model),
-    )
-    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
-
-    if conda_env is None:
-        if pip_requirements is None:
-            default_reqs = get_default_pip_requirements()
-            # To ensure `_load_pyfunc` can successfully load the model during the dependency
-            # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
-            inferred_reqs = mlflow.models.infer_pip_requirements(
-                path,
-                FLAVOR_NAME,
-                fallback=default_reqs,
-            )
-            default_reqs = sorted(set(inferred_reqs).union(default_reqs))
-        else:
-            default_reqs = None
-        conda_env, pip_requirements, pip_constraints = _process_pip_requirements(
-            default_reqs,
-            pip_requirements,
-            extra_pip_requirements,
-        )
-    else:
-        conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env)
-
-    with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f:
-        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)
-
-    # Save `constraints.txt` if necessary
-    if pip_constraints:
-        write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints))
-
-    # Save `requirements.txt`
-    write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements))
-
-
-@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
-def log_model(
-    lgb_model,
-    artifact_path,
-    conda_env=None,
-    registered_model_name=None,
-    signature: ModelSignature = None,
-    input_example: ModelInputExample = None,
-    await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
-    pip_requirements=None,
-    extra_pip_requirements=None,
-    **kwargs,
-):
-    """
-    Log a LightGBM model as an MLflow artifact for the current run.
-
-    :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved.
-                      Note that models that implement the `scikit-learn API`_  are not supported.
-    :param artifact_path: Run-relative artifact path.
-    :param conda_env: {{ conda_env }}
-    :param registered_model_name: If given, create a model version under
-                                  ``registered_model_name``, also creating a registered model if one
-                                  with the given name does not exist.
-
-    :param signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>`
-                      describes model input and output :py:class:`Schema <mlflow.types.Schema>`.
-                      The model signature can be :py:func:`inferred <mlflow.models.infer_signature>`
-                      from datasets with valid model input (e.g. the training dataset with target
-                      column omitted) and valid model output (e.g. model predictions generated on
-                      the training dataset), for example:
-
-                      .. code-block:: python
-
-                        from mlflow.models.signature import infer_signature
-                        train = df.drop_column("target_label")
-                        predictions = ... # compute model predictions
-                        signature = infer_signature(train, predictions)
-    :param input_example: Input example provides one or several instances of valid
-                          model input. The example can be used as a hint of what data to feed the
-                          model. The given example will be converted to a Pandas DataFrame and then
-                          serialized to json using the Pandas split-oriented format. Bytes are
-                          base64-encoded.
-    :param await_registration_for: Number of seconds to wait for the model version to finish
-                            being created and is in ``READY`` status. By default, the function
-                            waits for five minutes. Specify 0 or None to skip waiting.
-    :param pip_requirements: {{ pip_requirements }}
-    :param extra_pip_requirements: {{ extra_pip_requirements }}
-    :param kwargs: kwargs to pass to `lightgbm.Booster.save_model`_ method.
-    """
-    Model.log(
-        artifact_path=artifact_path,
-        flavor=mlflow.lightgbm,
-        registered_model_name=registered_model_name,
-        lgb_model=lgb_model,
-        conda_env=conda_env,
-        signature=signature,
-        input_example=input_example,
-        await_registration_for=await_registration_for,
-        pip_requirements=pip_requirements,
-        extra_pip_requirements=extra_pip_requirements,
-        **kwargs,
-    )
-
-
-def _load_model(path):
-    """
-    :param path: Local filesystem path to
-                 the MLflow Model with the ``lightgbm`` flavor (MLflow < x.x.x) or
-                 the top-level MLflow Model directory (MLflow >= x.x.x).
-    """
-    model_dir = os.path.dirname(path) if os.path.isfile(path) else path
-    flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME)
-
-    model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster")
-    lgb_model_path = os.path.join(model_dir, flavor_conf.get("data"))
-
-    return _load_lgb_model(model_class, lgb_model_path)
-
-
-def _load_pyfunc(path):
-    """
-    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.
-
-    :param path: Local filesystem path to the MLflow Model with the ``lightgbm`` flavor.
-    """
-    return _LGBModelWrapper(_load_model(path))
-
-
-def load_model(model_uri, dst_path=None):
-    """
-    Load a LightGBM model from a local file or a run.
-
-    :param model_uri: The location, in URI format, of the MLflow model. For example:
-
-                      - ``/Users/me/path/to/local/model``
-                      - ``relative/path/to/local/model``
-                      - ``s3://my_bucket/path/to/model``
-                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
-
-                      For more information about supported URI schemes, see
-                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/tracking.html#
-                      artifact-locations>`_.
-    :param dst_path: The local filesystem path to which to download the model artifact.
-                     This directory must already exist. If unspecified, a local output
-                     path will be created.
-
-    :return: A LightGBM model (an instance of `lightgbm.Booster`_) or LightGBM scikit-learn
-             models, depending on the saved model class specification.
-    """
-    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
-    return _load_model(path=local_model_path)
-
-
-class _LGBModelWrapper:
-    def __init__(self, lgb_model):
-        self.lgb_model = lgb_model
-
-    def predict(self, dataframe):
-        return self.lgb_model.predict(dataframe)
-
-
-def _autolog_callback(env, metrics_logger, eval_results):
-    res = {}
-    for data_name, eval_name, value, _ in env.evaluation_result_list:
-        key = data_name + "-" + eval_name
-        res[key] = value
-    metrics_logger.record_metrics(res, env.iteration)
-    eval_results.append(res)
-
-
-@autologging_integration(FLAVOR_NAME)
-def autolog(
-    log_input_examples=False,
-    log_model_signatures=True,
-    log_models=True,
-    disable=False,
-    exclusive=False,
-    disable_for_unsupported_versions=False,
-    silent=False,
-):  # pylint: disable=unused-argument
-    """
-    Enables (or disables) and configures autologging from LightGBM to MLflow. Logs the following:
-
-    - parameters specified in `lightgbm.train`_.
-    - metrics on each iteration (if ``valid_sets`` specified).
-    - metrics at the best iteration (if ``early_stopping_rounds`` specified).
-    - feature importance (both "split" and "gain") as JSON files and plots.
-    - trained model, including:
-        - an example of valid input.
-        - inferred signature of the inputs and outputs of the model.
-
-    Note that the `scikit-learn API`_ is not supported.
-
-    :param log_input_examples: If ``True``, input examples from training datasets are collected and
-                               logged along with LightGBM model artifacts during training. If
-                               ``False``, input examples are not logged.
-                               Note: Input examples are MLflow model attributes
-                               and are only collected if ``log_models`` is also ``True``.
-    :param log_model_signatures: If ``True``,
-                                 :py:class:`ModelSignatures <mlflow.models.ModelSignature>`
-                                 describing model inputs and outputs are collected and logged along
-                                 with LightGBM model artifacts during training. If ``False``,
-                                 signatures are not logged.
-                                 Note: Model signatures are MLflow model attributes
-                                 and are only collected if ``log_models`` is also ``True``.
-    :param log_models: If ``True``, trained models are logged as MLflow model artifacts.
-                       If ``False``, trained models are not logged.
-                       Input examples and model signatures, which are attributes of MLflow models,
-                       are also omitted when ``log_models`` is ``False``.
-    :param disable: If ``True``, disables the LightGBM autologging integration. If ``False``,
-                    enables the LightGBM autologging integration.
-    :param exclusive: If ``True``, autologged content is not logged to user-created fluent runs.
-                      If ``False``, autologged content is logged to the active fluent run,
-                      which may be user-created.
-    :param disable_for_unsupported_versions: If ``True``, disable autologging for versions of
-                      lightgbm that have not been tested against this version of the MLflow client
-                      or are incompatible.
-    :param silent: If ``True``, suppress all event logs and warnings from MLflow during LightGBM
-                   autologging. If ``False``, show all events and warnings during LightGBM
-                   autologging.
-    """
-    import lightgbm
-    import numpy as np
-
-    # Patching this function so we can get a copy of the data given to Dataset.__init__
-    #   to use as an input example and for inferring the model signature.
-    #   (there is no way to get the data back from a Dataset object once it is consumed by train)
-    # We store it on the Dataset object so the train function is able to read it.
-    def __init__(original, self, *args, **kwargs):
-        data = args[0] if len(args) > 0 else kwargs.get("data")
-
-        if data is not None:
-            try:
-                if isinstance(data, str):
-                    raise Exception(
-                        "cannot gather example input when dataset is loaded from a file."
-                    )
-
-                input_example_info = InputExampleInfo(
-                    input_example=deepcopy(data[:INPUT_EXAMPLE_SAMPLE_ROWS])
-                )
-            except Exception as e:
-                input_example_info = InputExampleInfo(error_msg=str(e))
-
-            setattr(self, "input_example_info", input_example_info)
-
-        original(self, *args, **kwargs)
-
-    def train(original, *args, **kwargs):
-        def record_eval_results(eval_results, metrics_logger):
-            """
-            Create a callback function that records evaluation results.
-            """
-            return picklable_exception_safe_function(
-                functools.partial(
-                    _autolog_callback, metrics_logger=metrics_logger, eval_results=eval_results
-                )
-            )
-
-        def log_feature_importance_plot(features, importance, importance_type):
-            """
-            Log feature importance plot.
-            """
-            import matplotlib.pyplot as plt
-
-            indices = np.argsort(importance)
-            features = np.array(features)[indices]
-            importance = importance[indices]
-            num_features = len(features)
-
-            # If num_features > 10, increase the figure height to prevent the plot
-            # from being too dense.
-            w, h = [6.4, 4.8]  # matplotlib's default figure size
-            h = h + 0.1 * num_features if num_features > 10 else h
-            fig, ax = plt.subplots(figsize=(w, h))
-
-            yloc = np.arange(num_features)
-            ax.barh(yloc, importance, align="center", height=0.5)
-            ax.set_yticks(yloc)
-            ax.set_yticklabels(features)
-            ax.set_xlabel("Importance")
-            ax.set_title("Feature Importance ({})".format(importance_type))
-            fig.tight_layout()
-
-            tmpdir = tempfile.mkdtemp()
-            try:
-                # pylint: disable=undefined-loop-variable
-                filepath = os.path.join(tmpdir, "feature_importance_{}.png".format(imp_type))
-                fig.savefig(filepath)
-                mlflow.log_artifact(filepath)
-            finally:
-                plt.close(fig)
-                shutil.rmtree(tmpdir)
-
-        autologging_client = MlflowAutologgingQueueingClient()
-
-        # logging booster params separately via mlflow.log_params to extract key/value pairs
-        # and make it easier to compare them across runs.
-        booster_params = args[0] if len(args) > 0 else kwargs["params"]
-        autologging_client.log_params(run_id=mlflow.active_run().info.run_id, params=booster_params)
-
-        unlogged_params = [
-            "params",
-            "train_set",
-            "valid_sets",
-            "valid_names",
-            "fobj",
-            "feval",
-            "init_model",
-            "evals_result",
-            "learning_rates",
-            "callbacks",
-        ]
-
-        params_to_log_for_fn = get_mlflow_run_params_for_fn_args(
-            original, args, kwargs, unlogged_params
-        )
-        autologging_client.log_params(
-            run_id=mlflow.active_run().info.run_id, params=params_to_log_for_fn
-        )
-
-        param_logging_operations = autologging_client.flush(synchronous=False)
-
-        all_arg_names = _get_arg_names(original)
-        num_pos_args = len(args)
-
-        # adding a callback that records evaluation results.
-        eval_results = []
-        callbacks_index = all_arg_names.index("callbacks")
-        run_id = mlflow.active_run().info.run_id
-        with batch_metrics_logger(run_id) as metrics_logger:
-            callback = record_eval_results(eval_results, metrics_logger)
-            if num_pos_args >= callbacks_index + 1:
-                tmp_list = list(args)
-                tmp_list[callbacks_index] += [callback]
-                args = tuple(tmp_list)
-            elif "callbacks" in kwargs and kwargs["callbacks"] is not None:
-                kwargs["callbacks"] += [callback]
-            else:
-                kwargs["callbacks"] = [callback]
-
-            # training model
-            model = original(*args, **kwargs)
-
-            # If early_stopping_rounds is present, logging metrics at the best iteration
-            # as extra metrics with the max step + 1.
-            early_stopping_index = all_arg_names.index("early_stopping_rounds")
-            early_stopping = (
-                num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs
-            )
-            if early_stopping:
-                extra_step = len(eval_results)
-                autologging_client.log_metrics(
-                    run_id=mlflow.active_run().info.run_id,
-                    metrics={
-                        "stopped_iteration": extra_step,
-                        # best_iteration is set even if training does not stop early.
-                        "best_iteration": model.best_iteration,
-                    },
-                )
-                # iteration starts from 1 in LightGBM.
-                last_iter_results = eval_results[model.best_iteration - 1]
-                autologging_client.log_metrics(
-                    run_id=mlflow.active_run().info.run_id,
-                    metrics=last_iter_results,
-                    step=extra_step,
-                )
-                early_stopping_logging_operations = autologging_client.flush(synchronous=False)
-
-        # logging feature importance as artifacts.
-        for imp_type in ["split", "gain"]:
-            features = model.feature_name()
-            importance = model.feature_importance(importance_type=imp_type)
-            try:
-                log_feature_importance_plot(features, importance, imp_type)
-            except Exception:
-                _logger.exception(
-                    "Failed to log feature importance plot. LightGBM autologging "
-                    "will ignore the failure and continue. Exception: "
-                )
-
-            imp = {ft: imp for ft, imp in zip(features, importance.tolist())}
-            tmpdir = tempfile.mkdtemp()
-            try:
-                filepath = os.path.join(tmpdir, "feature_importance_{}.json".format(imp_type))
-                with open(filepath, "w") as f:
-                    json.dump(imp, f, indent=2)
-                mlflow.log_artifact(filepath)
-            finally:
-                shutil.rmtree(tmpdir)
-
-        # train_set must exist as the original train function already ran successfully
-        train_set = args[1] if len(args) > 1 else kwargs.get("train_set")
-
-        # it is possible that the dataset was constructed before the patched
-        #   constructor was applied, so we cannot assume the input_example_info exists
-        input_example_info = getattr(train_set, "input_example_info", None)
-
-        def get_input_example():
-            if input_example_info is None:
-                raise Exception(ENSURE_AUTOLOGGING_ENABLED_TEXT)
-            if input_example_info.error_msg is not None:
-                raise Exception(input_example_info.error_msg)
-            return input_example_info.input_example
-
-        def infer_model_signature(input_example):
-            model_output = model.predict(input_example)
-            model_signature = infer_signature(input_example, model_output)
-            return model_signature
-
-        # Whether to automatically log the trained model based on boolean flag.
-        if log_models:
-            # Will only resolve `input_example` and `signature` if `log_models` is `True`.
-            input_example, signature = resolve_input_example_and_signature(
-                get_input_example,
-                infer_model_signature,
-                log_input_examples,
-                log_model_signatures,
-                _logger,
-            )
-
-            log_model(
-                model,
-                artifact_path="model",
-                signature=signature,
-                input_example=input_example,
-            )
-
-        param_logging_operations.await_completion()
-        if early_stopping:
-            early_stopping_logging_operations.await_completion()
-
-        return model
-
-    safe_patch(FLAVOR_NAME, lightgbm, "train", train, manage_run=True)
-    safe_patch(FLAVOR_NAME, lightgbm.Dataset, "__init__", __init__)
diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py
deleted file mode 100644
index e4364b193afa9..0000000000000
--- a/mlflow/lightgbm/utils.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import importlib
-import json
-import os.path
-import warnings
-import numpy as np
-
-
-def _label_encoder_to_json(le):
-    """Returns a JSON compatible dictionary"""
-    meta = {}
-    for k, v in le.__dict__.items():
-        if isinstance(v, np.ndarray):
-            meta[k] = v.tolist()
-        else:
-            meta[k] = v
-    return meta
-
-
-def _label_encoder_from_json(doc):
-    """Load the encoder back from a JSON compatible dict"""
-    from lightgbm.compat import _LGBMLabelEncoder
-
-    le = _LGBMLabelEncoder()
-    meta = {}
-    for k, v in doc.items():
-        if k == "classes_":
-            le.classes_ = np.array(v) if v is not None else None
-            continue
-        meta[k] = v
-    le.__dict__.update(meta)
-    return le
-
-
-def _save_lgb_attr(model_dir, fname, attr_dict):
-    with open(os.path.join(model_dir, "{}.json".format(fname)), "w") as f:
-        json.dump(attr_dict, f)
-
-
-def _load_lgb_attr(model_dir, fname):
-    try:
-        with open(os.path.join(model_dir, "{}.json".format(fname))) as f:
-            attr = json.load(f)
-        return attr
-    except IOError:
-        return None
-
-
-def _save_lgb_model(lgb_model, model_path) -> None:
-    import lightgbm as lgb
-
-    model_dir = os.path.dirname(model_path)
-
-    if not isinstance(lgb_model, lgb.Booster):
-        meta = {}
-        for k, v in lgb_model.__dict__.items():
-            if k == "_le":
-                meta["_le"] = _label_encoder_to_json(v) if v else None
-                continue
-            if k == "_Booster":
-                continue
-            if k == "_classes" and v is not None:
-                meta["_classes"] = v.tolist()
-                continue
-            if k == "_class_map" and v:
-                py_dict = {}
-                for clazz, encoded in v.items():
-                    py_dict[int(clazz)] = int(encoded)
-                v = py_dict
-            try:
-                json.dumps({k: v})
-                meta[k] = v
-            except TypeError:
-                warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning)
-        _save_lgb_attr(model_dir, "scikit-learn", meta)
-        lgb_model = lgb_model._Booster
-
-    lgb_model.save_model(model_path)
-    _save_lgb_attr(model_dir, "params", lgb_model.params)
-
-
-def _load_lgb_model(lgb_model_class, model_path):
-    import lightgbm as lgb
-
-    module, cls = lgb_model_class.rsplit(".", maxsplit=1)
-    model_dir = os.path.dirname(model_path)
-    sk_attr = _load_lgb_attr(model_dir, "scikit-learn")
-    bst_params = _load_lgb_attr(model_dir, "params")
-
-    booster = lgb.Booster(model_file=model_path, params=bst_params)
-
-    if sk_attr is None:
-        warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.")
-        return booster
-
-    sk_model = getattr(importlib.import_module(module), cls)()
-    states = {}
-    for k, v in sk_attr.items():
-        if k == "_le":
-            sk_model._le = _label_encoder_from_json(v)
-            continue
-        if k == "_classes":
-            sk_model._classes = np.array(v)
-            continue
-        states[k] = v
-    sk_model.__dict__.update(states)
-    # Delete the attribute after load
-    booster.set_attr(scikit_learn=None)
-    sk_model._Booster = booster
-
-    return sk_model

From c212f5533ef2a4b03ce474631028f8b8d4a8126a Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Wed, 22 Dec 2021 09:04:54 -0800
Subject: [PATCH 6/7] Address review: make cloudpickle an optional default requirement

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/lightgbm.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py
index 397cde1341b22..f91a82c63efac 100644
--- a/mlflow/lightgbm.py
+++ b/mlflow/lightgbm.py
@@ -68,21 +68,24 @@
 _logger = logging.getLogger(__name__)
 
 
-def get_default_pip_requirements():
+def get_default_pip_requirements(include_cloudpickle=False):
     """
     :return: A list of default pip requirements for MLflow Models produced by this flavor.
              Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment
              that, at minimum, contains these requirements.
     """
-    return [_get_pinned_requirement("lightgbm"), _get_pinned_requirement("cloudpickle")]
+    pip_deps = [_get_pinned_requirement("lightgbm")]
+    if include_cloudpickle:
+        pip_deps.append(_get_pinned_requirement("cloudpickle"))
+    return pip_deps
 
 
-def get_default_conda_env():
+def get_default_conda_env(include_cloudpickle=False):
     """
     :return: The default Conda environment for MLflow Models produced by calls to
              :func:`save_model()` and :func:`log_model()`.
     """
-    return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements())
+    return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements(include_cloudpickle))
 
 
 @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
@@ -133,10 +136,7 @@ def save_model(
     path = os.path.abspath(path)
     if os.path.exists(path):
         raise MlflowException("Path '{}' already exists".format(path))
-    if isinstance(lgb_model, lgb.Booster):
-        model_data_subpath = "model.lgb"
-    else:
-        model_data_subpath = "model.pkl"
+    model_data_subpath = "model.lgb" if isinstance(lgb_model, lgb.Booster) else "model.pkl"
     model_data_path = os.path.join(path, model_data_subpath)
     os.makedirs(path)
     if mlflow_model is None:
@@ -146,8 +146,8 @@ def save_model(
     if input_example is not None:
         _save_example(mlflow_model, input_example, path)
 
-    # Save a LightGBM model
-    _save_model(lgb_model, model_data_path)
+    # Save a LightGBM model and retrieve its model type
+    is_sklearn_model = _save_model(lgb_model, model_data_path)
 
     lgb_model_class = _get_fully_qualified_class_name(lgb_model)
     pyfunc.add_to_model(
@@ -166,7 +166,7 @@ def save_model(
 
     if conda_env is None:
         if pip_requirements is None:
-            default_reqs = get_default_pip_requirements()
+            default_reqs = get_default_pip_requirements(include_cloudpickle=is_sklearn_model)
             # To ensure `_load_pyfunc` can successfully load the model during the dependency
             # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
             inferred_reqs = mlflow.models.infer_pip_requirements(
@@ -197,17 +197,24 @@ def save_model(
 
 
 def _save_model(lgb_model, model_path):
-    # LightGBM Boosters are saved using the built-in method `save_model()`,
-    # whereas LightGBM scikit-learn models are serialized using Cloudpickle.
+    """
+    LightGBM Boosters are saved using the built-in method `save_model()`,
+    whereas LightGBM scikit-learn models are serialized using Cloudpickle.
+
+    :return: A boolean value indicating whether the save model is a scikit-learn
+             model. The returned value will be passed to `get_default_pip_requirements`.
+    """
     import lightgbm as lgb
 
     if isinstance(lgb_model, lgb.Booster):
         lgb_model.save_model(model_path)
+        return False
     else:
         import cloudpickle
 
         with open(model_path, "wb") as out:
             cloudpickle.dump(lgb_model, out)
+        return True
 
 
 @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
@@ -329,7 +336,8 @@ def load_model(model_uri, dst_path=None):
                      This directory must already exist. If unspecified, a local output
                      path will be created.
 
-    :return: A LightGBM model (an instance of `lightgbm.Booster`_).
+    :return: A LightGBM model (an instance of `lightgbm.Booster`_) or a LightGBM scikit-learn
+             model, depending on the saved model class specification.
     """
     local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path)
     return _load_model(path=local_model_path)

From 2b3102905923378185c2088d5a28547db437884d Mon Sep 17 00:00:00 2001
From: Junwen Yao <jwyiao@gmail.com>
Date: Wed, 22 Dec 2021 14:25:34 -0800
Subject: [PATCH 7/7] A better solution: derive include_cloudpickle from the model type

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
---
 mlflow/lightgbm.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py
index f91a82c63efac..b8ab5df94d88d 100644
--- a/mlflow/lightgbm.py
+++ b/mlflow/lightgbm.py
@@ -146,8 +146,8 @@ def save_model(
     if input_example is not None:
         _save_example(mlflow_model, input_example, path)
 
-    # Save a LightGBM model and retrieve its model type
-    is_sklearn_model = _save_model(lgb_model, model_data_path)
+    # Save a LightGBM model
+    _save_model(lgb_model, model_data_path)
 
     lgb_model_class = _get_fully_qualified_class_name(lgb_model)
     pyfunc.add_to_model(
@@ -166,7 +166,9 @@ def save_model(
 
     if conda_env is None:
         if pip_requirements is None:
-            default_reqs = get_default_pip_requirements(include_cloudpickle=is_sklearn_model)
+            default_reqs = get_default_pip_requirements(
+                include_cloudpickle=not isinstance(lgb_model, lgb.Booster)
+            )
             # To ensure `_load_pyfunc` can successfully load the model during the dependency
             # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
             inferred_reqs = mlflow.models.infer_pip_requirements(
@@ -200,21 +202,16 @@ def _save_model(lgb_model, model_path):
     """
     LightGBM Boosters are saved using the built-in method `save_model()`,
     whereas LightGBM scikit-learn models are serialized using Cloudpickle.
-
-    :return: A boolean value indicating whether the save model is a scikit-learn
-             model. The returned value will be passed to `get_default_pip_requirements`.
     """
     import lightgbm as lgb
 
     if isinstance(lgb_model, lgb.Booster):
         lgb_model.save_model(model_path)
-        return False
     else:
         import cloudpickle
 
         with open(model_path, "wb") as out:
             cloudpickle.dump(lgb_model, out)
-        return True
 
 
 @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))