From ce9c5e3261297d20da601f935314c43702f8f096 Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Tue, 30 Nov 2021 10:38:46 -0800 Subject: [PATCH 1/7] init commit Signed-off-by: Junwen Yao --- mlflow/{lightgbm.py => lightgbm/__init__.py} | 33 +- mlflow/lightgbm/utils.py | 106 ++++ tests/lightgbm/test_lightgbm_autolog.py | 538 ------------------- tests/lightgbm/test_lightgbm_model_export.py | 78 ++- 4 files changed, 208 insertions(+), 547 deletions(-) rename mlflow/{lightgbm.py => lightgbm/__init__.py} (95%) create mode 100644 mlflow/lightgbm/utils.py delete mode 100644 tests/lightgbm/test_lightgbm_autolog.py diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm/__init__.py similarity index 95% rename from mlflow/lightgbm.py rename to mlflow/lightgbm/__init__.py index 764241f867442..d717c419d0874 100644 --- a/mlflow/lightgbm.py +++ b/mlflow/lightgbm/__init__.py @@ -18,6 +18,7 @@ https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api """ import os + import yaml import json import tempfile @@ -32,6 +33,7 @@ from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.signature import ModelSignature from mlflow.models.utils import ModelInputExample, _save_example +from mlflow.utils import _get_fully_qualified_class_name from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.utils.environment import ( _mlflow_conda_env, @@ -61,6 +63,8 @@ MlflowAutologgingQueueingClient, ) from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS +from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model + FLAVOR_NAME = "lightgbm" @@ -143,7 +147,7 @@ def save_model( _save_example(mlflow_model, input_example, path) # Save a LightGBM model - lgb_model.save_model(model_data_path) + _save_lgb_model(lgb_model, model_data_path) pyfunc.add_to_model( mlflow_model, @@ -151,7 +155,12 @@ def save_model( data=model_data_subpath, env=_CONDA_ENV_FILE_NAME, ) - mlflow_model.add_flavor(FLAVOR_NAME, lgb_version=lgb.__version__, data=model_data_subpath) + mlflow_model.add_flavor( + FLAVOR_NAME, + lgb_version=lgb.__version__, + data=model_data_subpath, + model_class=_get_fully_qualified_class_name(lgb_model), + ) mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) if conda_env is None: @@ -251,9 +260,18 @@ def log_model( def _load_model(path): - import lightgbm as lgb + """ + :param path: Local filesystem path to + the MLflow Model with the ``lightgbm`` flavor (MLflow < x.x.x) or + the top-level MLflow Model directory (MLflow >= x.x.x). + """ + model_dir = os.path.dirname(path) if os.path.isfile(path) else path + flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME) + + model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster") + lgb_model_path = os.path.join(model_dir, flavor_conf.get("data")) - return lgb.Booster(model_file=path) + return _load_lgb_model(model_class, lgb_model_path) def _load_pyfunc(path): @@ -283,12 +301,11 @@ def load_model(model_uri, dst_path=None): This directory must already exist. If unspecified, a local output path will be created. - :return: A LightGBM model (an instance of `lightgbm.Booster`_). + :return: A LightGBM model (an instance of `lightgbm.Booster`_) or LightGBM scikit-learn + models, depending on the saved model class specification. """ local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path) - flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME) - lgb_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.lgb")) - return _load_model(path=lgb_model_file_path) + return _load_model(path=local_model_path) class _LGBModelWrapper: diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py new file mode 100644 index 0000000000000..f2a338c3cac17 --- /dev/null +++ b/mlflow/lightgbm/utils.py @@ -0,0 +1,106 @@ +import importlib +import json +import os.path +import warnings +import numpy as np +import lightgbm as lgb +from lightgbm.compat import _LGBMLabelEncoder + + +def _label_encoder_to_json(le): + """Returns a JSON compatible dictionary""" + meta = {} + for k, v in le.__dict__.items(): + if isinstance(v, np.ndarray): + meta[k] = v.tolist() + else: + meta[k] = v + return meta + + +def _label_encoder_from_json(doc): + """Load the encoder back from a JSON compatible dict""" + le = _LGBMLabelEncoder() + meta = {} + for k, v in doc.items(): + if k == "classes_": + le.classes_ = np.array(v) if v is not None else None + continue + meta[k] = v + le.__dict__.update(meta) + return le + + +def _save_lgb_attr(model_dir, fname, attr_dict): + with open(os.path.join(model_dir, "{}.json".format(fname)), "w") as f: + json.dump(attr_dict, f) + + +def _load_lgb_attr(model_dir, fname): + try: + with open(os.path.join(model_dir, "{}.json".format(fname))) as f: + attr = json.load(f) + return attr + except IOError: + return None + + +def _save_lgb_model(lgb_model, model_path) -> None: + model_dir = os.path.dirname(model_path) + + if not isinstance(lgb_model, lgb.Booster): + meta = {} + for k, v in lgb_model.__dict__.items(): + if k == "_le": + meta["_le"] = _label_encoder_to_json(v) if v else None + continue + if k == "_Booster": + continue + if k == "_classes" and v is not None: + meta["_classes"] = v.tolist() + continue + if k == "_class_map" and v: + py_dict = {} + for clazz, encoded in v.items(): + py_dict[int(clazz)] = int(encoded) + v = py_dict + try: + json.dumps({k: v}) + meta[k] = v + except TypeError: + warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning) + _save_lgb_attr(model_dir, "scikit-learn", meta) + lgb_model = lgb_model._Booster + + lgb_model.save_model(model_path) + _save_lgb_attr(model_dir, "params", lgb_model.params) + + +def _load_lgb_model(lgb_model_class, model_path): + module, cls = lgb_model_class.rsplit(".", maxsplit=1) + model_dir = os.path.dirname(model_path) + sk_attr = _load_lgb_attr(model_dir, "scikit-learn") + bst_params = _load_lgb_attr(model_dir, "params") + + booster = lgb.Booster(model_file=model_path, params=bst_params) + + if sk_attr is None: + warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.") + return booster + + sk_model = getattr(importlib.import_module(module), cls)() + states = {} + for k, v in sk_attr.items(): + if k == "_le": + sk_model._le = _label_encoder_from_json(v) + continue + if k == "_classes": + sk_model._classes = np.array(v) + continue + states[k] = v + sk_model.__dict__.update(states) + # Delete the attribute after load + booster.set_attr(scikit_learn=None) + sk_model._Booster = booster + + return sk_model diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py deleted file mode 100644 index 697e0255d0c9c..0000000000000 --- a/tests/lightgbm/test_lightgbm_autolog.py +++ /dev/null @@ -1,538 +0,0 @@ -import os -import json -import functools -import pickle -import pytest -import yaml -import numpy as np -import pandas as pd -from sklearn import datasets -import lightgbm as lgb -import matplotlib as mpl -from packaging.version import Version - -import mlflow -import mlflow.lightgbm -from mlflow.lightgbm import _autolog_callback -from mlflow.models import Model -from mlflow.models.utils import _read_example -from mlflow.utils.autologging_utils import picklable_exception_safe_function, BatchMetricsLogger -from unittest.mock import patch - -mpl.use("Agg") - - -def get_latest_run(): - client = mlflow.tracking.MlflowClient() - return client.get_run(client.list_run_infos(experiment_id="0")[0].run_id) - - -def get_model_conf(artifact_uri, model_subpath="model"): - model_conf_path = os.path.join(artifact_uri, model_subpath, "MLmodel") - return Model.load(model_conf_path) - - -@pytest.fixture(scope="session") -def bst_params(): - return { - "objective": "multiclass", - "num_class": 3, - } - - -@pytest.fixture(scope="session") -def train_set(): - iris = datasets.load_iris() - X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) - y = iris.target - # set free_raw_data False to use raw data later. - return lgb.Dataset(X, y, free_raw_data=False) - - -@pytest.mark.large -def test_lgb_autolog_ends_auto_created_run(bst_params, train_set): - mlflow.lightgbm.autolog() - lgb.train(bst_params, train_set, num_boost_round=1) - assert mlflow.active_run() is None - - -@pytest.mark.large -def test_lgb_autolog_persists_manually_created_run(bst_params, train_set): - mlflow.lightgbm.autolog() - with mlflow.start_run() as run: - lgb.train(bst_params, train_set, num_boost_round=1) - assert mlflow.active_run() - assert mlflow.active_run().info.run_id == run.info.run_id - - -@pytest.mark.large -def test_lgb_autolog_logs_default_params(bst_params, train_set): - mlflow.lightgbm.autolog() - lgb.train(bst_params, train_set) - run = get_latest_run() - params = run.data.params - - expected_params = { - "num_boost_round": 100, - "feature_name": "auto", - "categorical_feature": "auto", - "verbose_eval": ( - # The default value of `verbose_eval` in `lightgbm.train` has been changed to 'warn' - # in this PR: https://github.com/microsoft/LightGBM/pull/4577 - "warn" - if Version(lgb.__version__) > Version("3.2.1") - else True - ), - "keep_training_booster": False, - } - expected_params.update(bst_params) - - for key, val in expected_params.items(): - assert key in params - assert params[key] == str(val) - - unlogged_params = [ - "params", - "train_set", - "valid_sets", - "valid_names", - "fobj", - "feval", - "init_model", - "evals_result", - "learning_rates", - "callbacks", - ] - - for param in unlogged_params: - assert param not in params - - -@pytest.mark.large -def test_lgb_autolog_logs_specified_params(bst_params, train_set): - mlflow.lightgbm.autolog() - expected_params = { - "num_boost_round": 10, - "early_stopping_rounds": 5, - "verbose_eval": False, - } - lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params) - run = get_latest_run() - params = run.data.params - - expected_params.update(bst_params) - - for key, val in expected_params.items(): - assert key in params - assert params[key] == str(val) - - unlogged_params = [ - "params", - "train_set", - "valid_sets", - "valid_names", - "fobj", - "feval", - "init_model", - "evals_result", - "learning_rates", - "callbacks", - ] - - for param in unlogged_params: - assert param not in params - - -@pytest.mark.large -def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set): - mlflow.lightgbm.autolog() - evals_result = {} - lgb.train( - bst_params, - train_set, - num_boost_round=10, - valid_sets=[train_set], - valid_names=["train"], - evals_result=evals_result, - ) - run = get_latest_run() - data = run.data - client = mlflow.tracking.MlflowClient() - metric_key = "train-multi_logloss" - metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] - assert metric_key in data.metrics - assert len(metric_history) == 10 - assert metric_history == evals_result["train"]["multi_logloss"] - - -@pytest.mark.large -def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set): - mlflow.lightgbm.autolog() - evals_result = {} - # If we use [train_set, train_set] here, LightGBM ignores the first dataset. - # To avoid that, create a new Dataset object. - valid_sets = [train_set, lgb.Dataset(train_set.data)] - valid_names = ["train", "valid"] - lgb.train( - bst_params, - train_set, - num_boost_round=10, - valid_sets=valid_sets, - valid_names=valid_names, - evals_result=evals_result, - ) - run = get_latest_run() - data = run.data - client = mlflow.tracking.MlflowClient() - for valid_name in valid_names: - metric_key = "{}-multi_logloss".format(valid_name) - metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] - assert metric_key in data.metrics - assert len(metric_history) == 10 - assert metric_history == evals_result[valid_name]["multi_logloss"] - - -@pytest.mark.large -def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set): - mlflow.lightgbm.autolog() - evals_result = {} - params = {"metric": ["multi_error", "multi_logloss"]} - params.update(bst_params) - valid_sets = [train_set] - valid_names = ["train"] - lgb.train( - params, - train_set, - num_boost_round=10, - valid_sets=valid_sets, - valid_names=valid_names, - evals_result=evals_result, - ) - run = get_latest_run() - data = run.data - client = mlflow.tracking.MlflowClient() - for metric_name in params["metric"]: - metric_key = "{}-{}".format(valid_names[0], metric_name) - metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] - assert metric_key in data.metrics - assert len(metric_history) == 10 - assert metric_history == evals_result["train"][metric_name] - - -@pytest.mark.large -def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set): - mlflow.lightgbm.autolog() - evals_result = {} - params = {"metric": ["multi_error", "multi_logloss"]} - params.update(bst_params) - valid_sets = [train_set, lgb.Dataset(train_set.data)] - valid_names = ["train", "valid"] - lgb.train( - params, - train_set, - num_boost_round=10, - valid_sets=valid_sets, - valid_names=valid_names, - evals_result=evals_result, - ) - run = get_latest_run() - data = run.data - client = mlflow.tracking.MlflowClient() - for valid_name in valid_names: - for metric_name in params["metric"]: - metric_key = "{}-{}".format(valid_name, metric_name) - metric_history = [ - x.value for x in client.get_metric_history(run.info.run_id, metric_key) - ] - assert metric_key in data.metrics - assert len(metric_history) == 10 - assert metric_history == evals_result[valid_name][metric_name] - - -@pytest.mark.large -def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(bst_params, train_set): - patched_metrics_data = [] - - # Mock patching BatchMetricsLogger.record_metrics() - # to ensure that expected metrics are being logged. - original = BatchMetricsLogger.record_metrics - - with patch( - "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics", autospec=True - ) as record_metrics_mock: - - def record_metrics_side_effect(self, metrics, step=None): - patched_metrics_data.extend(metrics.items()) - original(self, metrics, step) - - record_metrics_mock.side_effect = record_metrics_side_effect - - mlflow.lightgbm.autolog() - evals_result = {} - params = {"metric": ["multi_error", "multi_logloss"]} - params.update(bst_params) - valid_sets = [train_set, lgb.Dataset(train_set.data)] - valid_names = ["train", "valid"] - lgb.train( - params, - train_set, - num_boost_round=10, - valid_sets=valid_sets, - valid_names=valid_names, - evals_result=evals_result, - ) - - run = get_latest_run() - original_metrics = run.data.metrics - patched_metrics_data = dict(patched_metrics_data) - for metric_name in original_metrics: - assert metric_name in patched_metrics_data - assert original_metrics[metric_name] == patched_metrics_data[metric_name] - - assert "train-multi_logloss" in original_metrics - assert "train-multi_logloss" in patched_metrics_data - - -@pytest.mark.large -def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set): - mlflow.lightgbm.autolog() - evals_result = {} - params = {"metric": ["multi_error", "multi_logloss"]} - params.update(bst_params) - valid_sets = [train_set, lgb.Dataset(train_set.data)] - valid_names = ["train", "valid"] - model = lgb.train( - params, - train_set, - num_boost_round=10, - early_stopping_rounds=5, - valid_sets=valid_sets, - valid_names=valid_names, - evals_result=evals_result, - ) - run = get_latest_run() - data = run.data - client = mlflow.tracking.MlflowClient() - assert "best_iteration" in data.metrics - assert int(data.metrics["best_iteration"]) == model.best_iteration - assert "stopped_iteration" in data.metrics - assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"]) - - for valid_name in valid_names: - for metric_name in params["metric"]: - metric_key = "{}-{}".format(valid_name, metric_name) - metric_history = [ - x.value for x in client.get_metric_history(run.info.run_id, metric_key) - ] - assert metric_key in data.metrics - - best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1] - assert metric_history == evals_result[valid_name][metric_name] + [best_metrics] - - -@pytest.mark.large -def test_lgb_autolog_logs_feature_importance(bst_params, train_set): - mlflow.lightgbm.autolog() - model = lgb.train(bst_params, train_set, num_boost_round=10) - run = get_latest_run() - run_id = run.info.run_id - artifacts_dir = run.info.artifact_uri.replace("file://", "") - client = mlflow.tracking.MlflowClient() - artifacts = [x.path for x in client.list_artifacts(run_id)] - - for imp_type in ["split", "gain"]: - plot_name = "feature_importance_{}.png".format(imp_type) - assert plot_name in artifacts - - json_name = "feature_importance_{}.json".format(imp_type) - assert json_name in artifacts - - json_path = os.path.join(artifacts_dir, json_name) - with open(json_path, "r") as f: - loaded_imp = json.load(f) - - features = model.feature_name() - importance = model.feature_importance(importance_type=imp_type) - imp = {ft: imp for ft, imp in zip(features, importance.tolist())} - - assert loaded_imp == imp - - -@pytest.mark.large -def test_no_figure_is_opened_after_logging(bst_params, train_set): - mlflow.lightgbm.autolog() - lgb.train(bst_params, train_set, num_boost_round=10) - assert mpl.pyplot.get_fignums() == [] - - -@pytest.mark.large -def test_lgb_autolog_loads_model_from_artifact(bst_params, train_set): - mlflow.lightgbm.autolog() - model = lgb.train(bst_params, train_set, num_boost_round=10) - run = get_latest_run() - run_id = run.info.run_id - - loaded_model = mlflow.lightgbm.load_model("runs:/{}/model".format(run_id)) - np.testing.assert_array_almost_equal( - model.predict(train_set.data), loaded_model.predict(train_set.data) - ) - - -@pytest.mark.large -def test_lgb_autolog_gets_input_example(bst_params): - # we need to check the example input against the initial input given to train function. - # we can't use the train_set fixture for this as it defines free_raw_data=False but this - # feature should work even if it is True - iris = datasets.load_iris() - X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) - y = iris.target - dataset = lgb.Dataset(X, y, free_raw_data=True) - - mlflow.lightgbm.autolog(log_input_examples=True) - lgb.train(bst_params, dataset) - run = get_latest_run() - - model_path = os.path.join(run.info.artifact_uri, "model") - model_conf = Model.load(os.path.join(model_path, "MLmodel")) - - input_example = _read_example(model_conf, model_path) - - assert input_example.equals(X[:5]) - - pyfunc_model = mlflow.pyfunc.load_model(os.path.join(run.info.artifact_uri, "model")) - - # make sure reloading the input_example and predicting on it does not error - pyfunc_model.predict(input_example) - - -@pytest.mark.large -def test_lgb_autolog_infers_model_signature_correctly(bst_params): - iris = datasets.load_iris() - X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) - y = iris.target - dataset = lgb.Dataset(X, y, free_raw_data=True) - - mlflow.lightgbm.autolog(log_model_signatures=True) - lgb.train(bst_params, dataset) - run = get_latest_run() - run_id = run.info.run_id - artifacts_dir = run.info.artifact_uri.replace("file://", "") - client = mlflow.tracking.MlflowClient() - artifacts = [x.path for x in client.list_artifacts(run_id, "model")] - - ml_model_filename = "MLmodel" - assert str(os.path.join("model", ml_model_filename)) in artifacts - ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename) - - data = None - with open(ml_model_path, "r") as f: - data = yaml.load(f, Loader=yaml.FullLoader) - - assert data is not None - assert "signature" in data - signature = data["signature"] - assert signature is not None - - assert "inputs" in signature - assert json.loads(signature["inputs"]) == [ - {"name": "sepal length (cm)", "type": "double"}, - {"name": "sepal width (cm)", "type": "double"}, - ] - - assert "outputs" in signature - assert json.loads(signature["outputs"]) == [ - {"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 3]}}, - ] - - -@pytest.mark.large -def test_lgb_autolog_continues_logging_even_if_signature_inference_fails(tmpdir): - tmp_csv = tmpdir.join("data.csv") - tmp_csv.write("2,6.4,2.8,5.6,2.2\n") - tmp_csv.write("1,5.0,2.3,3.3,1.0\n") - tmp_csv.write("2,4.9,2.5,4.5,1.7\n") - tmp_csv.write("0,4.9,3.1,1.5,0.1\n") - tmp_csv.write("0,5.7,3.8,1.7,0.3\n") - - # signature and input example inference should fail here since the dataset is given - # as a file path - dataset = lgb.Dataset(tmp_csv.strpath) - - bst_params = { - "objective": "multiclass", - "num_class": 3, - } - - mlflow.lightgbm.autolog(log_model_signatures=True) - lgb.train(bst_params, dataset) - run = get_latest_run() - run_id = run.info.run_id - artifacts_dir = run.info.artifact_uri.replace("file://", "") - client = mlflow.tracking.MlflowClient() - artifacts = [x.path for x in client.list_artifacts(run_id, "model")] - - ml_model_filename = "MLmodel" - assert os.path.join("model", ml_model_filename) in artifacts - ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename) - - data = None - with open(ml_model_path, "r") as f: - data = yaml.load(f, Loader=yaml.FullLoader) - - assert data is not None - assert "run_id" in data - assert "signature" not in data - - -@pytest.mark.large -@pytest.mark.parametrize("log_input_examples", [True, False]) -@pytest.mark.parametrize("log_model_signatures", [True, False]) -def test_lgb_autolog_configuration_options(bst_params, log_input_examples, log_model_signatures): - iris = datasets.load_iris() - X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) - y = iris.target - - with mlflow.start_run() as run: - mlflow.lightgbm.autolog( - log_input_examples=log_input_examples, log_model_signatures=log_model_signatures - ) - dataset = lgb.Dataset(X, y) - lgb.train(bst_params, dataset) - model_conf = get_model_conf(run.info.artifact_uri) - assert ("saved_input_example_info" in model_conf.to_dict()) == log_input_examples - assert ("signature" in model_conf.to_dict()) == log_model_signatures - - -@pytest.mark.large -@pytest.mark.parametrize("log_models", [True, False]) -def test_lgb_autolog_log_models_configuration(bst_params, log_models): - iris = datasets.load_iris() - X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) - y = iris.target - - with mlflow.start_run() as run: - mlflow.lightgbm.autolog(log_models=log_models) - dataset = lgb.Dataset(X, y) - lgb.train(bst_params, dataset) - - run_id = run.info.run_id - client = mlflow.tracking.MlflowClient() - artifacts = [f.path for f in client.list_artifacts(run_id)] - assert ("model" in artifacts) == log_models - - -def test_lgb_autolog_does_not_break_dataset_instantiation_with_data_none(): - """ - This test verifies that `lightgbm.Dataset(None)` doesn't fail after patching. - LightGBM internally calls `lightgbm.Dataset(None)` to create a subset of `Dataset`: - https://github.com/microsoft/LightGBM/blob/v3.0.0/python-package/lightgbm/basic.py#L1381 - """ - mlflow.lightgbm.autolog() - lgb.Dataset(None) - - -def test_callback_func_is_pickable(): - cb = picklable_exception_safe_function( - functools.partial(_autolog_callback, BatchMetricsLogger(run_id="1234"), eval_results={}) - ) - pickle.dumps(cb) diff --git a/tests/lightgbm/test_lightgbm_model_export.py b/tests/lightgbm/test_lightgbm_model_export.py index 8b973e1701d4f..b08ece0f67710 100644 --- a/tests/lightgbm/test_lightgbm_model_export.py +++ b/tests/lightgbm/test_lightgbm_model_export.py @@ -50,6 +50,18 @@ def lgb_model(): return ModelWithData(model=model, inference_dataframe=X) +@pytest.fixture(scope="session") +def lgb_sklearn_model(): + iris = datasets.load_iris() + X = pd.DataFrame( + iris.data[:, :2], columns=iris.feature_names[:2] # we only take the first two features. + ) + y = iris.target + model = lgb.LGBMClassifier(n_estimators=10) + model.fit(X, y) + return ModelWithData(model=model, inference_dataframe=X) + + @pytest.fixture def model_path(tmpdir): return os.path.join(str(tmpdir), "model") @@ -68,7 +80,7 @@ def test_model_save_load(lgb_model, model_path): mlflow.lightgbm.save_model(lgb_model=model, path=model_path) reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path) - reloaded_pyfunc = pyfunc.load_pyfunc(model_uri=model_path) + reloaded_pyfunc = pyfunc.load_model(model_uri=model_path) np.testing.assert_array_almost_equal( model.predict(lgb_model.inference_dataframe), @@ -81,6 +93,24 @@ def test_model_save_load(lgb_model, model_path): ) +@pytest.mark.large +def test_sklearn_model_save_load(lgb_sklearn_model, model_path): + model = lgb_sklearn_model.model + mlflow.lightgbm.save_model(lgb_model=model, path=model_path) + reloaded_model = mlflow.lightgbm.load_model(model_uri=model_path) + reloaded_pyfunc = pyfunc.load_model(model_uri=model_path) + + np.testing.assert_array_almost_equal( + model.predict(lgb_sklearn_model.inference_dataframe), + reloaded_model.predict(lgb_sklearn_model.inference_dataframe), + ) + + np.testing.assert_array_almost_equal( + reloaded_model.predict(lgb_sklearn_model.inference_dataframe), + reloaded_pyfunc.predict(lgb_sklearn_model.inference_dataframe), + ) + + def test_signature_and_examples_are_saved_correctly(lgb_model): model = lgb_model.model X = lgb_model.inference_dataframe @@ -398,3 +428,49 @@ def test_pyfunc_serve_and_score_sklearn(model): ) scores = pd.read_json(resp.content, orient="records").values.squeeze() np.testing.assert_array_equal(scores, model.predict(X.head(3))) + + +@pytest.mark.large +def test_load_pyfunc_succeeds_for_older_models_with_pyfunc_data_field(lgb_model, model_path): + """ + This test verifies that LightGBM models saved in older versions of MLflow are loaded + successfully by ``mlflow.pyfunc.load_model``. These older models specify a pyfunc ``data`` + field referring directly to a LightGBM model file. Newer models also have the + ``model_class`` in LightGBM flavor. + """ + model = lgb_model.model + mlflow.lightgbm.save_model(lgb_model=model, path=model_path) + + model_conf_path = os.path.join(model_path, "MLmodel") + model_conf = Model.load(model_conf_path) + pyfunc_conf = model_conf.flavors.get(pyfunc.FLAVOR_NAME) + lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME) + assert lgb_conf is not None + assert "model_class" in lgb_conf + assert "data" in lgb_conf + assert pyfunc_conf is not None + assert "model_class" not in pyfunc_conf + assert pyfunc.DATA in pyfunc_conf + + # test old MLmodel conf + model_conf.flavors["lightgbm"] = {"lgb_version": lgb.__version__, "data": "model.lgb"} + model_conf.save(model_conf_path) + model_conf = Model.load(model_conf_path) + lgb_conf = model_conf.flavors.get(mlflow.lightgbm.FLAVOR_NAME) + assert "data" in lgb_conf + assert lgb_conf["data"] == "model.lgb" + + reloaded_pyfunc = pyfunc.load_model(model_uri=model_path) + assert isinstance(reloaded_pyfunc._model_impl.lgb_model, lgb.Booster) + reloaded_lgb = mlflow.lightgbm.load_model(model_uri=model_path) + assert isinstance(reloaded_lgb, lgb.Booster) + + np.testing.assert_array_almost_equal( + lgb_model.model.predict(lgb_model.inference_dataframe), + reloaded_pyfunc.predict(lgb_model.inference_dataframe), + ) + + np.testing.assert_array_almost_equal( + reloaded_lgb.predict(lgb_model.inference_dataframe), + reloaded_pyfunc.predict(lgb_model.inference_dataframe), + ) From af4bcf12239ebdad2f80981d14190f3b689b00ef Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Tue, 30 Nov 2021 10:40:46 -0800 Subject: [PATCH 2/7] restore test Signed-off-by: Junwen Yao --- tests/lightgbm/test_lightgbm_autolog.py | 538 ++++++++++++++++++++++++ 1 file changed, 538 insertions(+) create mode 100644 tests/lightgbm/test_lightgbm_autolog.py diff --git a/tests/lightgbm/test_lightgbm_autolog.py b/tests/lightgbm/test_lightgbm_autolog.py new file mode 100644 index 0000000000000..697e0255d0c9c --- /dev/null +++ b/tests/lightgbm/test_lightgbm_autolog.py @@ -0,0 +1,538 @@ +import os +import json +import functools +import pickle +import pytest +import yaml +import numpy as np +import pandas as pd +from sklearn import datasets +import lightgbm as lgb +import matplotlib as mpl +from packaging.version import Version + +import mlflow +import mlflow.lightgbm +from mlflow.lightgbm import _autolog_callback +from mlflow.models import Model +from mlflow.models.utils import _read_example +from mlflow.utils.autologging_utils import picklable_exception_safe_function, BatchMetricsLogger +from unittest.mock import patch + +mpl.use("Agg") + + +def get_latest_run(): + client = mlflow.tracking.MlflowClient() + return client.get_run(client.list_run_infos(experiment_id="0")[0].run_id) + + +def get_model_conf(artifact_uri, model_subpath="model"): + model_conf_path = os.path.join(artifact_uri, model_subpath, "MLmodel") + return Model.load(model_conf_path) + + +@pytest.fixture(scope="session") +def bst_params(): + return { + "objective": "multiclass", + "num_class": 3, + } + + +@pytest.fixture(scope="session") +def train_set(): + iris = datasets.load_iris() + X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) + y = iris.target + # set free_raw_data False to use raw data later. + return lgb.Dataset(X, y, free_raw_data=False) + + +@pytest.mark.large +def test_lgb_autolog_ends_auto_created_run(bst_params, train_set): + mlflow.lightgbm.autolog() + lgb.train(bst_params, train_set, num_boost_round=1) + assert mlflow.active_run() is None + + +@pytest.mark.large +def test_lgb_autolog_persists_manually_created_run(bst_params, train_set): + mlflow.lightgbm.autolog() + with mlflow.start_run() as run: + lgb.train(bst_params, train_set, num_boost_round=1) + assert mlflow.active_run() + assert mlflow.active_run().info.run_id == run.info.run_id + + +@pytest.mark.large +def test_lgb_autolog_logs_default_params(bst_params, train_set): + mlflow.lightgbm.autolog() + lgb.train(bst_params, train_set) + run = get_latest_run() + params = run.data.params + + expected_params = { + "num_boost_round": 100, + "feature_name": "auto", + "categorical_feature": "auto", + "verbose_eval": ( + # The default value of `verbose_eval` in `lightgbm.train` has been changed to 'warn' + # in this PR: https://github.com/microsoft/LightGBM/pull/4577 + "warn" + if Version(lgb.__version__) > Version("3.2.1") + else True + ), + "keep_training_booster": False, + } + expected_params.update(bst_params) + + for key, val in expected_params.items(): + assert key in params + assert params[key] == str(val) + + unlogged_params = [ + "params", + "train_set", + "valid_sets", + "valid_names", + "fobj", + "feval", + "init_model", + "evals_result", + "learning_rates", + "callbacks", + ] + + for param in unlogged_params: + assert param not in params + + +@pytest.mark.large +def test_lgb_autolog_logs_specified_params(bst_params, train_set): + mlflow.lightgbm.autolog() + expected_params = { + "num_boost_round": 10, + "early_stopping_rounds": 5, + "verbose_eval": False, + } + lgb.train(bst_params, train_set, valid_sets=[train_set], **expected_params) + run = get_latest_run() + params = run.data.params + + expected_params.update(bst_params) + + for key, val in expected_params.items(): + assert key in params + assert params[key] == str(val) + + unlogged_params = [ + "params", + "train_set", + "valid_sets", + "valid_names", + "fobj", + "feval", + "init_model", + "evals_result", + "learning_rates", + "callbacks", + ] + + for param in unlogged_params: + assert param not in params + + +@pytest.mark.large +def test_lgb_autolog_logs_metrics_with_validation_data(bst_params, train_set): + mlflow.lightgbm.autolog() + evals_result = {} + lgb.train( + bst_params, + train_set, + num_boost_round=10, + valid_sets=[train_set], + valid_names=["train"], + evals_result=evals_result, + ) + run = get_latest_run() + data = run.data + client = mlflow.tracking.MlflowClient() + metric_key = "train-multi_logloss" + metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] + assert metric_key in data.metrics + assert len(metric_history) == 10 + assert metric_history == evals_result["train"]["multi_logloss"] + + +@pytest.mark.large +def test_lgb_autolog_logs_metrics_with_multi_validation_data(bst_params, train_set): + mlflow.lightgbm.autolog() + evals_result = {} + # If we use [train_set, train_set] here, LightGBM ignores the first dataset. + # To avoid that, create a new Dataset object. + valid_sets = [train_set, lgb.Dataset(train_set.data)] + valid_names = ["train", "valid"] + lgb.train( + bst_params, + train_set, + num_boost_round=10, + valid_sets=valid_sets, + valid_names=valid_names, + evals_result=evals_result, + ) + run = get_latest_run() + data = run.data + client = mlflow.tracking.MlflowClient() + for valid_name in valid_names: + metric_key = "{}-multi_logloss".format(valid_name) + metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] + assert metric_key in data.metrics + assert len(metric_history) == 10 + assert metric_history == evals_result[valid_name]["multi_logloss"] + + +@pytest.mark.large +def test_lgb_autolog_logs_metrics_with_multi_metrics(bst_params, train_set): + mlflow.lightgbm.autolog() + evals_result = {} + params = {"metric": ["multi_error", "multi_logloss"]} + params.update(bst_params) + valid_sets = [train_set] + valid_names = ["train"] + lgb.train( + params, + train_set, + num_boost_round=10, + valid_sets=valid_sets, + valid_names=valid_names, + evals_result=evals_result, + ) + run = get_latest_run() + data = run.data + client = mlflow.tracking.MlflowClient() + for metric_name in params["metric"]: + metric_key = "{}-{}".format(valid_names[0], metric_name) + metric_history = [x.value for x in client.get_metric_history(run.info.run_id, metric_key)] + assert metric_key in data.metrics + assert len(metric_history) == 10 + assert metric_history == evals_result["train"][metric_name] + + +@pytest.mark.large +def test_lgb_autolog_logs_metrics_with_multi_validation_data_and_metrics(bst_params, train_set): + mlflow.lightgbm.autolog() + evals_result = {} + params = {"metric": ["multi_error", "multi_logloss"]} + params.update(bst_params) + valid_sets = [train_set, lgb.Dataset(train_set.data)] + valid_names = ["train", "valid"] + lgb.train( + params, + train_set, + num_boost_round=10, + valid_sets=valid_sets, + valid_names=valid_names, + evals_result=evals_result, + ) + run = get_latest_run() + data = run.data + client = mlflow.tracking.MlflowClient() + for valid_name in valid_names: + for metric_name in params["metric"]: + metric_key = "{}-{}".format(valid_name, metric_name) + metric_history = [ + x.value for x in client.get_metric_history(run.info.run_id, metric_key) + ] + assert metric_key in data.metrics + assert len(metric_history) == 10 + assert metric_history == evals_result[valid_name][metric_name] + + +@pytest.mark.large +def test_lgb_autolog_batch_metrics_logger_logs_expected_metrics(bst_params, train_set): + patched_metrics_data = [] + + # Mock patching BatchMetricsLogger.record_metrics() + # to ensure that expected metrics are being logged. + original = BatchMetricsLogger.record_metrics + + with patch( + "mlflow.utils.autologging_utils.BatchMetricsLogger.record_metrics", autospec=True + ) as record_metrics_mock: + + def record_metrics_side_effect(self, metrics, step=None): + patched_metrics_data.extend(metrics.items()) + original(self, metrics, step) + + record_metrics_mock.side_effect = record_metrics_side_effect + + mlflow.lightgbm.autolog() + evals_result = {} + params = {"metric": ["multi_error", "multi_logloss"]} + params.update(bst_params) + valid_sets = [train_set, lgb.Dataset(train_set.data)] + valid_names = ["train", "valid"] + lgb.train( + params, + train_set, + num_boost_round=10, + valid_sets=valid_sets, + valid_names=valid_names, + evals_result=evals_result, + ) + + run = get_latest_run() + original_metrics = run.data.metrics + patched_metrics_data = dict(patched_metrics_data) + for metric_name in original_metrics: + assert metric_name in patched_metrics_data + assert original_metrics[metric_name] == patched_metrics_data[metric_name] + + assert "train-multi_logloss" in original_metrics + assert "train-multi_logloss" in patched_metrics_data + + +@pytest.mark.large +def test_lgb_autolog_logs_metrics_with_early_stopping(bst_params, train_set): + mlflow.lightgbm.autolog() + evals_result = {} + params = {"metric": ["multi_error", "multi_logloss"]} + params.update(bst_params) + valid_sets = [train_set, lgb.Dataset(train_set.data)] + valid_names = ["train", "valid"] + model = lgb.train( + params, + train_set, + num_boost_round=10, + early_stopping_rounds=5, + valid_sets=valid_sets, + valid_names=valid_names, + evals_result=evals_result, + ) + run = get_latest_run() + data = run.data + client = mlflow.tracking.MlflowClient() + assert "best_iteration" in data.metrics + assert int(data.metrics["best_iteration"]) == model.best_iteration + assert "stopped_iteration" in data.metrics + assert int(data.metrics["stopped_iteration"]) == len(evals_result["train"]["multi_logloss"]) + + for valid_name in valid_names: + for metric_name in params["metric"]: + metric_key = "{}-{}".format(valid_name, metric_name) + metric_history = [ + x.value for x in client.get_metric_history(run.info.run_id, metric_key) + ] + assert metric_key in data.metrics + + best_metrics = evals_result[valid_name][metric_name][model.best_iteration - 1] + assert metric_history == evals_result[valid_name][metric_name] + [best_metrics] + + +@pytest.mark.large +def test_lgb_autolog_logs_feature_importance(bst_params, train_set): + mlflow.lightgbm.autolog() + model = lgb.train(bst_params, train_set, num_boost_round=10) + run = get_latest_run() + run_id = run.info.run_id + artifacts_dir = run.info.artifact_uri.replace("file://", "") + client = mlflow.tracking.MlflowClient() + artifacts = [x.path for x in client.list_artifacts(run_id)] + + for imp_type in ["split", "gain"]: + plot_name = "feature_importance_{}.png".format(imp_type) + assert plot_name in artifacts + + json_name = "feature_importance_{}.json".format(imp_type) + assert json_name in artifacts + + json_path = os.path.join(artifacts_dir, json_name) + with open(json_path, "r") as f: + loaded_imp = json.load(f) + + features = model.feature_name() + importance = model.feature_importance(importance_type=imp_type) + imp = {ft: imp for ft, imp in zip(features, importance.tolist())} + + assert loaded_imp == imp + + +@pytest.mark.large +def test_no_figure_is_opened_after_logging(bst_params, train_set): + mlflow.lightgbm.autolog() + lgb.train(bst_params, train_set, num_boost_round=10) + assert mpl.pyplot.get_fignums() == [] + + +@pytest.mark.large +def test_lgb_autolog_loads_model_from_artifact(bst_params, train_set): + mlflow.lightgbm.autolog() + model = lgb.train(bst_params, train_set, num_boost_round=10) + run = get_latest_run() + run_id = run.info.run_id + + loaded_model = mlflow.lightgbm.load_model("runs:/{}/model".format(run_id)) + np.testing.assert_array_almost_equal( + model.predict(train_set.data), loaded_model.predict(train_set.data) + ) + + +@pytest.mark.large +def test_lgb_autolog_gets_input_example(bst_params): + # we need to check the example input against the initial input given to train function. + # we can't use the train_set fixture for this as it defines free_raw_data=False but this + # feature should work even if it is True + iris = datasets.load_iris() + X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) + y = iris.target + dataset = lgb.Dataset(X, y, free_raw_data=True) + + mlflow.lightgbm.autolog(log_input_examples=True) + lgb.train(bst_params, dataset) + run = get_latest_run() + + model_path = os.path.join(run.info.artifact_uri, "model") + model_conf = Model.load(os.path.join(model_path, "MLmodel")) + + input_example = _read_example(model_conf, model_path) + + assert input_example.equals(X[:5]) + + pyfunc_model = mlflow.pyfunc.load_model(os.path.join(run.info.artifact_uri, "model")) + + # make sure reloading the input_example and predicting on it does not error + pyfunc_model.predict(input_example) + + +@pytest.mark.large +def test_lgb_autolog_infers_model_signature_correctly(bst_params): + iris = datasets.load_iris() + X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) + y = iris.target + dataset = lgb.Dataset(X, y, free_raw_data=True) + + mlflow.lightgbm.autolog(log_model_signatures=True) + lgb.train(bst_params, dataset) + run = get_latest_run() + run_id = run.info.run_id + artifacts_dir = run.info.artifact_uri.replace("file://", "") + client = mlflow.tracking.MlflowClient() + artifacts = [x.path for x in client.list_artifacts(run_id, "model")] + + ml_model_filename = "MLmodel" + assert str(os.path.join("model", ml_model_filename)) in artifacts + ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename) + + data = None + with open(ml_model_path, "r") as f: + data = yaml.load(f, Loader=yaml.FullLoader) + + assert data is not None + assert "signature" in data + signature = data["signature"] + assert signature is not None + + assert "inputs" in signature + assert json.loads(signature["inputs"]) == [ + {"name": "sepal length (cm)", "type": "double"}, + {"name": "sepal width (cm)", "type": "double"}, + ] + + assert "outputs" in signature + assert json.loads(signature["outputs"]) == [ + {"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 3]}}, + ] + + +@pytest.mark.large +def test_lgb_autolog_continues_logging_even_if_signature_inference_fails(tmpdir): + tmp_csv = tmpdir.join("data.csv") + tmp_csv.write("2,6.4,2.8,5.6,2.2\n") + tmp_csv.write("1,5.0,2.3,3.3,1.0\n") + tmp_csv.write("2,4.9,2.5,4.5,1.7\n") + tmp_csv.write("0,4.9,3.1,1.5,0.1\n") + tmp_csv.write("0,5.7,3.8,1.7,0.3\n") + + # signature and input example inference should fail here since the dataset is given + # as a file path + dataset = lgb.Dataset(tmp_csv.strpath) + + bst_params = { + "objective": "multiclass", + "num_class": 3, + } + + mlflow.lightgbm.autolog(log_model_signatures=True) + lgb.train(bst_params, dataset) + run = get_latest_run() + run_id = run.info.run_id + artifacts_dir = run.info.artifact_uri.replace("file://", "") + client = mlflow.tracking.MlflowClient() + artifacts = [x.path for x in client.list_artifacts(run_id, "model")] + + ml_model_filename = "MLmodel" + assert os.path.join("model", ml_model_filename) in artifacts + ml_model_path = os.path.join(artifacts_dir, "model", ml_model_filename) + + data = None + with open(ml_model_path, "r") as f: + data = yaml.load(f, Loader=yaml.FullLoader) + + assert data is not None + assert "run_id" in data + assert "signature" not in data + + +@pytest.mark.large +@pytest.mark.parametrize("log_input_examples", [True, False]) +@pytest.mark.parametrize("log_model_signatures", [True, False]) +def test_lgb_autolog_configuration_options(bst_params, log_input_examples, log_model_signatures): + iris = datasets.load_iris() + X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) + y = iris.target + + with mlflow.start_run() as run: + mlflow.lightgbm.autolog( + log_input_examples=log_input_examples, log_model_signatures=log_model_signatures + ) + dataset = lgb.Dataset(X, y) + lgb.train(bst_params, dataset) + model_conf = get_model_conf(run.info.artifact_uri) + assert ("saved_input_example_info" in model_conf.to_dict()) == log_input_examples + assert ("signature" in model_conf.to_dict()) == log_model_signatures + + +@pytest.mark.large +@pytest.mark.parametrize("log_models", [True, False]) +def test_lgb_autolog_log_models_configuration(bst_params, log_models): + iris = datasets.load_iris() + X = pd.DataFrame(iris.data[:, :2], columns=iris.feature_names[:2]) + y = iris.target + + with mlflow.start_run() as run: + mlflow.lightgbm.autolog(log_models=log_models) + dataset = lgb.Dataset(X, y) + lgb.train(bst_params, dataset) + + run_id = run.info.run_id + client = mlflow.tracking.MlflowClient() + artifacts = [f.path for f in client.list_artifacts(run_id)] + assert ("model" in artifacts) == log_models + + +def test_lgb_autolog_does_not_break_dataset_instantiation_with_data_none(): + """ + This test verifies that `lightgbm.Dataset(None)` doesn't fail after patching. + LightGBM internally calls `lightgbm.Dataset(None)` to create a subset of `Dataset`: + https://github.com/microsoft/LightGBM/blob/v3.0.0/python-package/lightgbm/basic.py#L1381 + """ + mlflow.lightgbm.autolog() + lgb.Dataset(None) + + +def test_callback_func_is_pickable(): + cb = picklable_exception_safe_function( + functools.partial(_autolog_callback, BatchMetricsLogger(run_id="1234"), eval_results={}) + ) + pickle.dumps(cb) From c804a8195bbd519286fd5fbaa8909fd18ac0b1e3 Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Tue, 30 Nov 2021 11:51:42 -0800 Subject: [PATCH 3/7] fix doc Signed-off-by: Junwen Yao --- mlflow/lightgbm/__init__.py | 1 - mlflow/lightgbm/utils.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mlflow/lightgbm/__init__.py b/mlflow/lightgbm/__init__.py index d717c419d0874..ba8c361ff2f0b 100644 --- a/mlflow/lightgbm/__init__.py +++ b/mlflow/lightgbm/__init__.py @@ -18,7 +18,6 @@ https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api """ import os - import yaml import json import tempfile diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py index f2a338c3cac17..e4364b193afa9 100644 --- a/mlflow/lightgbm/utils.py +++ b/mlflow/lightgbm/utils.py @@ -3,8 +3,6 @@ import os.path import warnings import numpy as np -import lightgbm as lgb -from lightgbm.compat import _LGBMLabelEncoder def _label_encoder_to_json(le): @@ -20,6 +18,8 @@ def _label_encoder_to_json(le): def _label_encoder_from_json(doc): """Load the encoder back from a JSON compatible dict""" + from lightgbm.compat import _LGBMLabelEncoder + le = _LGBMLabelEncoder() meta = {} for k, v in doc.items(): @@ -46,6 +46,8 @@ def _load_lgb_attr(model_dir, fname): def _save_lgb_model(lgb_model, model_path) -> None: + import lightgbm as lgb + model_dir = os.path.dirname(model_path) if not isinstance(lgb_model, lgb.Booster): @@ -77,6 +79,8 @@ def _save_lgb_model(lgb_model, model_path) -> None: def _load_lgb_model(lgb_model_class, model_path): + import lightgbm as lgb + module, cls = lgb_model_class.rsplit(".", maxsplit=1) model_dir = os.path.dirname(model_path) sk_attr = _load_lgb_attr(model_dir, "scikit-learn") From aa4337b4bcc22f65d9e5b5b54b4b0895c3e0112e Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Wed, 15 Dec 2021 16:04:29 -0800 Subject: [PATCH 4/7] address review: use cloudpickle Signed-off-by: Junwen Yao --- mlflow/lightgbm.py | 620 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 mlflow/lightgbm.py diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py new file mode 100644 index 0000000000000..397cde1341b22 --- /dev/null +++ b/mlflow/lightgbm.py @@ -0,0 +1,620 @@ +""" +The ``mlflow.lightgbm`` module provides an API for logging and loading LightGBM models. +This module exports LightGBM models with the following flavors: + +LightGBM (native) format + This is the main flavor that can be loaded back into LightGBM. +:py:mod:`mlflow.pyfunc` + Produced for use by generic pyfunc-based deployment tools and batch inference. + +.. _lightgbm.Booster: + https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster +.. _lightgbm.Booster.save_model: + https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html + #lightgbm.Booster.save_model +.. _lightgbm.train: + https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html#lightgbm-train +.. _scikit-learn API: + https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api +""" +import os +import yaml +import json +import tempfile +import shutil +import logging +import functools +from copy import deepcopy + +import mlflow +from mlflow import pyfunc +from mlflow.models import Model, infer_signature +from mlflow.models.model import MLMODEL_FILE_NAME +from mlflow.models.signature import ModelSignature +from mlflow.models.utils import ModelInputExample, _save_example +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils import _get_fully_qualified_class_name +from mlflow.utils.environment import ( + _mlflow_conda_env, + _validate_env_arguments, + _process_pip_requirements, + _process_conda_env, + _CONDA_ENV_FILE_NAME, + _REQUIREMENTS_FILE_NAME, + _CONSTRAINTS_FILE_NAME, +) +from mlflow.utils.requirements_utils import _get_pinned_requirement +from mlflow.utils.file_utils import write_to +from mlflow.utils.docstring_utils import format_docstring, LOG_MODEL_PARAM_DOCS +from mlflow.utils.model_utils import _get_flavor_configuration +from mlflow.exceptions import MlflowException +from mlflow.utils.arguments_utils import _get_arg_names +from mlflow.utils.autologging_utils import ( + autologging_integration, + safe_patch, + picklable_exception_safe_function, + get_mlflow_run_params_for_fn_args, + INPUT_EXAMPLE_SAMPLE_ROWS, + resolve_input_example_and_signature, + InputExampleInfo, + ENSURE_AUTOLOGGING_ENABLED_TEXT, + batch_metrics_logger, + MlflowAutologgingQueueingClient, +) +from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS + +FLAVOR_NAME = "lightgbm" + +_logger = logging.getLogger(__name__) + + +def get_default_pip_requirements(): + """ + :return: A list of default pip requirements for MLflow Models produced by this flavor. + Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment + that, at minimum, contains these requirements. + """ + return [_get_pinned_requirement("lightgbm"), _get_pinned_requirement("cloudpickle")] + + +def get_default_conda_env(): + """ + :return: The default Conda environment for MLflow Models produced by calls to + :func:`save_model()` and :func:`log_model()`. + """ + return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) + + +@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) +def save_model( + lgb_model, + path, + conda_env=None, + mlflow_model=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + pip_requirements=None, + extra_pip_requirements=None, +): + """ + Save a LightGBM model to a path on the local file system. + + :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved. + Note that models that implement the `scikit-learn API`_ are not supported. + :param path: Local path where the model is to be saved. + :param conda_env: {{ conda_env }} + :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. + + :param signature: :py:class:`ModelSignature ` + describes model input and output :py:class:`Schema `. + The model signature can be :py:func:`inferred ` + from datasets with valid model input (e.g. the training dataset with target + column omitted) and valid model output (e.g. model predictions generated on + the training dataset), for example: + + .. code-block:: python + + from mlflow.models.signature import infer_signature + train = df.drop_column("target_label") + predictions = ... # compute model predictions + signature = infer_signature(train, predictions) + :param input_example: Input example provides one or several instances of valid + model input. The example can be used as a hint of what data to feed the + model. The given example will be converted to a Pandas DataFrame and then + serialized to json using the Pandas split-oriented format. Bytes are + base64-encoded. + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + """ + import lightgbm as lgb + + _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) + + path = os.path.abspath(path) + if os.path.exists(path): + raise MlflowException("Path '{}' already exists".format(path)) + if isinstance(lgb_model, lgb.Booster): + model_data_subpath = "model.lgb" + else: + model_data_subpath = "model.pkl" + model_data_path = os.path.join(path, model_data_subpath) + os.makedirs(path) + if mlflow_model is None: + mlflow_model = Model() + if signature is not None: + mlflow_model.signature = signature + if input_example is not None: + _save_example(mlflow_model, input_example, path) + + # Save a LightGBM model + _save_model(lgb_model, model_data_path) + + lgb_model_class = _get_fully_qualified_class_name(lgb_model) + pyfunc.add_to_model( + mlflow_model, + loader_module="mlflow.lightgbm", + data=model_data_subpath, + env=_CONDA_ENV_FILE_NAME, + ) + mlflow_model.add_flavor( + FLAVOR_NAME, + lgb_version=lgb.__version__, + data=model_data_subpath, + model_class=lgb_model_class, + ) + mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) + + if conda_env is None: + if pip_requirements is None: + default_reqs = get_default_pip_requirements() + # To ensure `_load_pyfunc` can successfully load the model during the dependency + # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. + inferred_reqs = mlflow.models.infer_pip_requirements( + path, + FLAVOR_NAME, + fallback=default_reqs, + ) + default_reqs = sorted(set(inferred_reqs).union(default_reqs)) + else: + default_reqs = None + conda_env, pip_requirements, pip_constraints = _process_pip_requirements( + default_reqs, + pip_requirements, + extra_pip_requirements, + ) + else: + conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) + + with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: + yaml.safe_dump(conda_env, stream=f, default_flow_style=False) + + # Save `constraints.txt` if necessary + if pip_constraints: + write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) + + # Save `requirements.txt` + write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) + + +def _save_model(lgb_model, model_path): + # LightGBM Boosters are saved using the built-in method `save_model()`, + # whereas LightGBM scikit-learn models are serialized using Cloudpickle. + import lightgbm as lgb + + if isinstance(lgb_model, lgb.Booster): + lgb_model.save_model(model_path) + else: + import cloudpickle + + with open(model_path, "wb") as out: + cloudpickle.dump(lgb_model, out) + + +@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) +def log_model( + lgb_model, + artifact_path, + conda_env=None, + registered_model_name=None, + signature: ModelSignature = None, + input_example: ModelInputExample = None, + await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS, + pip_requirements=None, + extra_pip_requirements=None, + **kwargs, +): + """ + Log a LightGBM model as an MLflow artifact for the current run. + + :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved. + Note that models that implement the `scikit-learn API`_ are not supported. + :param artifact_path: Run-relative artifact path. + :param conda_env: {{ conda_env }} + :param registered_model_name: If given, create a model version under + ``registered_model_name``, also creating a registered model if one + with the given name does not exist. + + :param signature: :py:class:`ModelSignature ` + describes model input and output :py:class:`Schema `. + The model signature can be :py:func:`inferred ` + from datasets with valid model input (e.g. the training dataset with target + column omitted) and valid model output (e.g. model predictions generated on + the training dataset), for example: + + .. code-block:: python + + from mlflow.models.signature import infer_signature + train = df.drop_column("target_label") + predictions = ... # compute model predictions + signature = infer_signature(train, predictions) + :param input_example: Input example provides one or several instances of valid + model input. The example can be used as a hint of what data to feed the + model. The given example will be converted to a Pandas DataFrame and then + serialized to json using the Pandas split-oriented format. Bytes are + base64-encoded. + :param await_registration_for: Number of seconds to wait for the model version to finish + being created and is in ``READY`` status. By default, the function + waits for five minutes. Specify 0 or None to skip waiting. + :param pip_requirements: {{ pip_requirements }} + :param extra_pip_requirements: {{ extra_pip_requirements }} + :param kwargs: kwargs to pass to `lightgbm.Booster.save_model`_ method. + """ + Model.log( + artifact_path=artifact_path, + flavor=mlflow.lightgbm, + registered_model_name=registered_model_name, + lgb_model=lgb_model, + conda_env=conda_env, + signature=signature, + input_example=input_example, + await_registration_for=await_registration_for, + pip_requirements=pip_requirements, + extra_pip_requirements=extra_pip_requirements, + **kwargs, + ) + + +def _load_model(path): + """ + Load Model Implementation. + :param path: Local filesystem path to + the MLflow Model with the ``lightgbm`` flavor (MLflow < 1.23.0) or + the top-level MLflow Model directory (MLflow >= 1.23.0). + """ + + model_dir = os.path.dirname(path) if os.path.isfile(path) else path + flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME) + + model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster") + lgb_model_path = os.path.join(model_dir, flavor_conf.get("data")) + + if model_class == "lightgbm.basic.Booster": + import lightgbm as lgb + + model = lgb.Booster(model_file=lgb_model_path) + else: + # LightGBM scikit-learn models are deserialized using Cloudpickle. + import cloudpickle + + with open(lgb_model_path, "rb") as f: + model = cloudpickle.load(f) + + return model + + +def _load_pyfunc(path): + """ + Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``. + + :param path: Local filesystem path to the MLflow Model with the ``lightgbm`` flavor. + """ + return _LGBModelWrapper(_load_model(path)) + + +def load_model(model_uri, dst_path=None): + """ + Load a LightGBM model from a local file or a run. + + :param model_uri: The location, in URI format, of the MLflow model. For example: + + - ``/Users/me/path/to/local/model`` + - ``relative/path/to/local/model`` + - ``s3://my_bucket/path/to/model`` + - ``runs://run-relative/path/to/model`` + + For more information about supported URI schemes, see + `Referencing Artifacts `_. + :param dst_path: The local filesystem path to which to download the model artifact. + This directory must already exist. If unspecified, a local output + path will be created. + + :return: A LightGBM model (an instance of `lightgbm.Booster`_). + """ + local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path) + return _load_model(path=local_model_path) + + +class _LGBModelWrapper: + def __init__(self, lgb_model): + self.lgb_model = lgb_model + + def predict(self, dataframe): + return self.lgb_model.predict(dataframe) + + +def _autolog_callback(env, metrics_logger, eval_results): + res = {} + for data_name, eval_name, value, _ in env.evaluation_result_list: + key = data_name + "-" + eval_name + res[key] = value + metrics_logger.record_metrics(res, env.iteration) + eval_results.append(res) + + +@autologging_integration(FLAVOR_NAME) +def autolog( + log_input_examples=False, + log_model_signatures=True, + log_models=True, + disable=False, + exclusive=False, + disable_for_unsupported_versions=False, + silent=False, +): # pylint: disable=unused-argument + """ + Enables (or disables) and configures autologging from LightGBM to MLflow. Logs the following: + + - parameters specified in `lightgbm.train`_. + - metrics on each iteration (if ``valid_sets`` specified). + - metrics at the best iteration (if ``early_stopping_rounds`` specified). + - feature importance (both "split" and "gain") as JSON files and plots. + - trained model, including: + - an example of valid input. + - inferred signature of the inputs and outputs of the model. + + Note that the `scikit-learn API`_ is not supported. + + :param log_input_examples: If ``True``, input examples from training datasets are collected and + logged along with LightGBM model artifacts during training. If + ``False``, input examples are not logged. + Note: Input examples are MLflow model attributes + and are only collected if ``log_models`` is also ``True``. + :param log_model_signatures: If ``True``, + :py:class:`ModelSignatures ` + describing model inputs and outputs are collected and logged along + with LightGBM model artifacts during training. If ``False``, + signatures are not logged. + Note: Model signatures are MLflow model attributes + and are only collected if ``log_models`` is also ``True``. + :param log_models: If ``True``, trained models are logged as MLflow model artifacts. + If ``False``, trained models are not logged. + Input examples and model signatures, which are attributes of MLflow models, + are also omitted when ``log_models`` is ``False``. + :param disable: If ``True``, disables the LightGBM autologging integration. If ``False``, + enables the LightGBM autologging integration. + :param exclusive: If ``True``, autologged content is not logged to user-created fluent runs. + If ``False``, autologged content is logged to the active fluent run, + which may be user-created. + :param disable_for_unsupported_versions: If ``True``, disable autologging for versions of + lightgbm that have not been tested against this version of the MLflow client + or are incompatible. + :param silent: If ``True``, suppress all event logs and warnings from MLflow during LightGBM + autologging. If ``False``, show all events and warnings during LightGBM + autologging. + """ + import lightgbm + import numpy as np + + # Patching this function so we can get a copy of the data given to Dataset.__init__ + # to use as an input example and for inferring the model signature. + # (there is no way to get the data back from a Dataset object once it is consumed by train) + # We store it on the Dataset object so the train function is able to read it. + def __init__(original, self, *args, **kwargs): + data = args[0] if len(args) > 0 else kwargs.get("data") + + if data is not None: + try: + if isinstance(data, str): + raise Exception( + "cannot gather example input when dataset is loaded from a file." + ) + + input_example_info = InputExampleInfo( + input_example=deepcopy(data[:INPUT_EXAMPLE_SAMPLE_ROWS]) + ) + except Exception as e: + input_example_info = InputExampleInfo(error_msg=str(e)) + + setattr(self, "input_example_info", input_example_info) + + original(self, *args, **kwargs) + + def train(original, *args, **kwargs): + def record_eval_results(eval_results, metrics_logger): + """ + Create a callback function that records evaluation results. + """ + return picklable_exception_safe_function( + functools.partial( + _autolog_callback, metrics_logger=metrics_logger, eval_results=eval_results + ) + ) + + def log_feature_importance_plot(features, importance, importance_type): + """ + Log feature importance plot. + """ + import matplotlib.pyplot as plt + + indices = np.argsort(importance) + features = np.array(features)[indices] + importance = importance[indices] + num_features = len(features) + + # If num_features > 10, increase the figure height to prevent the plot + # from being too dense. + w, h = [6.4, 4.8] # matplotlib's default figure size + h = h + 0.1 * num_features if num_features > 10 else h + fig, ax = plt.subplots(figsize=(w, h)) + + yloc = np.arange(num_features) + ax.barh(yloc, importance, align="center", height=0.5) + ax.set_yticks(yloc) + ax.set_yticklabels(features) + ax.set_xlabel("Importance") + ax.set_title("Feature Importance ({})".format(importance_type)) + fig.tight_layout() + + tmpdir = tempfile.mkdtemp() + try: + # pylint: disable=undefined-loop-variable + filepath = os.path.join(tmpdir, "feature_importance_{}.png".format(imp_type)) + fig.savefig(filepath) + mlflow.log_artifact(filepath) + finally: + plt.close(fig) + shutil.rmtree(tmpdir) + + autologging_client = MlflowAutologgingQueueingClient() + + # logging booster params separately via mlflow.log_params to extract key/value pairs + # and make it easier to compare them across runs. + booster_params = args[0] if len(args) > 0 else kwargs["params"] + autologging_client.log_params(run_id=mlflow.active_run().info.run_id, params=booster_params) + + unlogged_params = [ + "params", + "train_set", + "valid_sets", + "valid_names", + "fobj", + "feval", + "init_model", + "evals_result", + "learning_rates", + "callbacks", + ] + + params_to_log_for_fn = get_mlflow_run_params_for_fn_args( + original, args, kwargs, unlogged_params + ) + autologging_client.log_params( + run_id=mlflow.active_run().info.run_id, params=params_to_log_for_fn + ) + + param_logging_operations = autologging_client.flush(synchronous=False) + + all_arg_names = _get_arg_names(original) + num_pos_args = len(args) + + # adding a callback that records evaluation results. + eval_results = [] + callbacks_index = all_arg_names.index("callbacks") + run_id = mlflow.active_run().info.run_id + with batch_metrics_logger(run_id) as metrics_logger: + callback = record_eval_results(eval_results, metrics_logger) + if num_pos_args >= callbacks_index + 1: + tmp_list = list(args) + tmp_list[callbacks_index] += [callback] + args = tuple(tmp_list) + elif "callbacks" in kwargs and kwargs["callbacks"] is not None: + kwargs["callbacks"] += [callback] + else: + kwargs["callbacks"] = [callback] + + # training model + model = original(*args, **kwargs) + + # If early_stopping_rounds is present, logging metrics at the best iteration + # as extra metrics with the max step + 1. + early_stopping_index = all_arg_names.index("early_stopping_rounds") + early_stopping = ( + num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs + ) + if early_stopping: + extra_step = len(eval_results) + autologging_client.log_metrics( + run_id=mlflow.active_run().info.run_id, + metrics={ + "stopped_iteration": extra_step, + # best_iteration is set even if training does not stop early. + "best_iteration": model.best_iteration, + }, + ) + # iteration starts from 1 in LightGBM. + last_iter_results = eval_results[model.best_iteration - 1] + autologging_client.log_metrics( + run_id=mlflow.active_run().info.run_id, + metrics=last_iter_results, + step=extra_step, + ) + early_stopping_logging_operations = autologging_client.flush(synchronous=False) + + # logging feature importance as artifacts. + for imp_type in ["split", "gain"]: + features = model.feature_name() + importance = model.feature_importance(importance_type=imp_type) + try: + log_feature_importance_plot(features, importance, imp_type) + except Exception: + _logger.exception( + "Failed to log feature importance plot. LightGBM autologging " + "will ignore the failure and continue. Exception: " + ) + + imp = {ft: imp for ft, imp in zip(features, importance.tolist())} + tmpdir = tempfile.mkdtemp() + try: + filepath = os.path.join(tmpdir, "feature_importance_{}.json".format(imp_type)) + with open(filepath, "w") as f: + json.dump(imp, f, indent=2) + mlflow.log_artifact(filepath) + finally: + shutil.rmtree(tmpdir) + + # train_set must exist as the original train function already ran successfully + train_set = args[1] if len(args) > 1 else kwargs.get("train_set") + + # it is possible that the dataset was constructed before the patched + # constructor was applied, so we cannot assume the input_example_info exists + input_example_info = getattr(train_set, "input_example_info", None) + + def get_input_example(): + if input_example_info is None: + raise Exception(ENSURE_AUTOLOGGING_ENABLED_TEXT) + if input_example_info.error_msg is not None: + raise Exception(input_example_info.error_msg) + return input_example_info.input_example + + def infer_model_signature(input_example): + model_output = model.predict(input_example) + model_signature = infer_signature(input_example, model_output) + return model_signature + + # Whether to automatically log the trained model based on boolean flag. + if log_models: + # Will only resolve `input_example` and `signature` if `log_models` is `True`. + input_example, signature = resolve_input_example_and_signature( + get_input_example, + infer_model_signature, + log_input_examples, + log_model_signatures, + _logger, + ) + + log_model( + model, + artifact_path="model", + signature=signature, + input_example=input_example, + ) + + param_logging_operations.await_completion() + if early_stopping: + early_stopping_logging_operations.await_completion() + + return model + + safe_patch(FLAVOR_NAME, lightgbm, "train", train, manage_run=True) + safe_patch(FLAVOR_NAME, lightgbm.Dataset, "__init__", __init__) From 7397fce69538c3d8555900848c093cef081fae6a Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Wed, 15 Dec 2021 16:07:33 -0800 Subject: [PATCH 5/7] remove prev folders Signed-off-by: Junwen Yao --- mlflow/lightgbm/__init__.py | 592 ------------------------------------ mlflow/lightgbm/utils.py | 110 ------- 2 files changed, 702 deletions(-) delete mode 100644 mlflow/lightgbm/__init__.py delete mode 100644 mlflow/lightgbm/utils.py diff --git a/mlflow/lightgbm/__init__.py b/mlflow/lightgbm/__init__.py deleted file mode 100644 index ba8c361ff2f0b..0000000000000 --- a/mlflow/lightgbm/__init__.py +++ /dev/null @@ -1,592 +0,0 @@ -""" -The ``mlflow.lightgbm`` module provides an API for logging and loading LightGBM models. -This module exports LightGBM models with the following flavors: - -LightGBM (native) format - This is the main flavor that can be loaded back into LightGBM. -:py:mod:`mlflow.pyfunc` - Produced for use by generic pyfunc-based deployment tools and batch inference. - -.. _lightgbm.Booster: - https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster -.. _lightgbm.Booster.save_model: - https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html - #lightgbm.Booster.save_model -.. _lightgbm.train: - https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html#lightgbm-train -.. _scikit-learn API: - https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api -""" -import os -import yaml -import json -import tempfile -import shutil -import logging -import functools -from copy import deepcopy - -import mlflow -from mlflow import pyfunc -from mlflow.models import Model, infer_signature -from mlflow.models.model import MLMODEL_FILE_NAME -from mlflow.models.signature import ModelSignature -from mlflow.models.utils import ModelInputExample, _save_example -from mlflow.utils import _get_fully_qualified_class_name -from mlflow.tracking.artifact_utils import _download_artifact_from_uri -from mlflow.utils.environment import ( - _mlflow_conda_env, - _validate_env_arguments, - _process_pip_requirements, - _process_conda_env, - _CONDA_ENV_FILE_NAME, - _REQUIREMENTS_FILE_NAME, - _CONSTRAINTS_FILE_NAME, -) -from mlflow.utils.requirements_utils import _get_pinned_requirement -from mlflow.utils.file_utils import write_to -from mlflow.utils.docstring_utils import format_docstring, LOG_MODEL_PARAM_DOCS -from mlflow.utils.model_utils import _get_flavor_configuration -from mlflow.exceptions import MlflowException -from mlflow.utils.arguments_utils import _get_arg_names -from mlflow.utils.autologging_utils import ( - autologging_integration, - safe_patch, - picklable_exception_safe_function, - get_mlflow_run_params_for_fn_args, - INPUT_EXAMPLE_SAMPLE_ROWS, - resolve_input_example_and_signature, - InputExampleInfo, - ENSURE_AUTOLOGGING_ENABLED_TEXT, - batch_metrics_logger, - MlflowAutologgingQueueingClient, -) -from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS -from mlflow.lightgbm.utils import _save_lgb_model, _load_lgb_model - - -FLAVOR_NAME = "lightgbm" - -_logger = logging.getLogger(__name__) - - -def get_default_pip_requirements(): - """ - :return: A list of default pip requirements for MLflow Models produced by this flavor. - Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment - that, at minimum, contains these requirements. - """ - return [_get_pinned_requirement("lightgbm")] - - -def get_default_conda_env(): - """ - :return: The default Conda environment for MLflow Models produced by calls to - :func:`save_model()` and :func:`log_model()`. - """ - return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) - - -@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) -def save_model( - lgb_model, - path, - conda_env=None, - mlflow_model=None, - signature: ModelSignature = None, - input_example: ModelInputExample = None, - pip_requirements=None, - extra_pip_requirements=None, -): - """ - Save a LightGBM model to a path on the local file system. - - :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved. - Note that models that implement the `scikit-learn API`_ are not supported. - :param path: Local path where the model is to be saved. - :param conda_env: {{ conda_env }} - :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. - - :param signature: :py:class:`ModelSignature ` - describes model input and output :py:class:`Schema `. - The model signature can be :py:func:`inferred ` - from datasets with valid model input (e.g. the training dataset with target - column omitted) and valid model output (e.g. model predictions generated on - the training dataset), for example: - - .. code-block:: python - - from mlflow.models.signature import infer_signature - train = df.drop_column("target_label") - predictions = ... # compute model predictions - signature = infer_signature(train, predictions) - :param input_example: Input example provides one or several instances of valid - model input. The example can be used as a hint of what data to feed the - model. The given example will be converted to a Pandas DataFrame and then - serialized to json using the Pandas split-oriented format. Bytes are - base64-encoded. - :param pip_requirements: {{ pip_requirements }} - :param extra_pip_requirements: {{ extra_pip_requirements }} - """ - import lightgbm as lgb - - _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements) - - path = os.path.abspath(path) - if os.path.exists(path): - raise MlflowException("Path '{}' already exists".format(path)) - model_data_subpath = "model.lgb" - model_data_path = os.path.join(path, model_data_subpath) - os.makedirs(path) - if mlflow_model is None: - mlflow_model = Model() - if signature is not None: - mlflow_model.signature = signature - if input_example is not None: - _save_example(mlflow_model, input_example, path) - - # Save a LightGBM model - _save_lgb_model(lgb_model, model_data_path) - - pyfunc.add_to_model( - mlflow_model, - loader_module="mlflow.lightgbm", - data=model_data_subpath, - env=_CONDA_ENV_FILE_NAME, - ) - mlflow_model.add_flavor( - FLAVOR_NAME, - lgb_version=lgb.__version__, - data=model_data_subpath, - model_class=_get_fully_qualified_class_name(lgb_model), - ) - mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME)) - - if conda_env is None: - if pip_requirements is None: - default_reqs = get_default_pip_requirements() - # To ensure `_load_pyfunc` can successfully load the model during the dependency - # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. - inferred_reqs = mlflow.models.infer_pip_requirements( - path, - FLAVOR_NAME, - fallback=default_reqs, - ) - default_reqs = sorted(set(inferred_reqs).union(default_reqs)) - else: - default_reqs = None - conda_env, pip_requirements, pip_constraints = _process_pip_requirements( - default_reqs, - pip_requirements, - extra_pip_requirements, - ) - else: - conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env) - - with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f: - yaml.safe_dump(conda_env, stream=f, default_flow_style=False) - - # Save `constraints.txt` if necessary - if pip_constraints: - write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints)) - - # Save `requirements.txt` - write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements)) - - -@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) -def log_model( - lgb_model, - artifact_path, - conda_env=None, - registered_model_name=None, - signature: ModelSignature = None, - input_example: ModelInputExample = None, - await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS, - pip_requirements=None, - extra_pip_requirements=None, - **kwargs, -): - """ - Log a LightGBM model as an MLflow artifact for the current run. - - :param lgb_model: LightGBM model (an instance of `lightgbm.Booster`_) to be saved. - Note that models that implement the `scikit-learn API`_ are not supported. - :param artifact_path: Run-relative artifact path. - :param conda_env: {{ conda_env }} - :param registered_model_name: If given, create a model version under - ``registered_model_name``, also creating a registered model if one - with the given name does not exist. - - :param signature: :py:class:`ModelSignature ` - describes model input and output :py:class:`Schema `. - The model signature can be :py:func:`inferred ` - from datasets with valid model input (e.g. the training dataset with target - column omitted) and valid model output (e.g. model predictions generated on - the training dataset), for example: - - .. code-block:: python - - from mlflow.models.signature import infer_signature - train = df.drop_column("target_label") - predictions = ... # compute model predictions - signature = infer_signature(train, predictions) - :param input_example: Input example provides one or several instances of valid - model input. The example can be used as a hint of what data to feed the - model. The given example will be converted to a Pandas DataFrame and then - serialized to json using the Pandas split-oriented format. Bytes are - base64-encoded. - :param await_registration_for: Number of seconds to wait for the model version to finish - being created and is in ``READY`` status. By default, the function - waits for five minutes. Specify 0 or None to skip waiting. - :param pip_requirements: {{ pip_requirements }} - :param extra_pip_requirements: {{ extra_pip_requirements }} - :param kwargs: kwargs to pass to `lightgbm.Booster.save_model`_ method. - """ - Model.log( - artifact_path=artifact_path, - flavor=mlflow.lightgbm, - registered_model_name=registered_model_name, - lgb_model=lgb_model, - conda_env=conda_env, - signature=signature, - input_example=input_example, - await_registration_for=await_registration_for, - pip_requirements=pip_requirements, - extra_pip_requirements=extra_pip_requirements, - **kwargs, - ) - - -def _load_model(path): - """ - :param path: Local filesystem path to - the MLflow Model with the ``lightgbm`` flavor (MLflow < x.x.x) or - the top-level MLflow Model directory (MLflow >= x.x.x). - """ - model_dir = os.path.dirname(path) if os.path.isfile(path) else path - flavor_conf = _get_flavor_configuration(model_path=model_dir, flavor_name=FLAVOR_NAME) - - model_class = flavor_conf.get("model_class", "lightgbm.basic.Booster") - lgb_model_path = os.path.join(model_dir, flavor_conf.get("data")) - - return _load_lgb_model(model_class, lgb_model_path) - - -def _load_pyfunc(path): - """ - Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``. - - :param path: Local filesystem path to the MLflow Model with the ``lightgbm`` flavor. - """ - return _LGBModelWrapper(_load_model(path)) - - -def load_model(model_uri, dst_path=None): - """ - Load a LightGBM model from a local file or a run. - - :param model_uri: The location, in URI format, of the MLflow model. For example: - - - ``/Users/me/path/to/local/model`` - - ``relative/path/to/local/model`` - - ``s3://my_bucket/path/to/model`` - - ``runs://run-relative/path/to/model`` - - For more information about supported URI schemes, see - `Referencing Artifacts `_. - :param dst_path: The local filesystem path to which to download the model artifact. - This directory must already exist. If unspecified, a local output - path will be created. - - :return: A LightGBM model (an instance of `lightgbm.Booster`_) or LightGBM scikit-learn - models, depending on the saved model class specification. - """ - local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path) - return _load_model(path=local_model_path) - - -class _LGBModelWrapper: - def __init__(self, lgb_model): - self.lgb_model = lgb_model - - def predict(self, dataframe): - return self.lgb_model.predict(dataframe) - - -def _autolog_callback(env, metrics_logger, eval_results): - res = {} - for data_name, eval_name, value, _ in env.evaluation_result_list: - key = data_name + "-" + eval_name - res[key] = value - metrics_logger.record_metrics(res, env.iteration) - eval_results.append(res) - - -@autologging_integration(FLAVOR_NAME) -def autolog( - log_input_examples=False, - log_model_signatures=True, - log_models=True, - disable=False, - exclusive=False, - disable_for_unsupported_versions=False, - silent=False, -): # pylint: disable=unused-argument - """ - Enables (or disables) and configures autologging from LightGBM to MLflow. Logs the following: - - - parameters specified in `lightgbm.train`_. - - metrics on each iteration (if ``valid_sets`` specified). - - metrics at the best iteration (if ``early_stopping_rounds`` specified). - - feature importance (both "split" and "gain") as JSON files and plots. - - trained model, including: - - an example of valid input. - - inferred signature of the inputs and outputs of the model. - - Note that the `scikit-learn API`_ is not supported. - - :param log_input_examples: If ``True``, input examples from training datasets are collected and - logged along with LightGBM model artifacts during training. If - ``False``, input examples are not logged. - Note: Input examples are MLflow model attributes - and are only collected if ``log_models`` is also ``True``. - :param log_model_signatures: If ``True``, - :py:class:`ModelSignatures ` - describing model inputs and outputs are collected and logged along - with LightGBM model artifacts during training. If ``False``, - signatures are not logged. - Note: Model signatures are MLflow model attributes - and are only collected if ``log_models`` is also ``True``. - :param log_models: If ``True``, trained models are logged as MLflow model artifacts. - If ``False``, trained models are not logged. - Input examples and model signatures, which are attributes of MLflow models, - are also omitted when ``log_models`` is ``False``. - :param disable: If ``True``, disables the LightGBM autologging integration. If ``False``, - enables the LightGBM autologging integration. - :param exclusive: If ``True``, autologged content is not logged to user-created fluent runs. - If ``False``, autologged content is logged to the active fluent run, - which may be user-created. - :param disable_for_unsupported_versions: If ``True``, disable autologging for versions of - lightgbm that have not been tested against this version of the MLflow client - or are incompatible. - :param silent: If ``True``, suppress all event logs and warnings from MLflow during LightGBM - autologging. If ``False``, show all events and warnings during LightGBM - autologging. - """ - import lightgbm - import numpy as np - - # Patching this function so we can get a copy of the data given to Dataset.__init__ - # to use as an input example and for inferring the model signature. - # (there is no way to get the data back from a Dataset object once it is consumed by train) - # We store it on the Dataset object so the train function is able to read it. - def __init__(original, self, *args, **kwargs): - data = args[0] if len(args) > 0 else kwargs.get("data") - - if data is not None: - try: - if isinstance(data, str): - raise Exception( - "cannot gather example input when dataset is loaded from a file." - ) - - input_example_info = InputExampleInfo( - input_example=deepcopy(data[:INPUT_EXAMPLE_SAMPLE_ROWS]) - ) - except Exception as e: - input_example_info = InputExampleInfo(error_msg=str(e)) - - setattr(self, "input_example_info", input_example_info) - - original(self, *args, **kwargs) - - def train(original, *args, **kwargs): - def record_eval_results(eval_results, metrics_logger): - """ - Create a callback function that records evaluation results. - """ - return picklable_exception_safe_function( - functools.partial( - _autolog_callback, metrics_logger=metrics_logger, eval_results=eval_results - ) - ) - - def log_feature_importance_plot(features, importance, importance_type): - """ - Log feature importance plot. - """ - import matplotlib.pyplot as plt - - indices = np.argsort(importance) - features = np.array(features)[indices] - importance = importance[indices] - num_features = len(features) - - # If num_features > 10, increase the figure height to prevent the plot - # from being too dense. - w, h = [6.4, 4.8] # matplotlib's default figure size - h = h + 0.1 * num_features if num_features > 10 else h - fig, ax = plt.subplots(figsize=(w, h)) - - yloc = np.arange(num_features) - ax.barh(yloc, importance, align="center", height=0.5) - ax.set_yticks(yloc) - ax.set_yticklabels(features) - ax.set_xlabel("Importance") - ax.set_title("Feature Importance ({})".format(importance_type)) - fig.tight_layout() - - tmpdir = tempfile.mkdtemp() - try: - # pylint: disable=undefined-loop-variable - filepath = os.path.join(tmpdir, "feature_importance_{}.png".format(imp_type)) - fig.savefig(filepath) - mlflow.log_artifact(filepath) - finally: - plt.close(fig) - shutil.rmtree(tmpdir) - - autologging_client = MlflowAutologgingQueueingClient() - - # logging booster params separately via mlflow.log_params to extract key/value pairs - # and make it easier to compare them across runs. - booster_params = args[0] if len(args) > 0 else kwargs["params"] - autologging_client.log_params(run_id=mlflow.active_run().info.run_id, params=booster_params) - - unlogged_params = [ - "params", - "train_set", - "valid_sets", - "valid_names", - "fobj", - "feval", - "init_model", - "evals_result", - "learning_rates", - "callbacks", - ] - - params_to_log_for_fn = get_mlflow_run_params_for_fn_args( - original, args, kwargs, unlogged_params - ) - autologging_client.log_params( - run_id=mlflow.active_run().info.run_id, params=params_to_log_for_fn - ) - - param_logging_operations = autologging_client.flush(synchronous=False) - - all_arg_names = _get_arg_names(original) - num_pos_args = len(args) - - # adding a callback that records evaluation results. - eval_results = [] - callbacks_index = all_arg_names.index("callbacks") - run_id = mlflow.active_run().info.run_id - with batch_metrics_logger(run_id) as metrics_logger: - callback = record_eval_results(eval_results, metrics_logger) - if num_pos_args >= callbacks_index + 1: - tmp_list = list(args) - tmp_list[callbacks_index] += [callback] - args = tuple(tmp_list) - elif "callbacks" in kwargs and kwargs["callbacks"] is not None: - kwargs["callbacks"] += [callback] - else: - kwargs["callbacks"] = [callback] - - # training model - model = original(*args, **kwargs) - - # If early_stopping_rounds is present, logging metrics at the best iteration - # as extra metrics with the max step + 1. - early_stopping_index = all_arg_names.index("early_stopping_rounds") - early_stopping = ( - num_pos_args >= early_stopping_index + 1 or "early_stopping_rounds" in kwargs - ) - if early_stopping: - extra_step = len(eval_results) - autologging_client.log_metrics( - run_id=mlflow.active_run().info.run_id, - metrics={ - "stopped_iteration": extra_step, - # best_iteration is set even if training does not stop early. - "best_iteration": model.best_iteration, - }, - ) - # iteration starts from 1 in LightGBM. - last_iter_results = eval_results[model.best_iteration - 1] - autologging_client.log_metrics( - run_id=mlflow.active_run().info.run_id, - metrics=last_iter_results, - step=extra_step, - ) - early_stopping_logging_operations = autologging_client.flush(synchronous=False) - - # logging feature importance as artifacts. - for imp_type in ["split", "gain"]: - features = model.feature_name() - importance = model.feature_importance(importance_type=imp_type) - try: - log_feature_importance_plot(features, importance, imp_type) - except Exception: - _logger.exception( - "Failed to log feature importance plot. LightGBM autologging " - "will ignore the failure and continue. Exception: " - ) - - imp = {ft: imp for ft, imp in zip(features, importance.tolist())} - tmpdir = tempfile.mkdtemp() - try: - filepath = os.path.join(tmpdir, "feature_importance_{}.json".format(imp_type)) - with open(filepath, "w") as f: - json.dump(imp, f, indent=2) - mlflow.log_artifact(filepath) - finally: - shutil.rmtree(tmpdir) - - # train_set must exist as the original train function already ran successfully - train_set = args[1] if len(args) > 1 else kwargs.get("train_set") - - # it is possible that the dataset was constructed before the patched - # constructor was applied, so we cannot assume the input_example_info exists - input_example_info = getattr(train_set, "input_example_info", None) - - def get_input_example(): - if input_example_info is None: - raise Exception(ENSURE_AUTOLOGGING_ENABLED_TEXT) - if input_example_info.error_msg is not None: - raise Exception(input_example_info.error_msg) - return input_example_info.input_example - - def infer_model_signature(input_example): - model_output = model.predict(input_example) - model_signature = infer_signature(input_example, model_output) - return model_signature - - # Whether to automatically log the trained model based on boolean flag. - if log_models: - # Will only resolve `input_example` and `signature` if `log_models` is `True`. - input_example, signature = resolve_input_example_and_signature( - get_input_example, - infer_model_signature, - log_input_examples, - log_model_signatures, - _logger, - ) - - log_model( - model, - artifact_path="model", - signature=signature, - input_example=input_example, - ) - - param_logging_operations.await_completion() - if early_stopping: - early_stopping_logging_operations.await_completion() - - return model - - safe_patch(FLAVOR_NAME, lightgbm, "train", train, manage_run=True) - safe_patch(FLAVOR_NAME, lightgbm.Dataset, "__init__", __init__) diff --git a/mlflow/lightgbm/utils.py b/mlflow/lightgbm/utils.py deleted file mode 100644 index e4364b193afa9..0000000000000 --- a/mlflow/lightgbm/utils.py +++ /dev/null @@ -1,110 +0,0 @@ -import importlib -import json -import os.path -import warnings -import numpy as np - - -def _label_encoder_to_json(le): - """Returns a JSON compatible dictionary""" - meta = {} - for k, v in le.__dict__.items(): - if isinstance(v, np.ndarray): - meta[k] = v.tolist() - else: - meta[k] = v - return meta - - -def _label_encoder_from_json(doc): - """Load the encoder back from a JSON compatible dict""" - from lightgbm.compat import _LGBMLabelEncoder - - le = _LGBMLabelEncoder() - meta = {} - for k, v in doc.items(): - if k == "classes_": - le.classes_ = np.array(v) if v is not None else None - continue - meta[k] = v - le.__dict__.update(meta) - return le - - -def _save_lgb_attr(model_dir, fname, attr_dict): - with open(os.path.join(model_dir, "{}.json".format(fname)), "w") as f: - json.dump(attr_dict, f) - - -def _load_lgb_attr(model_dir, fname): - try: - with open(os.path.join(model_dir, "{}.json".format(fname))) as f: - attr = json.load(f) - return attr - except IOError: - return None - - -def _save_lgb_model(lgb_model, model_path) -> None: - import lightgbm as lgb - - model_dir = os.path.dirname(model_path) - - if not isinstance(lgb_model, lgb.Booster): - meta = {} - for k, v in lgb_model.__dict__.items(): - if k == "_le": - meta["_le"] = _label_encoder_to_json(v) if v else None - continue - if k == "_Booster": - continue - if k == "_classes" and v is not None: - meta["_classes"] = v.tolist() - continue - if k == "_class_map" and v: - py_dict = {} - for clazz, encoded in v.items(): - py_dict[int(clazz)] = int(encoded) - v = py_dict - try: - json.dumps({k: v}) - meta[k] = v - except TypeError: - warnings.warn(str(k) + " is not saved in Scikit-Learn meta.", UserWarning) - _save_lgb_attr(model_dir, "scikit-learn", meta) - lgb_model = lgb_model._Booster - - lgb_model.save_model(model_path) - _save_lgb_attr(model_dir, "params", lgb_model.params) - - -def _load_lgb_model(lgb_model_class, model_path): - import lightgbm as lgb - - module, cls = lgb_model_class.rsplit(".", maxsplit=1) - model_dir = os.path.dirname(model_path) - sk_attr = _load_lgb_attr(model_dir, "scikit-learn") - bst_params = _load_lgb_attr(model_dir, "params") - - booster = lgb.Booster(model_file=model_path, params=bst_params) - - if sk_attr is None: - warnings.warn("Loading a native LightGBM model with Scikit-Learn interface.") - return booster - - sk_model = getattr(importlib.import_module(module), cls)() - states = {} - for k, v in sk_attr.items(): - if k == "_le": - sk_model._le = _label_encoder_from_json(v) - continue - if k == "_classes": - sk_model._classes = np.array(v) - continue - states[k] = v - sk_model.__dict__.update(states) - # Delete the attribute after load - booster.set_attr(scikit_learn=None) - sk_model._Booster = booster - - return sk_model From c212f5533ef2a4b03ce474631028f8b8d4a8126a Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Wed, 22 Dec 2021 09:04:54 -0800 Subject: [PATCH 6/7] address review Signed-off-by: Junwen Yao --- mlflow/lightgbm.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py index 397cde1341b22..f91a82c63efac 100644 --- a/mlflow/lightgbm.py +++ b/mlflow/lightgbm.py @@ -68,21 +68,24 @@ _logger = logging.getLogger(__name__) -def get_default_pip_requirements(): +def get_default_pip_requirements(include_cloudpickle=False): """ :return: A list of default pip requirements for MLflow Models produced by this flavor. Calls to :func:`save_model()` and :func:`log_model()` produce a pip environment that, at minimum, contains these requirements. """ - return [_get_pinned_requirement("lightgbm"), _get_pinned_requirement("cloudpickle")] + pip_deps = [_get_pinned_requirement("lightgbm")] + if include_cloudpickle: + pip_deps.append(_get_pinned_requirement("cloudpickle")) + return pip_deps -def get_default_conda_env(): +def get_default_conda_env(include_cloudpickle=False): """ :return: The default Conda environment for MLflow Models produced by calls to :func:`save_model()` and :func:`log_model()`. """ - return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements()) + return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements(include_cloudpickle)) @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) @@ -133,10 +136,7 @@ def save_model( path = os.path.abspath(path) if os.path.exists(path): raise MlflowException("Path '{}' already exists".format(path)) - if isinstance(lgb_model, lgb.Booster): - model_data_subpath = "model.lgb" - else: - model_data_subpath = "model.pkl" + model_data_subpath = "model.lgb" if isinstance(lgb_model, lgb.Booster) else "model.pkl" model_data_path = os.path.join(path, model_data_subpath) os.makedirs(path) if mlflow_model is None: @@ -146,8 +146,8 @@ def save_model( if input_example is not None: _save_example(mlflow_model, input_example, path) - # Save a LightGBM model - _save_model(lgb_model, model_data_path) + # Save a LightGBM model and retrieve its model type + is_sklearn_model = _save_model(lgb_model, model_data_path) lgb_model_class = _get_fully_qualified_class_name(lgb_model) pyfunc.add_to_model( @@ -166,7 +166,7 @@ def save_model( if conda_env is None: if pip_requirements is None: - default_reqs = get_default_pip_requirements() + default_reqs = get_default_pip_requirements(include_cloudpickle=is_sklearn_model) # To ensure `_load_pyfunc` can successfully load the model during the dependency # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. inferred_reqs = mlflow.models.infer_pip_requirements( @@ -197,17 +197,24 @@ def save_model( def _save_model(lgb_model, model_path): - # LightGBM Boosters are saved using the built-in method `save_model()`, - # whereas LightGBM scikit-learn models are serialized using Cloudpickle. + """ + LightGBM Boosters are saved using the built-in method `save_model()`, + whereas LightGBM scikit-learn models are serialized using Cloudpickle. + + :return: A boolean value indicating whether the save model is a scikit-learn + model. The returned value will be passed to `get_default_pip_requirements`. + """ import lightgbm as lgb if isinstance(lgb_model, lgb.Booster): lgb_model.save_model(model_path) + return False else: import cloudpickle with open(model_path, "wb") as out: cloudpickle.dump(lgb_model, out) + return True @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME)) @@ -329,7 +336,8 @@ def load_model(model_uri, dst_path=None): This directory must already exist. If unspecified, a local output path will be created. - :return: A LightGBM model (an instance of `lightgbm.Booster`_). + :return: A LightGBM model (an instance of `lightgbm.Booster`_) or a LightGBM scikit-learn + model, depending on the saved model class specification. """ local_model_path = _download_artifact_from_uri(artifact_uri=model_uri, output_path=dst_path) return _load_model(path=local_model_path) From 2b3102905923378185c2088d5a28547db437884d Mon Sep 17 00:00:00 2001 From: Junwen Yao Date: Wed, 22 Dec 2021 14:25:34 -0800 Subject: [PATCH 7/7] a better soln Signed-off-by: Junwen Yao --- mlflow/lightgbm.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mlflow/lightgbm.py b/mlflow/lightgbm.py index f91a82c63efac..b8ab5df94d88d 100644 --- a/mlflow/lightgbm.py +++ b/mlflow/lightgbm.py @@ -146,8 +146,8 @@ def save_model( if input_example is not None: _save_example(mlflow_model, input_example, path) - # Save a LightGBM model and retrieve its model type - is_sklearn_model = _save_model(lgb_model, model_data_path) + # Save a LightGBM model + _save_model(lgb_model, model_data_path) lgb_model_class = _get_fully_qualified_class_name(lgb_model) pyfunc.add_to_model( @@ -166,7 +166,9 @@ def save_model( if conda_env is None: if pip_requirements is None: - default_reqs = get_default_pip_requirements(include_cloudpickle=is_sklearn_model) + default_reqs = get_default_pip_requirements( + include_cloudpickle=not isinstance(lgb_model, lgb.Booster) + ) # To ensure `_load_pyfunc` can successfully load the model during the dependency # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file. inferred_reqs = mlflow.models.infer_pip_requirements( @@ -200,21 +202,16 @@ def _save_model(lgb_model, model_path): """ LightGBM Boosters are saved using the built-in method `save_model()`, whereas LightGBM scikit-learn models are serialized using Cloudpickle. - - :return: A boolean value indicating whether the save model is a scikit-learn - model. The returned value will be passed to `get_default_pip_requirements`. """ import lightgbm as lgb if isinstance(lgb_model, lgb.Booster): lgb_model.save_model(model_path) - return False else: import cloudpickle with open(model_path, "wb") as out: cloudpickle.dump(lgb_model, out) - return True @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))