Autologging functionality for scikit-learn integration with XGBoost (Part 2) #5055

Closed · wants to merge 7 commits
49 changes: 49 additions & 0 deletions examples/xgboost_sklearn/train_sklearn.py
@@ -0,0 +1,49 @@
from pprint import pprint

import pandas as pd
import xgboost as xgb
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import mlflow
import mlflow.xgboost

from utils import fetch_logged_data


def main():
    # prepare example dataset
    wine = load_wine()
    X = pd.DataFrame(wine.data, columns=wine.feature_names)
    y = pd.Series(wine.target)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # enable auto logging
    # this includes xgboost.sklearn estimators
    mlflow.xgboost.autolog()
    run_id = None
    with mlflow.start_run() as run:

        regressor = xgb.XGBRegressor(n_estimators=100, reg_lambda=1, gamma=0, max_depth=3)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        mlflow.log_metrics({"mse": mse})
        run_id = run.info.run_id
        print("Logged data and model in run {}".format(run_id))
        mlflow.xgboost.log_model(regressor, artifact_path="log_model")

    # show logged data
    for key, data in fetch_logged_data(run.info.run_id).items():
        print("\n---------- logged {} ----------".format(key))
        pprint(data)

    mlflow.xgboost.save_model(regressor, "trained_model/")
    reload_model = mlflow.pyfunc.load_model("trained_model/")
    np.testing.assert_array_almost_equal(y_pred, reload_model.predict(X_test))


if __name__ == "__main__":
    main()
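
The example reloads the model from the local "trained_model/" directory, but the same model is also available from the tracking store because it was logged with artifact_path="log_model". A minimal sketch of reloading it by run ID (hypothetical helper, not part of the example):

import mlflow.pyfunc


def load_logged_model(run_id):
    # Hypothetical follow-up to the example above: reload the model that was
    # logged under artifact_path="log_model" via a runs:/ URI instead of the
    # local "trained_model/" copy.
    model_uri = "runs:/{}/log_model".format(run_id)
    return mlflow.pyfunc.load_model(model_uri)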
26 changes: 26 additions & 0 deletions examples/xgboost_sklearn/utils.py
@@ -0,0 +1,26 @@
import mlflow


def yield_artifacts(run_id, path=None):
    """Yield all artifacts in the specified run"""
    client = mlflow.tracking.MlflowClient()
    for item in client.list_artifacts(run_id, path):
        if item.is_dir:
            yield from yield_artifacts(run_id, item.path)
        else:
            yield item.path


def fetch_logged_data(run_id):
    """Fetch params, metrics, tags, and artifacts in the specified run"""
    client = mlflow.tracking.MlflowClient()
    data = client.get_run(run_id).data
    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = list(yield_artifacts(run_id))
    return {
        "params": data.params,
        "metrics": data.metrics,
        "tags": tags,
        "artifacts": artifacts,
    }
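
A quick sketch of the structure fetch_logged_data returns (the run ID below is a placeholder; the concrete params, metrics, and artifacts depend on what the run actually logged):

from utils import fetch_logged_data

# Hypothetical check of the returned shape for some existing run.
logged = fetch_logged_data("0123456789abcdef0123456789abcdef")  # placeholder run ID
assert set(logged) == {"params", "metrics", "tags", "artifacts"}
assert all(isinstance(logged[k], dict) for k in ("params", "metrics", "tags"))
assert isinstance(logged["artifacts"], list)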
107 changes: 96 additions & 11 deletions mlflow/sklearn/__init__.py
@@ -111,6 +111,19 @@ def _gen_estimators_to_patch():
    ]


+def _gen_xgboost_sklearn_estimators_to_patch():
+    import xgboost as xgb
+
+    all_classes = inspect.getmembers(xgb.sklearn, inspect.isclass)
+    base_class = xgb.sklearn.XGBModel
+    sklearn_estimators = []
+    for _, class_object in all_classes:
+        if issubclass(class_object, base_class) and class_object != base_class:
+            sklearn_estimators.append(class_object)
+
+    return sklearn_estimators
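
As a rough illustration of what this discovery yields, the snippet below lists the xgboost.sklearn wrapper classes that subclass XGBModel in the same way the helper above does; the exact set depends on the installed XGBoost version:

import inspect

import xgboost as xgb

# With a recent XGBoost this typically includes XGBClassifier, XGBRegressor,
# XGBRanker, XGBRFClassifier, and XGBRFRegressor (version dependent).
wrappers = [
    cls
    for _, cls in inspect.getmembers(xgb.sklearn, inspect.isclass)
    if issubclass(cls, xgb.sklearn.XGBModel) and cls is not xgb.sklearn.XGBModel
]
print(sorted(cls.__name__ for cls in wrappers))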


def get_default_pip_requirements(include_cloudpickle=False):
    """
    :return: A list of default pip requirements for MLflow Models produced by this flavor.
@@ -365,7 +378,7 @@ def log_model(
        # log model
        mlflow.sklearn.log_model(sk_model, "sk_models")
    """
-    return Model.log(
+    Model.log(
Contributor Author:
It seems Model.log() doesn't return any value. Maybe we can remove return.

        artifact_path=artifact_path,
        flavor=mlflow.sklearn,
        sk_model=sk_model,
@@ -1146,6 +1159,40 @@ def fetch_logged_data(run_id):
        ``True``. See the `post training metrics`_ section for more
        details.
    """
+    _autolog(
+        flavor_name=FLAVOR_NAME,
+        log_input_examples=log_input_examples,
+        log_model_signatures=log_model_signatures,
+        log_models=log_models,
+        disable=disable,
+        exclusive=exclusive,
+        disable_for_unsupported_versions=disable_for_unsupported_versions,
+        silent=silent,
+        max_tuning_runs=max_tuning_runs,
+        log_post_training_metrics=log_post_training_metrics,
+    )


+def _autolog(
+    flavor_name=FLAVOR_NAME,
Contributor Author:
Internal API for sklearn autologging. The flavor_name field allows mlflow.xgboost to specify the xgboost_sklearn flavor, preventing a flavor conflict with mlflow.sklearn.

+    log_input_examples=False,
+    log_model_signatures=True,
+    log_models=True,
+    disable=False,
+    exclusive=False,
+    disable_for_unsupported_versions=False,
+    silent=False,
+    max_tuning_runs=5,
+    log_post_training_metrics=True,
+): # pylint: disable=unused-argument
+    """
+    Internal autologging function for scikit-learn models.
+    :param flavor_name: A string value. Enable a ``mlflow.sklearn`` autologging routine
+        for a flavor. By default it enables autologging for original
+        scikit-learn models, as ``mlflow.sklearn.autolog()`` does. If
+        the argument is ``xgboost_sklearn``, autologging for XGBoost scikit-learn
+        models is enabled.
+    """
    import pandas as pd
    import sklearn
    import sklearn.metrics
@@ -1193,10 +1240,38 @@ def fit_mlflow(original, self, *args, **kwargs):
        autologging_client = MlflowAutologgingQueueingClient()
        _log_pretraining_metadata(autologging_client, self, *args, **kwargs)
        params_logging_future = autologging_client.flush(synchronous=False)
-        fit_output = original(self, *args, **kwargs)

+        if flavor_name == "xgboost_sklearn":
+            import mlflow.xgboost

+            # mlflow xgboost autologging items:
+            # (1) record eval results and (2) log feature importance plot
+            if self.importance_type is None:
+                importance_types = ["weight"]
+            else:
+                importance_types = (
+                    self.importance_type
+                    if isinstance(self.importance_type, list)
+                    else [self.importance_type]
+                )

+            (
+                fit_output,
+                early_stopping,
+                early_stopping_logging_operations,
+            ) = mlflow.xgboost._mlflow_xgboost_logging(
+                importance_types, autologging_client, _logger, original, self, *args, **kwargs,
+            )
+        else:
+            fit_output = original(self, *args, **kwargs)

        _log_posttraining_metadata(autologging_client, self, *args, **kwargs)
        autologging_client.flush(synchronous=True)
        params_logging_future.await_completion()

+        if flavor_name == "xgboost_sklearn" and early_stopping:
+            early_stopping_logging_operations.await_completion()

        return fit_output
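
The helper called above lives in mlflow/xgboost and is not shown in this hunk. As a rough sketch of its contract only (hypothetical body; the early-stopping detection is an assumption): it runs the original fit call, records eval results and a feature importance plot for each importance type, and reports whether early stopping produced extra asynchronous logging operations that the caller must await:

def _mlflow_xgboost_logging(
    importance_types, autologging_client, logger, original, self, *args, **kwargs
):
    # Hypothetical sketch of the contract used by fit_mlflow above; the real
    # implementation is in mlflow/xgboost/__init__.py and is not part of this hunk.
    fit_output = original(self, *args, **kwargs)  # run the real fit
    # ... record eval results and log one feature importance plot per importance type ...
    early_stopping = "early_stopping_rounds" in kwargs  # assumption: how early stopping is detected
    early_stopping_logging_operations = None  # async logging futures when early stopping is used
    return fit_output, early_stopping, early_stopping_logging_operations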

    def _log_pretraining_metadata(
@@ -1282,7 +1357,12 @@ def get_input_example():

        def _log_model_with_except_handling(*args, **kwargs):
            try:
-                return log_model(*args, **kwargs)
+                if flavor_name == "xgboost_sklearn":
+                    import mlflow.xgboost

+                    return mlflow.xgboost.log_model(*args, **kwargs)
+                else:
+                    return log_model(*args, **kwargs)
            except _SklearnCustomModelPicklingError as e:
                _logger.warning(str(e))

@@ -1329,7 +1409,7 @@ def _log_model_with_except_handling(*args, **kwargs):
            # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
            # information is consistent with the parent run
            child_tags = context_registry.resolve_tags()
-            child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
+            child_tags.update({MLFLOW_AUTOLOGGING: flavor_name})
            _create_child_runs_for_parameter_search(
                autologging_client=autologging_client,
                cv_estimator=estimator,
@@ -1536,40 +1616,45 @@ def out(*args, **kwargs):

    _apply_sklearn_descriptor_unbound_method_call_fix()

-    for class_def in _gen_estimators_to_patch():
+    if flavor_name == "xgboost_sklearn":
+        estimators_to_patch = _gen_xgboost_sklearn_estimators_to_patch()
+    else:
+        estimators_to_patch = _gen_estimators_to_patch()

+    for class_def in estimators_to_patch:
        # Patch fitting methods
        for func_name in ["fit", "fit_transform", "fit_predict"]:
            _patch_estimator_method_if_available(
-                FLAVOR_NAME, class_def, func_name, patched_fit, manage_run=True,
+                flavor_name, class_def, func_name, patched_fit, manage_run=True,
            )

        # Patch inference methods
        for func_name in ["predict", "predict_proba", "transform", "predict_log_proba"]:
            _patch_estimator_method_if_available(
-                FLAVOR_NAME, class_def, func_name, patched_predict, manage_run=False,
+                flavor_name, class_def, func_name, patched_predict, manage_run=False,
            )

        # Patch scoring methods
        _patch_estimator_method_if_available(
-            FLAVOR_NAME, class_def, "score", patched_model_score, manage_run=False,
+            flavor_name, class_def, "score", patched_model_score, manage_run=False,
        )

    if log_post_training_metrics:
        for metric_name in _get_metric_name_list():
            safe_patch(
-                FLAVOR_NAME, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
+                flavor_name, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
            )

        for scorer in sklearn.metrics.SCORERS.values():
-            safe_patch(FLAVOR_NAME, scorer, "_score_func", patched_metric_api, manage_run=False)
+            safe_patch(flavor_name, scorer, "_score_func", patched_metric_api, manage_run=False)

    def patched_fn_with_autolog_disabled(original, *args, **kwargs):
        with disable_autologging():
            return original(*args, **kwargs)

    for disable_autolog_func_name in _apis_autologging_disabled:
        safe_patch(
-            FLAVOR_NAME,
+            flavor_name,
            sklearn.model_selection,
            disable_autolog_func_name,
            patched_fn_with_autolog_disabled,