Skip to content

Commit

Permalink
Autologging functionality for scikit-learn integration with XGBoost …
Browse files Browse the repository at this point in the history
…(Part 2) (#5078)

* new commit, resolve conflict

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* add example

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* fix lint

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* fix build_doc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* Update mlflow/sklearn/__init__.py

remove additional lines

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* remove extra lines

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review > TODO:(1)doc(2)test

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review + add tests > TODO:doc,README

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review + complete doc

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* fix lint

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* update examples + fix example tests

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review

Signed-off-by: Junwen Yao <jwyiao@gmail.com>

* address review: update example

Signed-off-by: Junwen Yao <jwyiao@gmail.com>
  • Loading branch information
jwyyy committed Nov 29, 2021
1 parent af8460f commit 5381d68
Show file tree
Hide file tree
Showing 16 changed files with 292 additions and 51 deletions.
26 changes: 2 additions & 24 deletions examples/xgboost/README.md
@@ -1,25 +1,3 @@
# XGBoost Example
# Examples for XGBoost Autologging

This example trains an XGBoost classifier with the iris dataset and logs hyperparameters, metrics, and trained model.

## Running the code

```
python train.py --learning-rate 0.2 --colsample-bytree 0.8 --subsample 0.9
```
You can try experimenting with different parameter values like:
```
python train.py --learning-rate 0.4 --colsample-bytree 0.7 --subsample 0.8
```

Then you can open the MLflow UI to track the experiments and compare your runs via:
```
mlflow ui
```


## Running the code as a project

```
mlflow run . -P learning_rate=0.2 -P colsample_bytree=0.8 -P subsample=0.9
```
Two examples are provided to demonstrate XGBoost autologging. The `xgboost_native` folder contains an example that logs a Booster model trained by `xgboost.train()`. The `xgboost_sklearn` folder contains an example showing how autologging works for XGBoost scikit-learn models. Autologging is turned on the same way in both cases: a single call to `mlflow.xgboost.autolog()` works for all XGBoost models.
File renamed without changes.
25 changes: 25 additions & 0 deletions examples/xgboost/xgboost_native/README.md
@@ -0,0 +1,25 @@
# XGBoost Example

This example trains an XGBoost classifier with the iris dataset and logs hyperparameters, metrics, and trained model.

## Running the code

```
python train.py --learning-rate 0.2 --colsample-bytree 0.8 --subsample 0.9
```
You can try experimenting with different parameter values like:
```
python train.py --learning-rate 0.4 --colsample-bytree 0.7 --subsample 0.8
```

Then you can open the MLflow UI to track the experiments and compare your runs via:
```
mlflow ui
```


## Running the code as a project

```
mlflow run . -P learning_rate=0.2 -P colsample_bytree=0.8 -P subsample=0.9
```
File renamed without changes.
File renamed without changes.
7 changes: 7 additions & 0 deletions examples/xgboost/xgboost_sklearn/MLproject
@@ -0,0 +1,7 @@
name: xgboost_sklearn_example

conda_env: conda.yaml

entry_points:
main:
command: "python train.py"
12 changes: 12 additions & 0 deletions examples/xgboost/xgboost_sklearn/README.md
@@ -0,0 +1,12 @@
# XGBoost Scikit-learn Model Example

This example trains an [`xgboost.XGBRegressor`](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) on the diabetes dataset and logs the hyperparameters, metrics, and trained model.

Like the other XGBoost example, we enable autologging for XGBoost scikit-learn models via `mlflow.xgboost.autolog()`. Model saving and loading also support XGBoost scikit-learn models.

You can run this example using the following command:
```
python train.py
```


11 changes: 11 additions & 0 deletions examples/xgboost/xgboost_sklearn/conda.yaml
@@ -0,0 +1,11 @@
channels:
- conda-forge
dependencies:
- python=3.8.12
- pip
- pip:
- mlflow
- pandas==1.3.4
- scikit-learn==0.24.2
- xgboost==1.5.0
name: mlflow-env
39 changes: 39 additions & 0 deletions examples/xgboost/xgboost_sklearn/train.py
@@ -0,0 +1,39 @@
from pprint import pprint

import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import mlflow
import mlflow.xgboost

from utils import fetch_logged_data


def main():
# Example entry point: fit an XGBRegressor on the diabetes dataset with
# XGBoost autologging enabled, then print the data recorded for the run.
# prepare example dataset
X, y = load_diabetes(return_X_y=True, as_frame=True)
# split into train and test subsets using train_test_split's defaults
X_train, X_test, y_train, y_test = train_test_split(X, y)

# enable auto logging
# this includes xgboost.sklearn estimators
mlflow.xgboost.autolog()

with mlflow.start_run() as run:

regressor = xgb.XGBRegressor(n_estimators=20, reg_lambda=1, gamma=0, max_depth=3)
# the held-out split is passed as the evaluation set during fitting
regressor.fit(X_train, y_train, eval_set=[(X_test, y_test)])
y_pred = regressor.predict(X_test)
# NOTE(review): `mse` is never read afterwards; presumably the call is kept so
# that autologging can record the metric -- confirm before removing.
mse = mean_squared_error(y_test, y_pred)
run_id = run.info.run_id
print("Logged data and model in run {}".format(run_id))

# show logged data
# NOTE(review): indentation is not visible here; confirm whether this loop is
# meant to run inside the `with` block or after the run has ended.
for key, data in fetch_logged_data(run.info.run_id).items():
print("\n---------- logged {} ----------".format(key))
pprint(data)


# Run the example only when this file is executed as a script.
if __name__ == "__main__":
main()
26 changes: 26 additions & 0 deletions examples/xgboost/xgboost_sklearn/utils.py
@@ -0,0 +1,26 @@
import mlflow


def yield_artifacts(run_id, path=None):
    """
    Recursively walk the artifacts of a run and yield the path of every file.

    :param run_id: ID of the run whose artifacts are listed.
    :param path: Artifact path to list; ``None`` is passed through to
                 ``MlflowClient.list_artifacts`` unchanged.
    :return: A generator of artifact paths for all non-directory entries.
    """
    tracking_client = mlflow.tracking.MlflowClient()
    for entry in tracking_client.list_artifacts(run_id, path):
        if not entry.is_dir:
            yield entry.path
            continue
        # Directory entry: descend into it and yield the files it contains.
        yield from yield_artifacts(run_id, entry.path)


def fetch_logged_data(run_id):
    """
    Collect the params, metrics, tags, and artifact paths recorded for a run.

    :param run_id: ID of the run to inspect.
    :return: A dictionary with the keys ``params``, ``metrics``, ``tags`` and
             ``artifacts``. Tags whose key starts with ``mlflow.`` are left out.
    """
    run_data = mlflow.tracking.MlflowClient().get_run(run_id).data
    # Exclude system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for tag_key, tag_value in run_data.tags.items():
        if tag_key.startswith("mlflow."):
            continue
        user_tags[tag_key] = tag_value
    artifact_paths = list(yield_artifacts(run_id))
    return dict(
        params=run_data.params,
        metrics=run_data.metrics,
        tags=user_tags,
        artifacts=artifact_paths,
    )
91 changes: 79 additions & 12 deletions mlflow/sklearn/__init__.py
Expand Up @@ -371,7 +371,7 @@ def log_model(
# log model
mlflow.sklearn.log_model(sk_model, "sk_models")
"""
return Model.log(
Model.log(
artifact_path=artifact_path,
flavor=mlflow.sklearn,
sk_model=sk_model,
Expand Down Expand Up @@ -1152,6 +1152,40 @@ def fetch_logged_data(run_id):
``True``. See the `post training metrics`_ section for more
details.
"""
_autolog(
flavor_name=FLAVOR_NAME,
log_input_examples=log_input_examples,
log_model_signatures=log_model_signatures,
log_models=log_models,
disable=disable,
exclusive=exclusive,
disable_for_unsupported_versions=disable_for_unsupported_versions,
silent=silent,
max_tuning_runs=max_tuning_runs,
log_post_training_metrics=log_post_training_metrics,
)


def _autolog(
flavor_name=FLAVOR_NAME,
log_input_examples=False,
log_model_signatures=True,
log_models=True,
disable=False,
exclusive=False,
disable_for_unsupported_versions=False,
silent=False,
max_tuning_runs=5,
log_post_training_metrics=True,
): # pylint: disable=unused-argument
"""
Internal autologging function for scikit-learn models.
:param flavor_name: A string value. Enable a ``mlflow.sklearn`` autologging routine
for a flavor. By default it enables autologging for original
scikit-learn models, as ``mlflow.sklearn.autolog()`` does. If
the argument is `xgboost`, autologging for XGBoost scikit-learn
models is enabled.
"""
import pandas as pd
import sklearn
import sklearn.metrics
Expand All @@ -1162,6 +1196,7 @@ def fetch_logged_data(run_id):
_MIN_SKLEARN_VERSION,
_TRAINING_PREFIX,
_is_supported_version,
_gen_xgboost_sklearn_estimators_to_patch,
_get_args_for_metrics,
_log_estimator_content,
_all_estimators,
Expand Down Expand Up @@ -1190,6 +1225,31 @@ def fetch_logged_data(run_id):
stacklevel=2,
)

def fit_mlflow_xgboost(original, self, *args, **kwargs):
    """
    Autologging function for XGBoost scikit-learn models.

    Runs the original fitting method and, when model logging is enabled,
    logs the fitted estimator with ``mlflow.xgboost.log_model``.

    :param original: The unpatched fitting method.
    :param self: The estimator instance being fitted.
    :return: Whatever ``original`` returned.
    """
    # parameter, metric, and non-model artifact logging
    # are done in `train()` in `mlflow.xgboost.autolog()`
    fit_output = original(self, *args, **kwargs)
    # the first value returned by the helper is used as the training input
    training_input = _get_args_for_metrics(self.fit, args, kwargs)[0]
    if not log_models:
        return fit_output

    def _take_input_example():
        # only the leading rows are used as the input example
        return training_input[:INPUT_EXAMPLE_SAMPLE_ROWS]

    def _infer_model_signature(input_example):
        predictions = self.predict(input_example)
        return infer_signature(input_example, predictions)

    # log models after training
    input_example, signature = resolve_input_example_and_signature(
        _take_input_example,
        _infer_model_signature,
        log_input_examples,
        log_model_signatures,
        _logger,
    )
    mlflow.xgboost.log_model(
        self,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )
    return fit_output

def fit_mlflow(original, self, *args, **kwargs):
"""
Autologging function that performs model training by executing the training method
Expand Down Expand Up @@ -1340,7 +1400,7 @@ def _log_model_with_except_handling(*args, **kwargs):
# Fetch environment-specific tags (e.g., user and source) to ensure that lineage
# information is consistent with the parent run
child_tags = context_registry.resolve_tags()
child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
child_tags.update({MLFLOW_AUTOLOGGING: flavor_name})
_create_child_runs_for_parameter_search(
autologging_client=autologging_client,
cv_estimator=estimator,
Expand Down Expand Up @@ -1369,7 +1429,7 @@ def _log_model_with_except_handling(*args, **kwargs):
)
_logger.warning(msg)

def patched_fit(original, self, *args, **kwargs):
def patched_fit(fit_impl, original, self, *args, **kwargs):
"""
Autologging patch function to be applied to a sklearn model class that defines a `fit`
method and inherits from `BaseEstimator` (thereby defining the `get_params()` method)
Expand All @@ -1390,7 +1450,7 @@ def patched_fit(original, self, *args, **kwargs):
# In `fit_mlflow` call, it will also call metric API for computing training metrics
# so we need temporarily disable the post_training_metrics patching.
with _AUTOLOGGING_METRICS_MANAGER.disable_log_post_training_metrics():
result = fit_mlflow(original, self, *args, **kwargs)
result = fit_impl(original, self, *args, **kwargs)
if should_log_post_training_metrics:
_AUTOLOGGING_METRICS_MANAGER.register_model(
self, mlflow.active_run().info.run_id
Expand Down Expand Up @@ -1547,21 +1607,28 @@ def out(*args, **kwargs):

_apply_sklearn_descriptor_unbound_method_call_fix()

for class_def in _gen_estimators_to_patch():
if flavor_name == mlflow.xgboost.FLAVOR_NAME:
estimators_to_patch = _gen_xgboost_sklearn_estimators_to_patch()
patched_fit_impl = fit_mlflow_xgboost
else:
estimators_to_patch = _gen_estimators_to_patch()
patched_fit_impl = fit_mlflow

for class_def in estimators_to_patch:
# Patch fitting methods
for func_name in ["fit", "fit_transform", "fit_predict"]:
_patch_estimator_method_if_available(
FLAVOR_NAME,
flavor_name,
class_def,
func_name,
patched_fit,
functools.partial(patched_fit, patched_fit_impl),
manage_run=True,
)

# Patch inference methods
for func_name in ["predict", "predict_proba", "transform", "predict_log_proba"]:
_patch_estimator_method_if_available(
FLAVOR_NAME,
flavor_name,
class_def,
func_name,
patched_predict,
Expand All @@ -1570,7 +1637,7 @@ def out(*args, **kwargs):

# Patch scoring methods
_patch_estimator_method_if_available(
FLAVOR_NAME,
flavor_name,
class_def,
"score",
patched_model_score,
Expand All @@ -1580,19 +1647,19 @@ def out(*args, **kwargs):
if log_post_training_metrics:
for metric_name in _get_metric_name_list():
safe_patch(
FLAVOR_NAME, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
flavor_name, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
)

for scorer in sklearn.metrics.SCORERS.values():
safe_patch(FLAVOR_NAME, scorer, "_score_func", patched_metric_api, manage_run=False)
safe_patch(flavor_name, scorer, "_score_func", patched_metric_api, manage_run=False)

def patched_fn_with_autolog_disabled(original, *args, **kwargs):
with disable_autologging():
return original(*args, **kwargs)

for disable_autolog_func_name in _apis_autologging_disabled:
safe_patch(
FLAVOR_NAME,
flavor_name,
sklearn.model_selection,
disable_autolog_func_name,
patched_fn_with_autolog_disabled,
Expand Down
13 changes: 13 additions & 0 deletions mlflow/sklearn/utils.py
Expand Up @@ -34,6 +34,19 @@
_SklearnMetric = collections.namedtuple("_SklearnMetric", ["name", "function", "arguments"])


def _gen_xgboost_sklearn_estimators_to_patch():
    """
    Collect the XGBoost scikit-learn estimator classes.

    :return: A list of the classes found in ``xgboost.sklearn`` that subclass
             ``xgboost.sklearn.XGBModel``, excluding ``XGBModel`` itself.
    """
    import xgboost as xgb

    xgb_model_cls = xgb.sklearn.XGBModel
    return [
        candidate
        for _, candidate in inspect.getmembers(xgb.sklearn, inspect.isclass)
        if issubclass(candidate, xgb_model_cls) and candidate != xgb_model_cls
    ]


def _get_estimator_info_tags(estimator):
"""
:return: A dictionary of MLflow run tag keys and values
Expand Down

0 comments on commit 5381d68

Please sign in to comment.