Autologging functionality for scikit-learn integration with XGBoost …

…(Part 2) (#5078) * new commit, resolve conflict Signed-off-by: Junwen Yao <jwyiao@gmail.com> * add example Signed-off-by: Junwen Yao <jwyiao@gmail.com> * fix lint Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review Signed-off-by: Junwen Yao <jwyiao@gmail.com> * fix build_doc Signed-off-by: Junwen Yao <jwyiao@gmail.com> * Update mlflow/sklearn/__init__.py remove additional lines Signed-off-by: Junwen Yao <jwyiao@gmail.com> * remove extra lines Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review > TODO:(1)doc(2)test Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review + add tests > TODO:doc,README Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review + complete doc Signed-off-by: Junwen Yao <jwyiao@gmail.com> * fix lint Signed-off-by: Junwen Yao <jwyiao@gmail.com> * update examples + fix example tests Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review Signed-off-by: Junwen Yao <jwyiao@gmail.com> * address review: update example Signed-off-by: Junwen Yao <jwyiao@gmail.com>
mlflow · Nov 29, 2021 · 5381d68 · 5381d68
1 parent af8460f
commit 5381d68
Show file tree

Hide file tree

Showing 16 changed files with 292 additions and 51 deletions.
diff --git a/examples/xgboost/README.md b/examples/xgboost/README.md
@@ -1,25 +1,3 @@
-# XGBoost Example
+# Examples for XGBoost Autologging
 
-This example trains an XGBoost classifier with the iris dataset and logs hyperparameters, metrics, and trained model.
-
-## Running the code
-
-```
-python train.py --learning-rate 0.2 --colsample-bytree 0.8 --subsample 0.9
-```
-You can try experimenting with different parameter values like:
-```
-python train.py --learning-rate 0.4 --colsample-bytree 0.7 --subsample 0.8
-```
-
-Then you can open the MLflow UI to track the experiments and compare your runs via:
-```
-mlflow ui
-```
-
-
-## Running the code as a project
-
-```
-mlflow run . -P learning_rate=0.2 -P colsample_bytree=0.8 -P subsample=0.9
-```
+Two examples are provided to demonstrate XGBoost autologging functionality. The `xgboost_native` folder contains an example that logs a Booster model trained by `xgboost.train()`. The `xgboost_sklearn` folder contains an example showing how autologging works for XGBoost scikit-learn models. Autologging is enabled the same way in both cases: a single call to `mlflow.xgboost.autolog()` covers all XGBoost models.
diff --git a/examples/xgboost/MLproject → examples/xgboost/xgboost_native/MLproject b/examples/xgboost/MLproject → examples/xgboost/xgboost_native/MLproject
diff --git a/examples/xgboost/xgboost_native/README.md b/examples/xgboost/xgboost_native/README.md
@@ -0,0 +1,25 @@
+# XGBoost Example
+
+This example trains an XGBoost classifier with the iris dataset and logs hyperparameters, metrics, and trained model.
+
+## Running the code
+
+```
+python train.py --learning-rate 0.2 --colsample-bytree 0.8 --subsample 0.9
+```
+You can try experimenting with different parameter values like:
+```
+python train.py --learning-rate 0.4 --colsample-bytree 0.7 --subsample 0.8
+```
+
+Then you can open the MLflow UI to track the experiments and compare your runs via:
+```
+mlflow ui
+```
+
+
+## Running the code as a project
+
+```
+mlflow run . -P learning_rate=0.2 -P colsample_bytree=0.8 -P subsample=0.9
+```
diff --git a/examples/xgboost/conda.yaml → examples/xgboost/xgboost_native/conda.yaml b/examples/xgboost/conda.yaml → examples/xgboost/xgboost_native/conda.yaml
diff --git a/examples/xgboost/train.py → examples/xgboost/xgboost_native/train.py b/examples/xgboost/train.py → examples/xgboost/xgboost_native/train.py
diff --git a/examples/xgboost/xgboost_sklearn/MLproject b/examples/xgboost/xgboost_sklearn/MLproject
@@ -0,0 +1,7 @@
+name: xgboost_sklearn_example
+
+conda_env: conda.yaml
+
+entry_points:
+  main:
+    command: "python train.py"
diff --git a/examples/xgboost/xgboost_sklearn/README.md b/examples/xgboost/xgboost_sklearn/README.md
@@ -0,0 +1,12 @@
+# XGBoost Scikit-learn Model Example
+
+This example trains an [`xgboost.XGBRegressor`](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) with the diabetes dataset and logs hyperparameters, metrics, and the trained model.
+
+Like the other XGBoost example, we enable autologging for XGBoost scikit-learn models via `mlflow.xgboost.autolog()`. Saving / loading models also supports XGBoost scikit-learn models.
+
+You can run this example using the following command:
+```
+python train.py
+```
+
+
diff --git a/examples/xgboost/xgboost_sklearn/conda.yaml b/examples/xgboost/xgboost_sklearn/conda.yaml
@@ -0,0 +1,11 @@
+channels:
+- conda-forge
+dependencies:
+- python=3.8.12
+- pip
+- pip:
+  - mlflow
+  - pandas==1.3.4
+  - scikit-learn==0.24.2
+  - xgboost==1.5.0
+name: mlflow-env
diff --git a/examples/xgboost/xgboost_sklearn/train.py b/examples/xgboost/xgboost_sklearn/train.py
@@ -0,0 +1,39 @@
+from pprint import pprint
+
+import xgboost as xgb
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+import mlflow
+import mlflow.xgboost
+
+from utils import fetch_logged_data
+
+
+def main():
+    """Train an XGBRegressor with autologging enabled, then print everything the run logged."""
+    # prepare example dataset
+    X, y = load_diabetes(return_X_y=True, as_frame=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+    # enable auto logging (this also covers xgboost.sklearn estimators)
+    mlflow.xgboost.autolog()
+
+    with mlflow.start_run() as run:
+        run_id = run.info.run_id
+        regressor = xgb.XGBRegressor(n_estimators=20, reg_lambda=1, gamma=0, max_depth=3)
+        regressor.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+        y_pred = regressor.predict(X_test)
+        mse = mean_squared_error(y_test, y_pred)
+        print("Logged data and model in run {}".format(run_id))
+        print("Test MSE: {}".format(mse))
+
+    # show logged data
+    for key, data in fetch_logged_data(run_id).items():
+        print("\n---------- logged {} ----------".format(key))
+        pprint(data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/xgboost/xgboost_sklearn/utils.py b/examples/xgboost/xgboost_sklearn/utils.py
@@ -0,0 +1,26 @@
+import mlflow
+
+
+def yield_artifacts(run_id, path=None):
+    """Recursively yield the path of every non-directory artifact in the specified run"""
+    tracking_client = mlflow.tracking.MlflowClient()
+    for entry in tracking_client.list_artifacts(run_id, path):
+        if not entry.is_dir:
+            yield entry.path
+        else:
+            yield from yield_artifacts(run_id, entry.path)
+
+
+def fetch_logged_data(run_id):
+    """
+    Collect the params, metrics, non-system tags, and artifact paths of the specified run
+    """
+    run_data = mlflow.tracking.MlflowClient().get_run(run_id).data
+    logged = {"params": run_data.params, "metrics": run_data.metrics, "tags": {}}
+    # Skip system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
+    for tag_name, tag_value in run_data.tags.items():
+        if tag_name.startswith("mlflow."):
+            continue
+        logged["tags"][tag_name] = tag_value
+    logged["artifacts"] = list(yield_artifacts(run_id))
+    return logged
diff --git a/mlflow/sklearn/__init__.py b/mlflow/sklearn/__init__.py
@@ -371,7 +371,7 @@ def log_model(
         # log model
         mlflow.sklearn.log_model(sk_model, "sk_models")
     """
-    return Model.log(
+    Model.log(
         artifact_path=artifact_path,
         flavor=mlflow.sklearn,
         sk_model=sk_model,
@@ -1152,6 +1152,40 @@ def fetch_logged_data(run_id):
                                       ``True``. See the `post training metrics`_ section for more
                                       details.
     """
+    _autolog(
+        flavor_name=FLAVOR_NAME,
+        log_input_examples=log_input_examples,
+        log_model_signatures=log_model_signatures,
+        log_models=log_models,
+        disable=disable,
+        exclusive=exclusive,
+        disable_for_unsupported_versions=disable_for_unsupported_versions,
+        silent=silent,
+        max_tuning_runs=max_tuning_runs,
+        log_post_training_metrics=log_post_training_metrics,
+    )
+
+
+def _autolog(
+    flavor_name=FLAVOR_NAME,
+    log_input_examples=False,
+    log_model_signatures=True,
+    log_models=True,
+    disable=False,
+    exclusive=False,
+    disable_for_unsupported_versions=False,
+    silent=False,
+    max_tuning_runs=5,
+    log_post_training_metrics=True,
+):  # pylint: disable=unused-argument
+    """
+    Internal autologging function for scikit-learn models.
+    :param flavor_name: A string value. Enable a ``mlflow.sklearn`` autologging routine
+                        for a flavor. By default it enables autologging for original
+                        scikit-learn models, as ``mlflow.sklearn.autolog()`` does. If
+                        the argument is `xgboost`, autologging for XGBoost scikit-learn
+                        models is enabled.
+    """
     import pandas as pd
     import sklearn
     import sklearn.metrics
@@ -1162,6 +1196,7 @@ def fetch_logged_data(run_id):
         _MIN_SKLEARN_VERSION,
         _TRAINING_PREFIX,
         _is_supported_version,
+        _gen_xgboost_sklearn_estimators_to_patch,
         _get_args_for_metrics,
         _log_estimator_content,
         _all_estimators,
@@ -1190,6 +1225,31 @@ def fetch_logged_data(run_id):
             stacklevel=2,
         )
 
+    def fit_mlflow_xgboost(original, self, *args, **kwargs):
+        """
+        Fit an XGBoost scikit-learn model and, when ``log_models`` is set, log the fitted model
+        """
+        # parameter, metric, and non-model artifact logging
+        # are done in `train()` in `mlflow.xgboost.autolog()`
+        fit_output = original(self, *args, **kwargs)
+        if log_models:
+            # the training data is only needed to build the input example / signature
+            X = _get_args_for_metrics(self.fit, args, kwargs)[0]
+            input_example, signature = resolve_input_example_and_signature(
+                lambda: X[:INPUT_EXAMPLE_SAMPLE_ROWS],
+                lambda input_example: infer_signature(input_example, self.predict(input_example)),
+                log_input_examples,
+                log_model_signatures,
+                _logger,
+            )
+            mlflow.xgboost.log_model(
+                self,
+                artifact_path="model",
+                signature=signature,
+                input_example=input_example,
+            )
+        return fit_output
+
     def fit_mlflow(original, self, *args, **kwargs):
         """
         Autologging function that performs model training by executing the training method
@@ -1340,7 +1400,7 @@ def _log_model_with_except_handling(*args, **kwargs):
                     # Fetch environment-specific tags (e.g., user and source) to ensure that lineage
                     # information is consistent with the parent run
                     child_tags = context_registry.resolve_tags()
-                    child_tags.update({MLFLOW_AUTOLOGGING: FLAVOR_NAME})
+                    child_tags.update({MLFLOW_AUTOLOGGING: flavor_name})
                     _create_child_runs_for_parameter_search(
                         autologging_client=autologging_client,
                         cv_estimator=estimator,
@@ -1369,7 +1429,7 @@ def _log_model_with_except_handling(*args, **kwargs):
                     )
                     _logger.warning(msg)
 
-    def patched_fit(original, self, *args, **kwargs):
+    def patched_fit(fit_impl, original, self, *args, **kwargs):
         """
         Autologging patch function to be applied to a sklearn model class that defines a `fit`
         method and inherits from `BaseEstimator` (thereby defining the `get_params()` method)
@@ -1390,7 +1450,7 @@ def patched_fit(original, self, *args, **kwargs):
                 # In `fit_mlflow` call, it will also call metric API for computing training metrics
                 # so we need temporarily disable the post_training_metrics patching.
                 with _AUTOLOGGING_METRICS_MANAGER.disable_log_post_training_metrics():
-                    result = fit_mlflow(original, self, *args, **kwargs)
+                    result = fit_impl(original, self, *args, **kwargs)
                 if should_log_post_training_metrics:
                     _AUTOLOGGING_METRICS_MANAGER.register_model(
                         self, mlflow.active_run().info.run_id
@@ -1547,21 +1607,28 @@ def out(*args, **kwargs):
 
     _apply_sklearn_descriptor_unbound_method_call_fix()
 
-    for class_def in _gen_estimators_to_patch():
+    if flavor_name == mlflow.xgboost.FLAVOR_NAME:
+        estimators_to_patch = _gen_xgboost_sklearn_estimators_to_patch()
+        patched_fit_impl = fit_mlflow_xgboost
+    else:
+        estimators_to_patch = _gen_estimators_to_patch()
+        patched_fit_impl = fit_mlflow
+
+    for class_def in estimators_to_patch:
         # Patch fitting methods
         for func_name in ["fit", "fit_transform", "fit_predict"]:
             _patch_estimator_method_if_available(
-                FLAVOR_NAME,
+                flavor_name,
                 class_def,
                 func_name,
-                patched_fit,
+                functools.partial(patched_fit, patched_fit_impl),
                 manage_run=True,
             )
 
         # Patch inference methods
         for func_name in ["predict", "predict_proba", "transform", "predict_log_proba"]:
             _patch_estimator_method_if_available(
-                FLAVOR_NAME,
+                flavor_name,
                 class_def,
                 func_name,
                 patched_predict,
@@ -1570,7 +1637,7 @@ def out(*args, **kwargs):
 
         # Patch scoring methods
         _patch_estimator_method_if_available(
-            FLAVOR_NAME,
+            flavor_name,
             class_def,
             "score",
             patched_model_score,
@@ -1580,19 +1647,19 @@ def out(*args, **kwargs):
     if log_post_training_metrics:
         for metric_name in _get_metric_name_list():
             safe_patch(
-                FLAVOR_NAME, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
+                flavor_name, sklearn.metrics, metric_name, patched_metric_api, manage_run=False
             )
 
         for scorer in sklearn.metrics.SCORERS.values():
-            safe_patch(FLAVOR_NAME, scorer, "_score_func", patched_metric_api, manage_run=False)
+            safe_patch(flavor_name, scorer, "_score_func", patched_metric_api, manage_run=False)
 
     def patched_fn_with_autolog_disabled(original, *args, **kwargs):
         with disable_autologging():
             return original(*args, **kwargs)
 
     for disable_autolog_func_name in _apis_autologging_disabled:
         safe_patch(
-            FLAVOR_NAME,
+            flavor_name,
             sklearn.model_selection,
             disable_autolog_func_name,
             patched_fn_with_autolog_disabled,

diff --git a/mlflow/sklearn/utils.py b/mlflow/sklearn/utils.py
@@ -34,6 +34,19 @@
 _SklearnMetric = collections.namedtuple("_SklearnMetric", ["name", "function", "arguments"])
 
 
+def _gen_xgboost_sklearn_estimators_to_patch():
+    """Return the classes found in ``xgboost.sklearn`` that are strict subclasses of ``XGBModel``."""
+    import xgboost as xgb
+
+    xgb_model_cls = xgb.sklearn.XGBModel
+    # Keep strict subclasses only, i.e. leave out the base class itself.
+    return [
+        member
+        for _, member in inspect.getmembers(xgb.sklearn, inspect.isclass)
+        if issubclass(member, xgb_model_cls) and member != xgb_model_cls
+    ]
+
+
 def _get_estimator_info_tags(estimator):
     """
     :return: A dictionary of MLflow run tag keys and values