From 00a9b4ead5208e1eb42ea07caaca89ee0a6c1ddf Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 26 Sep 2022 11:38:54 +0800 Subject: [PATCH 1/3] [pyspark][doc] add more doc for pyspark --- doc/tutorials/spark_estimator.rst | 162 +++++++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 13 deletions(-) diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst index 963a79377a6d..95849f37319e 100644 --- a/doc/tutorials/spark_estimator.rst +++ b/doc/tutorials/spark_estimator.rst @@ -1,12 +1,19 @@ -############################### -Using XGBoost PySpark Estimator -############################### +################################ +Distributed XGBoost with PySpark +################################ Starting from version 2.0, xgboost supports pyspark estimator APIs. The feature is still experimental and not yet ready for production use. -***************** +.. contents:: + :backlinks: none + :local: + +************************* +XGBoost PySpark Estimator +************************* + SparkXGBRegressor -***************** +================= SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost classification algorithm based on XGBoost python library, and it can be used in PySpark Pipeline @@ -17,10 +24,14 @@ We can create a `SparkXGBRegressor` estimator like: .. code-block:: python from xgboost.spark import SparkXGBRegressor - spark_reg_estimator = SparkXGBRegressor(num_workers=2, max_depth=5) + spark_reg_estimator = SparkXGBRegressor( + features_col="features", + label_col="label", + num_workers=2, + ) -The above snippet create an spark estimator which can fit on a spark dataset, +The above snippet creates a spark estimator which can fit on a spark dataset, and return a spark model that can transform a spark dataset and generate dataset with prediction column. We can set almost all of xgboost sklearn estimator parameters as `SparkXGBRegressor` parameters, but some parameter such as `nthread` is forbidden @@ -30,8 +41,9 @@ such as `weight_col`, `validation_indicator_col`, `use_gpu`, for details please The following code snippet shows how to train a spark xgboost regressor model, first we need to prepare a training dataset as a spark dataframe contains -"features" and "label" column, the "features" column must be `pyspark.ml.linalg.Vector` -type or spark array type. +"label" column and "features" column(s), the "features" column(s) must be `pyspark.ml.linalg.Vector` +type or spark array type or a list of feature column names. + .. code-block:: python @@ -51,11 +63,8 @@ type or spark array type. The above snippet code returns a `transformed_test_spark_dataframe` that contains the input dataset columns and an appended column "prediction" representing the prediction results. - -****************** SparkXGBClassifier -****************** - +================== `SparkXGBClassifier` estimator has similar API with `SparkXGBRegressor`, but it has some pyspark classifier specific params, e.g. `raw_prediction_col` and `probability_col` parameters. @@ -64,3 +73,130 @@ generate result dataset with 3 new columns: - "prediction": represents the predicted label. - "raw_prediction": represents the output margin values. - "probability": represents the prediction probability on each label. + + +*************************** +XGBoost PySpark GPU support +*************************** + +XGBoost PySpark supports GPU training and prediction. To enable GPU support, you first need +to install the xgboost and cudf packages. Then you can set `use_gpu` parameter to `True`. + +Below tutorial will show you how to train a model with XGBoost PySpark GPU on Spark +standalone cluster. + + +Write your PySpark application +============================== + +.. code-block:: python + + from xgboost.spark import SparkXGBRegressor + spark = SparkSession.builder.getOrCreate() + + # read data into spark dataframe + train_data_path = "xxxx/train" + train_df = spark.read.parquet(data_path) + + test_data_path = "xxxx/test" + test_df = spark.read.parquet(test_data_path) + + # assume the label column is named "class" + label_name = "class" + + # get a list with feature column names + feature_names = [x.name for x in train_df.schema if x.name != label] + + # create a xgboost pyspark regressor estimator and set use_gpu=True + regressor = SparkXGBRegressor( + features_col=feature_names, + label_col=label_name, + num_workers=2, + use_gpu=True, + ) + + # train and return the model + model = regressor.fit(train_df) + + # predict on test data + predict_df = model.transform(test_df) + predict_df.show() + +Prepare the necessary packages +============================== + +We recommend using Conda or Virtualenv to manage python dependencies +in PySpark. Please refer to +`How to Manage Python Dependencies in PySpark `_. + +.. code-block:: bash + + conda create -y -n xgboost-env -c conda-forge conda-pack python=3.9 + conda activate xgboost-env + pip install xgboost + pip install cudf + conda pack -f -o xgboost-env.tar.gz + + +Submit the PySpark application +============================== + +Assuming you have configured your Spark cluster with GPU support, if not yet, please +refer to `spark standalone configuration with GPU support `_. + +.. code-block:: bash + + export PYSPARK_DRIVER_PYTHON=python + export PYSPARK_PYTHON=./environment/bin/python + + spark-submit \ + --master spark://:7077 \ + --conf spark.executor.resource.gpu.amount=1 \ + --conf spark.task.resource.gpu.amount=1 \ + --archives xgboost-env.tar.gz#environment \ + xgboost_app.py + + +Model Persistence +================= + +.. code-block:: python + + # save the model + model.save("/tmp/xgboost-pyspark-model") + + # load the model + model2 = SparkXGBRankerModel.load("/tmp/xgboost-pyspark-model") + +The above code snippet shows how to save/load xgboost pyspark model. And you can also +load the model with xgboost python package directly without involving spark. + +.. code-block:: python + + import xgboost as xgb + bst = xgb.Booster() + bst.load_model("/tmp/xgboost-pyspark-model/model/part-00000") + + +Accelerate the whole pipeline of xgboost pyspark +================================================ + +With `RAPIDS Accelerator for Apache Spark `_, +you can accelerate the whole pipeline (ETL, Train, Transform) for xgboost pyspark +without any code change by leveraging GPU. + +You only need to add some configurations to enable RAPIDS plugin when submitting. + +.. code-block:: bash + + export PYSPARK_DRIVER_PYTHON=python + export PYSPARK_PYTHON=./environment/bin/python + + spark-submit \ + --master spark://:7077 \ + --conf spark.executor.resource.gpu.amount=1 \ + --conf spark.task.resource.gpu.amount=1 \ + --packages com.nvidia:rapids-4-spark_2.12:22.08.0 \ + --conf spark.plugins=com.nvidia.spark.SQLPlugin \ + --archives xgboost-env.tar.gz#environment \ + xgboost_app.py From 668e4924035aeec67538b6dacdbf72061b2fa256 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 27 Sep 2022 08:17:31 +0800 Subject: [PATCH 2/3] resolve comments --- doc/tutorials/spark_estimator.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst index bf19997ffc93..65fd220eb875 100644 --- a/doc/tutorials/spark_estimator.rst +++ b/doc/tutorials/spark_estimator.rst @@ -164,12 +164,11 @@ Model Persistence # save the model model.save("/tmp/xgboost-pyspark-model") - # load the model model2 = SparkXGBRankerModel.load("/tmp/xgboost-pyspark-model") -The above code snippet shows how to save/load xgboost pyspark model. And you can also -load the model with xgboost python package directly without involving spark. +The above code snippet shows how to save/load xgboost pyspark model. You can also +load the model with xgboost python package directly without involving Spark. .. code-block:: python @@ -177,6 +176,12 @@ load the model with xgboost python package directly without involving spark. bst = xgb.Booster() bst.load_model("/tmp/xgboost-pyspark-model/model/part-00000") +If you don't want to save the model to disk. You still can get the Booster attribute +from the fitted model. + +.. code-block:: python + + bst: xgb.Booster = model.get_booster() Accelerate the whole pipeline of xgboost pyspark ================================================ @@ -185,7 +190,7 @@ With `RAPIDS Accelerator for Apache Spark Date: Wed, 28 Sep 2022 19:49:50 +0800 Subject: [PATCH 3/3] Description. --- doc/tutorials/spark_estimator.rst | 33 +++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst index ae85fbb2558c..b95755acefa0 100644 --- a/doc/tutorials/spark_estimator.rst +++ b/doc/tutorials/spark_estimator.rst @@ -1,8 +1,12 @@ ################################ Distributed XGBoost with PySpark ################################ + Starting from version 1.7.0, xgboost supports pyspark estimator APIs. -The feature is still experimental and not yet ready for production use. + +.. note:: + + The feature is still experimental and not yet ready for production use. .. contents:: :backlinks: none @@ -160,28 +164,38 @@ refer to `spark standalone configuration with GPU support