From c3e74ae040ae65b8e4f92996d783e2593d134905 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 1 Jun 2022 15:08:54 -0700
Subject: [PATCH] Support GPU training in the NVFlare demo

---
 demo/nvflare/README.md                     | 11 +++++++++++
 demo/nvflare/config/config_fed_client.json |  3 ++-
 demo/nvflare/custom/trainer.py             |  7 ++++++-
 plugin/federated/README.md                 |  8 +++++++-
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/demo/nvflare/README.md b/demo/nvflare/README.md
index 226a90c38765..ad5ed9a60cc7 100644
--- a/demo/nvflare/README.md
+++ b/demo/nvflare/README.md
@@ -3,6 +3,8 @@
 This directory contains a demo of Federated Learning using
 [NVFlare](https://nvidia.github.io/NVFlare/).
 
+## Training with CPU only
+
 To run the demo, first build XGBoost with the federated learning plugin
 enabled (see the [README](../../plugin/federated/README.md)).
 
@@ -53,3 +55,12 @@ Finally, shutdown everything from the admin CLI:
 shutdown client
 shutdown server
 ```
+
+## Training with GPUs
+
+To run the demo with GPUs, make sure your machine has at least 2 GPUs.
+Build XGBoost with the federated learning plugin and CUDA enabled, but with NCCL
+turned off (see the [README](../../plugin/federated/README.md)).
+
+Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat
+the steps above.
diff --git a/demo/nvflare/config/config_fed_client.json b/demo/nvflare/config/config_fed_client.json
index 39f23e9bca42..c15a1997c1c8 100755
--- a/demo/nvflare/config/config_fed_client.json
+++ b/demo/nvflare/config/config_fed_client.json
@@ -12,7 +12,8 @@
         "world_size": 2,
         "server_cert_path": "server-cert.pem",
         "client_key_path": "client-key.pem",
-        "client_cert_path": "client-cert.pem"
+        "client_cert_path": "client-cert.pem",
+        "use_gpus": false
       }
     }
   }
diff --git a/demo/nvflare/custom/trainer.py b/demo/nvflare/custom/trainer.py
index 9403fec00215..c19d9799f143 100644
--- a/demo/nvflare/custom/trainer.py
+++ b/demo/nvflare/custom/trainer.py
@@ -16,7 +16,7 @@ class SupportedTasks(object):
 
 class XGBoostTrainer(Executor):
     def __init__(self, server_address: str, world_size: int, server_cert_path: str,
-                 client_key_path: str, client_cert_path: str):
+                 client_key_path: str, client_cert_path: str, use_gpus: bool):
         """Trainer for federated XGBoost.
 
         Args:
@@ -32,6 +32,7 @@ def __init__(self, server_address: str, world_size: int, server_cert_path: str,
         self._server_cert_path = server_cert_path
         self._client_key_path = client_key_path
         self._client_cert_path = client_cert_path
+        self._use_gpus = use_gpus
 
     def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext,
                 abort_signal: Signal) -> Shareable:
@@ -66,6 +67,10 @@ def _do_training(self, fl_ctx: FLContext):
 
         # Specify parameters via map, definition are same as c++ version
         param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
+        if self._use_gpus:
+            self.log_info(fl_ctx, f'Training with GPU {rank}')
+            param['tree_method'] = 'gpu_hist'
+            param['gpu_id'] = rank
 
         # Specify validations set to watch performance
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
diff --git a/plugin/federated/README.md b/plugin/federated/README.md
index a5fa95e0c140..b9574b977e60 100644
--- a/plugin/federated/README.md
+++ b/plugin/federated/README.md
@@ -20,7 +20,12 @@ Build the Plugin
 # Under xgboost source tree.
 mkdir build
 cd build
-cmake .. -GNinja -DPLUGIN_FEDERATED=ON
+# For now NCCL needs to be turned off.
+cmake .. -GNinja \
+  -DPLUGIN_FEDERATED=ON \
+  -DUSE_CUDA=ON \
+  -DBUILD_WITH_CUDA_CUB=ON \
+  -DUSE_NCCL=OFF
 ninja
 cd ../python-package
 pip install -e .  # or equivalently python setup.py develop
@@ -31,5 +36,6 @@ Test Federated XGBoost
 ```shell
 # Under xgboost source tree.
 cd tests/distributed
+# This tests both CPU training (`hist`) and GPU training (`gpu_hist`).
 ./runtests-federated.sh
 ```
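
Note (not part of the patch): before launching the federated GPU demo, it can
help to confirm that each device trains with `gpu_hist` at all. The sketch
below is a standalone pre-flight check under stated assumptions: the synthetic
data, the 2-GPU count, and the boosting-round count are made up for the
example; only `tree_method` and `gpu_id` mirror what the trainer sets, with
`gpu_id` standing in for the worker's federated rank.

```python
# Hypothetical pre-flight check: train a tiny model on each GPU the demo
# will use. Assumes 2 GPUs, matching the demo's world_size of 2.
import numpy as np
import xgboost as xgb

for gpu_id in range(2):
    # Synthetic binary-classification data, for illustration only.
    X = np.random.rand(100, 10)
    y = np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)
    # Same GPU parameters the patched trainer adds when use_gpus is true.
    param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic',
             'tree_method': 'gpu_hist', 'gpu_id': gpu_id}
    bst = xgb.train(param, dtrain, num_boost_round=2)
    print(f'GPU {gpu_id} trained {bst.num_boosted_rounds()} rounds')
```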