From 1530ae77f39496e9d5b8fd43a98107bbbaaf5126 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 8 Oct 2022 23:10:44 -0700 Subject: [PATCH 01/26] Set up CMake to fetch gRPC on the fly --- CMakeLists.txt | 12 +++++++ plugin/federated/CMakeLists.txt | 57 ++++++++++++++++++++++++--------- plugin/federated/README.md | 12 ++----- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd087a68f208..147b59eb7b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ include(cmake/Utils.cmake) list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0079 NEW) +cmake_policy(SET CMP0076 NEW) set(CMAKE_POLICY_DEFAULT_CMP0063 NEW) cmake_policy(SET CMP0063 NEW) @@ -117,6 +118,17 @@ endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS)) if (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB)) message(SEND_ERROR "Cannot build with RMM using cub submodule.") endif (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB)) +if (PLUGIN_FEDERATED) + if (CMAKE_CROSSCOMPILING) + message(SEND_ERROR "Cannot cross compile with federated learning support") + endif () + if (BUILD_STATIC_LIB) + message(SEND_ERROR "Cannot build static lib with federated learning support") + endif () + if (R_LIB OR JVM_BINDINGS) + message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.") + endif () +endif () #-- Sanitizer if (USE_SANITIZER) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index 24ba47abfb8e..c56b3ea55de4 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -1,26 +1,51 @@ -# gRPC needs to be installed first. See README.md. 
-find_package(Protobuf CONFIG REQUIRED) -find_package(gRPC CONFIG REQUIRED) -find_package(Threads) +# Download and build gRPC +include(FetchContent) +FetchContent_Declare( + grpc + GIT_REPOSITORY https://github.com/grpc/grpc.git + GIT_TAG v1.49.1 +) +set(FETCHCONTENT_QUIET OFF) +FetchContent_MakeAvailable(grpc) +set(_PROTOBUF_LIBPROTOBUF libprotobuf) +set(_PROTOBUF_PROTOC $) +set(_GRPC_GRPCPP grpc++) +set(_GRPC_CPP_PLUGIN_EXECUTABLE $) + +# Proto file +get_filename_component(federated_proto "federated.proto" ABSOLUTE) +get_filename_component(federated_proto_path "${federated_proto}" PATH) # Generated code from the protobuf definition. -add_library(federated_proto federated.proto) -target_link_libraries(federated_proto PUBLIC protobuf::libprotobuf gRPC::grpc gRPC::grpc++) -target_include_directories(federated_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) -set_property(TARGET federated_proto PROPERTY POSITION_INDEPENDENT_CODE ON) +set(federated_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.cc") +set(federated_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.h") +set(federated_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.cc") +set(federated_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.h") +add_custom_command( + OUTPUT "${federated_srcs}" "${federated_hdrs}" + "${federated_grpc_srcs}" "${federated_grpc_hdrs}" + COMMAND ${_PROTOBUF_PROTOC} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" + -I "${federated_proto_path}" + --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" + "${federated_proto}" + DEPENDS "${federated_proto}") -get_target_property(grpc_cpp_plugin_location gRPC::grpc_cpp_plugin LOCATION) -protobuf_generate(TARGET federated_proto LANGUAGE cpp) -protobuf_generate( - TARGET federated_proto - LANGUAGE grpc - GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc - PLUGIN "protoc-gen-grpc=${grpc_cpp_plugin_location}") +add_library(federated_proto_lib STATIC) +target_sources(federated_proto_lib PRIVATE + 
${federated_srcs} ${federated_hdrs} + ${federated_grpc_srcs} ${federated_grpc_hdrs}) +target_link_libraries(federated_proto_lib PUBLIC + ${_PROTOBUF_LIBPROTOBUF} ${_GRPC_GRPCPP}) +target_include_directories(federated_proto_lib PUBLIC + ${CMAKE_CURRENT_BINARY_DIR}) +set_property(TARGET federated_proto_lib PROPERTY POSITION_INDEPENDENT_CODE ON) # Wrapper for the gRPC client. add_library(federated_client INTERFACE) target_sources(federated_client INTERFACE federated_client.h) -target_link_libraries(federated_client INTERFACE federated_proto) +target_link_libraries(federated_client INTERFACE federated_proto_lib) # Rabit engine for Federated Learning. target_sources(objxgboost PRIVATE federated_server.cc) diff --git a/plugin/federated/README.md b/plugin/federated/README.md index 5858d7cebf50..fe5df38f46b6 100644 --- a/plugin/federated/README.md +++ b/plugin/federated/README.md @@ -3,16 +3,8 @@ XGBoost Plugin for Federated Learning This folder contains the plugin for federated learning. Follow these steps to build and test it. -Install gRPC ------------- -```shell -sudo apt-get install build-essential autoconf libtool pkg-config cmake ninja-build -git clone -b v1.47.0 https://github.com/grpc/grpc -cd grpc -git submodule update --init -cmake -S . -B build -GNinja -DABSL_PROPAGATE_CXX_STD=ON -cmake --build build --target install -``` +Note. Building XGBoost with `-DPLUGIN_FEDERATED=ON` flag will automatically download the gRPC +source code and build it, along with its dependencies. This will increase compilation time. 
Build the Plugin ---------------- From be9edac425c3014e185c4efcbc6cdf5965c06768 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 8 Oct 2022 23:11:08 -0700 Subject: [PATCH 02/26] Always use device 0 in FederatedAdapterTest, as gtest only uses a single GPU --- tests/cpp/plugin/test_federated_adapter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 09187f940c5f..65c673be8c3b 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -54,7 +54,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { for (auto rank = 0; rank < kWorldSize; rank++) { threads.emplace_back(std::thread([rank] { FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; - DeviceCommunicatorAdapter adapter{rank, &comm}; + DeviceCommunicatorAdapter adapter{0, &comm}; int const count = 3; thrust::device_vector buffer(count, 0); thrust::sequence(buffer.begin(), buffer.end()); @@ -76,7 +76,7 @@ TEST_F(FederatedAdapterTest, DeviceAllGatherV) { for (auto rank = 0; rank < kWorldSize; rank++) { threads.emplace_back(std::thread([rank] { FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; - DeviceCommunicatorAdapter adapter{rank, &comm}; + DeviceCommunicatorAdapter adapter{0, &comm}; int const count = rank + 2; thrust::device_vector buffer(count, 0); From 4ce654180b4bb1deee63d6cc40da37833b0b53e3 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 8 Oct 2022 23:24:13 -0700 Subject: [PATCH 03/26] Disallow federated plugin when Windows is used --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 147b59eb7b3e..64aeae29c737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,6 +128,9 @@ if (PLUGIN_FEDERATED) if (R_LIB OR JVM_BINDINGS) message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.") endif () + if (WIN32) 
+ message(SEND_ERROR "Federated learning not supported for Windows platform") + endif () endif () #-- Sanitizer From 962c30ac58c064c64d655cac16f99b499db5f387 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 8 Oct 2022 23:42:02 -0700 Subject: [PATCH 04/26] Add option to use gRPC from system / Conda --- CMakeLists.txt | 2 ++ plugin/federated/CMakeLists.txt | 37 ++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64aeae29c737..cafcef63fc1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,8 @@ address, leak, undefined and thread.") option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(PLUGIN_FEDERATED "Build with Federated Learning" OFF) +option(USE_GRPC_FROM_SYSTEM "Use gRPC from the system; if OFF, CMake will download +gRPC automatically. Only applicable if PLUGIN_FEDERATED=ON" OFF) ## TODO: 1. Add check if DPC++ compiler is used for building option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." 
ON) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index c56b3ea55de4..204d2997357d 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -1,16 +1,29 @@ # Download and build gRPC -include(FetchContent) -FetchContent_Declare( - grpc - GIT_REPOSITORY https://github.com/grpc/grpc.git - GIT_TAG v1.49.1 -) -set(FETCHCONTENT_QUIET OFF) -FetchContent_MakeAvailable(grpc) -set(_PROTOBUF_LIBPROTOBUF libprotobuf) -set(_PROTOBUF_PROTOC $) -set(_GRPC_GRPCPP grpc++) -set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +if (USE_GRPC_FROM_SYSTEM) + message(STATUS "Attempting to locate gRPC installation from the system...") + set(protobuf_MODULE_COMPATIBLE TRUE) + find_package(Protobuf CONFIG REQUIRED) + find_package(gRPC CONFIG REQUIRED) + message(STATUS "Found gRPC: ${gRPC_CONFIG}") + set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) + set(_PROTOBUF_PROTOC $) + set(_GRPC_GRPCPP gRPC::grpc++) + set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +else () + message(STATUS "Downloading gRPC source from GitHub...") + include(FetchContent) + FetchContent_Declare( + grpc + GIT_REPOSITORY https://github.com/grpc/grpc.git + GIT_TAG v1.49.1 + ) + set(FETCHCONTENT_QUIET OFF) + FetchContent_MakeAvailable(grpc) + set(_PROTOBUF_LIBPROTOBUF libprotobuf) + set(_PROTOBUF_PROTOC $) + set(_GRPC_GRPCPP grpc++) + set(_GRPC_CPP_PLUGIN_EXECUTABLE $) +endif () # Proto file get_filename_component(federated_proto "federated.proto" ABSOLUTE) From 157cc3d059cbd6bf264da8c5c5a54deaf834ff4d Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 8 Oct 2022 23:56:05 -0700 Subject: [PATCH 05/26] Remove unused Dockerfile.gpu_build --- tests/ci_build/Dockerfile.gpu_build | 49 ----------------------------- 1 file changed, 49 deletions(-) delete mode 100644 tests/ci_build/Dockerfile.gpu_build diff --git a/tests/ci_build/Dockerfile.gpu_build b/tests/ci_build/Dockerfile.gpu_build deleted file mode 100644 index 0d9f6a27c5ea..000000000000 --- a/tests/ci_build/Dockerfile.gpu_build +++ 
/dev/null @@ -1,49 +0,0 @@ -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu16.04 -ARG CUDA_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:ubuntu-toolchain-r/test && \ - apt-get update && \ - apt-get install -y tar unzip wget bzip2 libgomp1 git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ - # Python - wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3.sh -b -p /opt/python - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=2.13.4-1 && \ - apt-get update && \ - apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/python/bin:$PATH -ENV CC=gcc-8 -ENV CXX=g++-8 -ENV CPP=cpp-8 - -ENV GOSU_VERSION 1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] From ae62808ab7ee0fff5c0d9dfb15f38a4916607e61 Mon Sep 17 00:00:00 2001 
From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 00:12:16 -0700 Subject: [PATCH 06/26] Speed up FetchContent --- plugin/federated/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index 204d2997357d..c3ca9131007b 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -16,6 +16,7 @@ else () grpc GIT_REPOSITORY https://github.com/grpc/grpc.git GIT_TAG v1.49.1 + GIT_SHALLOW ON ) set(FETCHCONTENT_QUIET OFF) FetchContent_MakeAvailable(grpc) From 2b951074934d60bfcae18c779e8885cd2a5aabb2 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 00:23:47 -0700 Subject: [PATCH 07/26] Install gRPC into build containers --- tests/ci_build/Dockerfile.cpu | 9 +++++++++ tests/ci_build/Dockerfile.gpu_build_centos7 | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 786ab834b014..517ed49af3f9 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -26,6 +26,15 @@ ENV CPP=cpp-8 ENV GOSU_VERSION 1.10 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ +# Install gRPC +RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ + --recurse-submodules --depth 1 --shallow-submodules && \ + pushd grpc && \ + cmake -S . 
-B build -GNinja && \ + cmake --build build --target install && \ + popd && \ + rm -rf grpc + # Create new Conda environment COPY conda_env/cpu_test.yml /scripts/ RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index d92bb4984b0e..5954950a1914 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -35,6 +35,15 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp ENV GOSU_VERSION 1.10 +# Install gRPC +RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ + --recurse-submodules --depth 1 && \ + pushd grpc && \ + cmake -S . -B build -GNinja && \ + cmake --build build --target install && \ + popd && \ + rm -rf grpc + # Install lightweight sudo (not bound to TTY) RUN set -ex; \ wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ From c8bf4d40f61fdfd01b962310d05ed751f08bc98f Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 01:25:37 -0700 Subject: [PATCH 08/26] Build CPU and GPU binary with federated learning support --- tests/buildkite/build-cpu.sh | 2 +- tests/buildkite/build-cuda.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 60c84c52ccfb..52908ee2a07c 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -14,7 +14,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # the configured header build/dmlc/build_config.h instead of # include/dmlc/build_config_default.h. 
echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON +$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" $command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index f8efb0853c11..686c346b352e 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -21,9 +21,9 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh $command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \ - ${arch_flag} + -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ + -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" From 59a14d04073bb335b27885321d3a5f1b1ce9203e Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 01:29:48 -0700 Subject: [PATCH 09/26] Update doc, since USE_NCCL can be used --- plugin/federated/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/plugin/federated/README.md b/plugin/federated/README.md index fe5df38f46b6..132e5e2386bc 100644 --- a/plugin/federated/README.md +++ b/plugin/federated/README.md @@ -5,6 +5,8 @@ This folder contains the plugin for federated learning. Follow these steps to bu Note. Building XGBoost with `-DPLUGIN_FEDERATED=ON` flag will automatically download the gRPC source code and build it, along with its dependencies. 
This will increase compilation time. +If you already have gRPC installed on your system, pass additional flag +`-DUSE_GRPC_FROM_SYSTEM=ON` to speed up the build. Build the Plugin ---------------- @@ -12,12 +14,11 @@ Build the Plugin # Under xgboost source tree. mkdir build cd build -# For now NCCL needs to be turned off. cmake .. -GNinja\ -DPLUGIN_FEDERATED=ON\ - -DUSE_CUDA=ON\ -DBUILD_WITH_CUDA_CUB=ON\ - -DUSE_NCCL=OFF + -DUSE_CUDA=ON\ + -DUSE_NCCL=ON ninja cd ../python-package pip install -e . # or equivalently python setup.py develop From e714002161f059b17960cdec11dda8394031b434 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 01:38:56 -0700 Subject: [PATCH 10/26] Explicitly build containers cpu / gpu_build_centos7 --- tests/buildkite/pipeline-mgpu.yml | 1 + tests/buildkite/pipeline.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index 690027da5009..75d7855b6dc9 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -17,6 +17,7 @@ steps: - label: ":docker: Build containers" commands: - "tests/buildkite/build-containers.sh gpu" + - "tests/buildkite/build-containers.sh gpu_build_centos7" - "tests/buildkite/build-containers.sh jvm_gpu_build" key: build-containers agents: diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 8d6ab86d95f6..e2a4fcaf2405 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -13,7 +13,9 @@ steps: #### -------- CONTAINER BUILD -------- - label: ":docker: Build containers" commands: + - "tests/buildkite/build-containers.sh cpu" - "tests/buildkite/build-containers.sh gpu" + - "tests/buildkite/build-containers.sh gpu_build_centos7" - "tests/buildkite/build-containers.sh rmm" key: build-containers agents: From 6052cf2611ec54f38da18fd455f4a30b44b653a0 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 01:45:23 -0700 Subject: [PATCH 11/26] 
Fix --- tests/buildkite/build-containers.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index b12da2a634ed..75353efc8485 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -6,7 +6,7 @@ set -x if [ "$#" -lt 1 ] then echo "Usage: $0 [container to build]" - return 1 + exit 1 fi container=$1 @@ -17,6 +17,8 @@ echo "--- Build container ${container}" BUILD_ARGS="" case "${container}" in + cpu) + ;; gpu|rmm) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" @@ -28,7 +30,7 @@ case "${container}" in *) echo "Unrecognized container ID: ${container}" - return 2 + exit 2 ;; esac From 66b410b71a177853353d25b3cc4f81861cac8ab2 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 02:07:13 -0700 Subject: [PATCH 12/26] Remove option for FetchContent --- CMakeLists.txt | 2 -- plugin/federated/CMakeLists.txt | 36 +++++++++------------------------ plugin/federated/README.md | 21 +++++++++++++++---- 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cafcef63fc1e..64aeae29c737 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,8 +68,6 @@ address, leak, undefined and thread.") option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(PLUGIN_FEDERATED "Build with Federated Learning" OFF) -option(USE_GRPC_FROM_SYSTEM "Use gRPC from the system; if OFF, CMake will download -gRPC automatically. Only applicable if PLUGIN_FEDERATED=ON" OFF) ## TODO: 1. Add check if DPC++ compiler is used for building option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." 
ON) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index c3ca9131007b..f83c24b552ec 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -1,30 +1,12 @@ -# Download and build gRPC -if (USE_GRPC_FROM_SYSTEM) - message(STATUS "Attempting to locate gRPC installation from the system...") - set(protobuf_MODULE_COMPATIBLE TRUE) - find_package(Protobuf CONFIG REQUIRED) - find_package(gRPC CONFIG REQUIRED) - message(STATUS "Found gRPC: ${gRPC_CONFIG}") - set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) - set(_PROTOBUF_PROTOC $) - set(_GRPC_GRPCPP gRPC::grpc++) - set(_GRPC_CPP_PLUGIN_EXECUTABLE $) -else () - message(STATUS "Downloading gRPC source from GitHub...") - include(FetchContent) - FetchContent_Declare( - grpc - GIT_REPOSITORY https://github.com/grpc/grpc.git - GIT_TAG v1.49.1 - GIT_SHALLOW ON - ) - set(FETCHCONTENT_QUIET OFF) - FetchContent_MakeAvailable(grpc) - set(_PROTOBUF_LIBPROTOBUF libprotobuf) - set(_PROTOBUF_PROTOC $) - set(_GRPC_GRPCPP grpc++) - set(_GRPC_CPP_PLUGIN_EXECUTABLE $) -endif () +# gRPC needs to be installed first. See README.md. +set(protobuf_MODULE_COMPATIBLE TRUE) +find_package(Protobuf CONFIG REQUIRED) +find_package(gRPC CONFIG REQUIRED) +message(STATUS "Found gRPC: ${gRPC_CONFIG}") +set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) +set(_PROTOBUF_PROTOC $) +set(_GRPC_GRPCPP gRPC::grpc++) +set(_GRPC_CPP_PLUGIN_EXECUTABLE $) # Proto file get_filename_component(federated_proto "federated.proto" ABSOLUTE) diff --git a/plugin/federated/README.md b/plugin/federated/README.md index 132e5e2386bc..5bf25dfbb807 100644 --- a/plugin/federated/README.md +++ b/plugin/federated/README.md @@ -3,10 +3,23 @@ XGBoost Plugin for Federated Learning This folder contains the plugin for federated learning. Follow these steps to build and test it. -Note. Building XGBoost with `-DPLUGIN_FEDERATED=ON` flag will automatically download the gRPC -source code and build it, along with its dependencies. 
This will increase compilation time. -If you already have gRPC installed on your system, pass additional flag -`-DUSE_GRPC_FROM_SYSTEM=ON` to speed up the build. +Install gRPC +------------ +We highly recommend installing gRPC in a local environment, such as a Conda environment, +by appropriately setting `CMAKE_INSTALL_PREFIX`. +There is no easy way to uninstall gRPC after you've installed it globally. + +In the following example, we show how to build and install gRPC in a Conda environment. +```shell +sudo apt-get install build-essential autoconf libtool pkg-config cmake ninja-build +conda activate your_env +git clone -b v1.49.1 https://github.com/grpc/grpc \ + --recurse-submodules --depth 1 --shallow-submodules +cd grpc +cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX +cmake --build build --target install +cd .. +``` Build the Plugin ---------------- From bb6dbbaac179e3e6208010285e232120ba5e27ba Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 02:08:14 -0700 Subject: [PATCH 13/26] Fix build-containers.sh --- tests/buildkite/build-containers.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index 75353efc8485..41a13eaea5fb 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -19,12 +19,13 @@ BUILD_ARGS="" case "${container}" in cpu) ;; + gpu|rmm) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" ;; - jvm_gpu_build) + gpu_build_centos7|jvm_gpu_build) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" ;; From af7e074f0e752c07034509636ceebaad8a450f84 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sun, 9 Oct 2022 02:45:54 -0700 Subject: [PATCH 14/26] Don't use ctest --- tests/buildkite/build-cpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 52908ee2a07c..aacef36e1028 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -16,7 +16,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" +$command_wrapper bash -c "cd build && ./testxgboost" echo "--- Stash XGBoost CLI executable" buildkite-agent artifact upload ./xgboost From 9238d8b322179f61f93127ffa0aa7fbd76438081 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Mon, 10 Oct 2022 15:36:07 -0700 Subject: [PATCH 15/26] Revert "Don't use ctest" This reverts commit af7e074f0e752c07034509636ceebaad8a450f84. --- tests/buildkite/build-cpu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index aacef36e1028..52908ee2a07c 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -16,7 +16,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ./testxgboost" +$command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" buildkite-agent artifact upload ./xgboost From 90462c0f008199ca16ba53fcc5d0c070fd8c31e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Mon, 10 Oct 2022 18:09:32 -0700 Subject: [PATCH 16/26] Install gRPC in a separate Conda env --- tests/buildkite/build-cpu.sh | 3 ++- tests/buildkite/build-cuda.sh | 6 +++--- tests/ci_build/Dockerfile.cpu | 13 +++++++------ tests/ci_build/Dockerfile.gpu_build_centos7 | 14 ++++++++------ 4 files changed, 
20 insertions(+), 16 deletions(-) diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 52908ee2a07c..a2637c6253d6 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -14,7 +14,8 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # the configured header build/dmlc/build_config.h instead of # include/dmlc/build_config_default.h. echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON +$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=build_env \ + -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" $command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 686c346b352e..703f391293a2 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -20,9 +20,9 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ +$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=build_env \ + -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 517ed49af3f9..c5ca3ea4b81e 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -14,7 +14,7 @@ RUN \ # CMake wget -nv -nc 
https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \ bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ - # Python + # Conda wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ bash Mambaforge-Linux-x86_64.sh -b -p /opt/python @@ -26,19 +26,20 @@ ENV CPP=cpp-8 ENV GOSU_VERSION 1.10 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ +# Create new Conda environment +COPY conda_env/cpu_test.yml /scripts/ +RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml && \ + mamba create -y -n build_env -c conda-forge python=3.9 + # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 --shallow-submodules && \ pushd grpc && \ - cmake -S . -B build -GNinja && \ + cmake -S . -B build -GNinja -DCMAKE_PREFIX_PATH=/opt/python/envs/build_env && \ cmake --build build --target install && \ popd && \ rm -rf grpc -# Create new Conda environment -COPY conda_env/cpu_test.yml /scripts/ -RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml - # Install lightweight sudo (not bound to TTY) RUN set -ex; \ wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 5954950a1914..f1e9c766884f 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -10,13 +10,12 @@ RUN \ yum-config-manager --enable centos-sclo-rh-testing && \ yum -y update && \ yum install -y tar unzip wget xz git which ninja-build devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \ - # Python - wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash Miniconda3.sh -b -p /opt/python && \ - /opt/python/bin/python -m pip install awscli && \ # CMake wget -nv -nc 
https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr + bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ + # Conda + wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ + bash Mambaforge-Linux-x86_64.sh -b -p /opt/python # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ @@ -35,11 +34,14 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp ENV GOSU_VERSION 1.10 +# Create new Conda environment +RUN mamba create -y -n build_env -c conda-forge python=3.9 + # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ pushd grpc && \ - cmake -S . -B build -GNinja && \ + cmake -S . -B build -GNinja -DCMAKE_PREFIX_PATH=/opt/python/envs/build_env && \ cmake --build build --target install && \ popd && \ rm -rf grpc From 69ec17e672daafcb35e8ebef81ab21d03778f08c Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Mon, 10 Oct 2022 23:00:12 -0700 Subject: [PATCH 17/26] Address reviewer's comment --- plugin/federated/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index f83c24b552ec..f5c071383d7f 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -1,5 +1,6 @@ # gRPC needs to be installed first. See README.md. 
set(protobuf_MODULE_COMPATIBLE TRUE) +set(protobuf_BUILD_SHARED_LIBS TRUE) find_package(Protobuf CONFIG REQUIRED) find_package(gRPC CONFIG REQUIRED) message(STATUS "Found gRPC: ${gRPC_CONFIG}") @@ -36,7 +37,7 @@ target_link_libraries(federated_proto_lib PUBLIC ${_PROTOBUF_LIBPROTOBUF} ${_GRPC_GRPCPP}) target_include_directories(federated_proto_lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) -set_property(TARGET federated_proto_lib PROPERTY POSITION_INDEPENDENT_CODE ON) +xgboost_target_properties(federated_proto_lib) # Wrapper for the gRPC client. add_library(federated_client INTERFACE) From 16047762393cfb9c448a80469c2f62e76a75fe14 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Mon, 10 Oct 2022 23:30:18 -0700 Subject: [PATCH 18/26] Use CMAKE_INSTALL_PREFIX --- tests/ci_build/Dockerfile.cpu | 2 +- tests/ci_build/Dockerfile.gpu_build_centos7 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index c5ca3ea4b81e..1f40dcdf918e 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -35,7 +35,7 @@ RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml && \ RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 --shallow-submodules && \ pushd grpc && \ - cmake -S . -B build -GNinja -DCMAKE_PREFIX_PATH=/opt/python/envs/build_env && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/python/envs/build_env && \ cmake --build build --target install && \ popd && \ rm -rf grpc diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index f1e9c766884f..6efac1903f0e 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -41,7 +41,7 @@ RUN mamba create -y -n build_env -c conda-forge python=3.9 RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ pushd grpc && \ - cmake -S . 
-B build -GNinja -DCMAKE_PREFIX_PATH=/opt/python/envs/build_env && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/python/envs/build_env && \ cmake --build build --target install && \ popd && \ rm -rf grpc From 85110d50c39d3344887488284cb6d9ca796886b3 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Tue, 11 Oct 2022 10:56:39 -0700 Subject: [PATCH 19/26] Randomize server port in gtest --- tests/cpp/plugin/test_federated_adapter.cu | 22 +++++--- .../cpp/plugin/test_federated_communicator.cc | 50 +++++++++++++------ tests/cpp/plugin/test_federated_server.cc | 39 +++++++++------ 3 files changed, 74 insertions(+), 37 deletions(-) diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 65c673be8c3b..5b63f74cc4f1 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -6,7 +6,9 @@ #include #include +#include +#include "../helpers.h" #include "../../../plugin/federated/federated_communicator.h" #include "../../../plugin/federated/federated_server.h" #include "../../../src/collective/device_communicator_adapter.cuh" @@ -14,15 +16,20 @@ namespace xgboost { namespace collective { -std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp) - class FederatedAdapterTest : public ::testing::Test { protected: + std::string GetServerAddress() { + SimpleLCG lcg(std::time(NULL)); + std::uniform_int_distribution dist(50000, 60000); + int port = dist(lcg); + return std::string("localhost:") + std::to_string(port); + } void SetUp() override { + server_address_ = GetServerAddress(); server_thread_.reset(new std::thread([this] { grpc::ServerBuilder builder; federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials()); + builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); builder.RegisterService(&service); server_ = builder.BuildAndStart(); server_->Wait(); @@ 
-35,6 +42,7 @@ class FederatedAdapterTest : public ::testing::Test { } static int const kWorldSize{2}; + std::string server_address_; std::unique_ptr server_thread_; std::unique_ptr server_; }; @@ -52,8 +60,8 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) { TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread([rank] { - FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; + threads.emplace_back(std::thread([rank, server_address=server_address_] { + FederatedCommunicator comm{kWorldSize, rank, server_address}; DeviceCommunicatorAdapter adapter{0, &comm}; int const count = 3; thrust::device_vector buffer(count, 0); @@ -74,8 +82,8 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { TEST_F(FederatedAdapterTest, DeviceAllGatherV) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread([rank] { - FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; + threads.emplace_back(std::thread([rank, server_address=server_address_] { + FederatedCommunicator comm{kWorldSize, rank, server_address}; DeviceCommunicatorAdapter adapter{0, &comm}; int const count = rank + 2; diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc index 2d9f233db573..23694b1a8e4b 100644 --- a/tests/cpp/plugin/test_federated_communicator.cc +++ b/tests/cpp/plugin/test_federated_communicator.cc @@ -6,33 +6,41 @@ #include #include +#include +#include "../helpers.h" #include "../../../plugin/federated/federated_communicator.h" #include "../../../plugin/federated/federated_server.h" namespace xgboost { namespace collective { - -std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp) + +std::string GetServerAddress() { + SimpleLCG lcg(std::time(NULL)); + std::uniform_int_distribution dist(50000, 60000); + int port = dist(lcg); + return 
std::string("localhost:") + std::to_string(port); +} class FederatedCommunicatorTest : public ::testing::Test { public: - static void VerifyAllreduce(int rank) { - FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; + static void VerifyAllreduce(int rank, const std::string& server_address) { + FederatedCommunicator comm{kWorldSize, rank, server_address}; CheckAllreduce(comm); } - static void VerifyBroadcast(int rank) { - FederatedCommunicator comm{kWorldSize, rank, kServerAddress}; + static void VerifyBroadcast(int rank, const std::string& server_address) { + FederatedCommunicator comm{kWorldSize, rank, server_address}; CheckBroadcast(comm, rank); } protected: void SetUp() override { + server_address_ = GetServerAddress(); server_thread_.reset(new std::thread([this] { grpc::ServerBuilder builder; federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials()); + builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); builder.RegisterService(&service); server_ = builder.BuildAndStart(); server_->Wait(); @@ -66,40 +74,53 @@ class FederatedCommunicatorTest : public ::testing::Test { } static int const kWorldSize{3}; + std::string server_address_; std::unique_ptr server_thread_; std::unique_ptr server_; }; TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) { - auto construct = []() { FederatedCommunicator comm{0, 0, kServerAddress, "", "", ""}; }; + std::string server_address{GetServerAddress()}; + auto construct = [server_address]() { + FederatedCommunicator comm{0, 0, server_address, "", "", ""}; + }; EXPECT_THROW(construct(), dmlc::Error); } TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooSmall) { - auto construct = []() { FederatedCommunicator comm{1, -1, kServerAddress, "", "", ""}; }; + std::string server_address{GetServerAddress()}; + auto construct = [server_address]() { + FederatedCommunicator comm{1, -1, server_address, "", "", ""}; + }; 
EXPECT_THROW(construct(), dmlc::Error); } TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooBig) { - auto construct = []() { FederatedCommunicator comm{1, 1, kServerAddress, "", "", ""}; }; + std::string server_address{GetServerAddress()}; + auto construct = [server_address]() { + FederatedCommunicator comm{1, 1, server_address, "", "", ""}; + }; EXPECT_THROW(construct(), dmlc::Error); } TEST(FederatedCommunicatorSimpleTest, GetWorldSizeAndRank) { - FederatedCommunicator comm{6, 3, kServerAddress}; + std::string server_address{GetServerAddress()}; + FederatedCommunicator comm{6, 3, server_address}; EXPECT_EQ(comm.GetWorldSize(), 6); EXPECT_EQ(comm.GetRank(), 3); } TEST(FederatedCommunicatorSimpleTest, IsDistributed) { - FederatedCommunicator comm{2, 1, kServerAddress}; + std::string server_address{GetServerAddress()}; + FederatedCommunicator comm{2, 1, server_address}; EXPECT_TRUE(comm.IsDistributed()); } TEST_F(FederatedCommunicatorTest, Allreduce) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank)); + threads.emplace_back( + std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_)); } for (auto &thread : threads) { thread.join(); @@ -109,7 +130,8 @@ TEST_F(FederatedCommunicatorTest, Allreduce) { TEST_F(FederatedCommunicatorTest, Broadcast) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank)); + threads.emplace_back( + std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_)); } for (auto &thread : threads) { thread.join(); diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc index 1c3e4f0bc84c..12b1eb0b8f90 100644 --- a/tests/cpp/plugin/test_federated_server.cc +++ b/tests/cpp/plugin/test_federated_server.cc @@ -5,7 +5,9 @@ #include #include +#include +#include 
"../helpers.h" #include "federated_client.h" #include "federated_server.h" @@ -13,23 +15,23 @@ namespace xgboost { class FederatedServerTest : public ::testing::Test { public: - static void VerifyAllgather(int rank) { - federated::FederatedClient client{kServerAddress, rank}; + static void VerifyAllgather(int rank, const std::string& server_address) { + federated::FederatedClient client{server_address, rank}; CheckAllgather(client, rank); } - static void VerifyAllreduce(int rank) { - federated::FederatedClient client{kServerAddress, rank}; + static void VerifyAllreduce(int rank, const std::string& server_address) { + federated::FederatedClient client{server_address, rank}; CheckAllreduce(client); } - static void VerifyBroadcast(int rank) { - federated::FederatedClient client{kServerAddress, rank}; + static void VerifyBroadcast(int rank, const std::string& server_address) { + federated::FederatedClient client{server_address, rank}; CheckBroadcast(client, rank); } - static void VerifyMixture(int rank) { - federated::FederatedClient client{kServerAddress, rank}; + static void VerifyMixture(int rank, const std::string& server_address) { + federated::FederatedClient client{server_address, rank}; for (auto i = 0; i < 10; i++) { CheckAllgather(client, rank); CheckAllreduce(client); @@ -38,11 +40,18 @@ class FederatedServerTest : public ::testing::Test { } protected: + std::string GetServerAddress() { + SimpleLCG lcg(std::time(NULL)); + std::uniform_int_distribution dist(50000, 60000); + int port = dist(lcg); + return std::string("localhost:") + std::to_string(port); + } void SetUp() override { + server_address_ = GetServerAddress(); server_thread_.reset(new std::thread([this] { grpc::ServerBuilder builder; federated::FederatedService service{kWorldSize}; - builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials()); + builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials()); builder.RegisterService(&service); server_ = 
builder.BuildAndStart(); server_->Wait(); @@ -80,17 +89,15 @@ class FederatedServerTest : public ::testing::Test { } static int const kWorldSize{3}; - static std::string const kServerAddress; + std::string server_address_; std::unique_ptr server_thread_; std::unique_ptr server_; }; -std::string const FederatedServerTest::kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp) - TEST_F(FederatedServerTest, Allgather) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank)); + threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank, server_address_)); } for (auto& thread : threads) { thread.join(); @@ -100,7 +107,7 @@ TEST_F(FederatedServerTest, Allgather) { TEST_F(FederatedServerTest, Allreduce) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank)); + threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank, server_address_)); } for (auto& thread : threads) { thread.join(); @@ -110,7 +117,7 @@ TEST_F(FederatedServerTest, Allreduce) { TEST_F(FederatedServerTest, Broadcast) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank)); + threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank, server_address_)); } for (auto& thread : threads) { thread.join(); @@ -120,7 +127,7 @@ TEST_F(FederatedServerTest, Broadcast) { TEST_F(FederatedServerTest, Mixture) { std::vector threads; for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank)); + threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank, server_address_)); } for (auto& thread : threads) { thread.join(); From b8c71bf5537ee43ccbf36a1311e25d31fe754273 Mon Sep 17 00:00:00 2001 From: 
Hyunsu Philip Cho Date: Tue, 11 Oct 2022 13:39:11 -0700 Subject: [PATCH 20/26] Revert "Install gRPC in a separate Conda env" This reverts commit 90462c0f008199ca16ba53fcc5d0c070fd8c31e4. --- tests/buildkite/build-cpu.sh | 3 +-- tests/buildkite/build-cuda.sh | 6 +++--- tests/ci_build/Dockerfile.cpu | 13 ++++++------- tests/ci_build/Dockerfile.gpu_build_centos7 | 14 ++++++-------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index a2637c6253d6..52908ee2a07c 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -14,8 +14,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # the configured header build/dmlc/build_config.h instead of # include/dmlc/build_config_default.h. echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=build_env \ - -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON +$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" $command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 703f391293a2..686c346b352e 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -20,9 +20,9 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=build_env \ - -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ +$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL_LIB_PATH=ON 
-DNCCL_INCLUDE_DIR=/usr/include \ -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} echo "--- Build binary wheel" $command_wrapper bash -c \ diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 1f40dcdf918e..517ed49af3f9 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -14,7 +14,7 @@ RUN \ # CMake wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \ bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ - # Conda + # Python wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ bash Mambaforge-Linux-x86_64.sh -b -p /opt/python @@ -26,20 +26,19 @@ ENV CPP=cpp-8 ENV GOSU_VERSION 1.10 ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ -# Create new Conda environment -COPY conda_env/cpu_test.yml /scripts/ -RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml && \ - mamba create -y -n build_env -c conda-forge python=3.9 - # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 --shallow-submodules && \ pushd grpc && \ - cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/python/envs/build_env && \ + cmake -S . 
-B build -GNinja && \ cmake --build build --target install && \ popd && \ rm -rf grpc +# Create new Conda environment +COPY conda_env/cpu_test.yml /scripts/ +RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml + # Install lightweight sudo (not bound to TTY) RUN set -ex; \ wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 6efac1903f0e..5954950a1914 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -10,12 +10,13 @@ RUN \ yum-config-manager --enable centos-sclo-rh-testing && \ yum -y update && \ yum install -y tar unzip wget xz git which ninja-build devtoolset-8-gcc devtoolset-8-binutils devtoolset-8-gcc-c++ && \ + # Python + wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3.sh -b -p /opt/python && \ + /opt/python/bin/python -m pip install awscli && \ # CMake wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \ - # Conda - wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \ - bash Mambaforge-Linux-x86_64.sh -b -p /opt/python + bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ @@ -34,14 +35,11 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp ENV GOSU_VERSION 1.10 -# Create new Conda environment -RUN mamba create -y -n build_env -c conda-forge python=3.9 - # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ pushd grpc && \ - cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/python/envs/build_env && \ + cmake -S . 
-B build -GNinja && \ cmake --build build --target install && \ popd && \ rm -rf grpc From 6ac17caedff77dfafa25f1b23ca37b61d6c8fbb7 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Tue, 11 Oct 2022 13:44:53 -0700 Subject: [PATCH 21/26] Install grpc in /opt/grpc --- tests/buildkite/build-cpu.sh | 3 ++- tests/buildkite/build-cuda.sh | 4 ++-- tests/ci_build/Dockerfile.cpu | 2 +- tests/ci_build/Dockerfile.gpu_build_centos7 | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh index 52908ee2a07c..88da7d39504a 100755 --- a/tests/buildkite/build-cpu.sh +++ b/tests/buildkite/build-cpu.sh @@ -14,7 +14,8 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h # the configured header build/dmlc/build_config.h instead of # include/dmlc/build_config_default.h. echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ + -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON echo "--- Run Google Test" $command_wrapper bash -c "cd build && ctest --extra-verbose" echo "--- Stash XGBoost CLI executable" diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index 686c346b352e..a50963f7c7fc 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -20,8 +20,8 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ + -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ -DUSE_NCCL_LIB_PATH=ON 
-DNCCL_INCLUDE_DIR=/usr/include \ -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} echo "--- Build binary wheel" diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index 517ed49af3f9..5111f4d00184 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -30,7 +30,7 @@ ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 --shallow-submodules && \ pushd grpc && \ - cmake -S . -B build -GNinja && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \ cmake --build build --target install && \ popd && \ rm -rf grpc diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 5954950a1914..b6b38575bd6a 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -39,7 +39,7 @@ ENV GOSU_VERSION 1.10 RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ pushd grpc && \ - cmake -S . -B build -GNinja && \ + cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \ cmake --build build --target install && \ popd && \ rm -rf grpc From 91280ec0fd5580bf29eb909a75c749fa4ebce67a Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Tue, 11 Oct 2022 13:46:54 -0700 Subject: [PATCH 22/26] Update doc --- plugin/federated/README.md | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/plugin/federated/README.md b/plugin/federated/README.md index 5bf25dfbb807..061cb77149d0 100644 --- a/plugin/federated/README.md +++ b/plugin/federated/README.md @@ -5,21 +5,7 @@ This folder contains the plugin for federated learning. Follow these steps to bu Install gRPC ------------ -We highly recommend installing gRPC in a local environment, such as a Conda environment, -by appropriately setting `CMAKE_INSTALL_PREFIX`. 
-There is no easy way to uninstall gRPC after you've installed it globally. - -In the following example, we show how to build and install gRPC in a Conda environment. -```shell -sudo apt-get install build-essential autoconf libtool pkg-config cmake ninja-build -conda activate your_env -git clone -b v1.49.1 https://github.com/grpc/grpc \ - --recurse-submodules --depth 1 --shallow-submodules -cd grpc -cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -cmake --build build --target install -cd .. -``` +Refer to the [installation guide from the gRPC website](https://grpc.io/docs/languages/cpp/quickstart/). Build the Plugin ---------------- @@ -27,15 +13,16 @@ Build the Plugin # Under xgboost source tree. mkdir build cd build -cmake .. -GNinja\ - -DPLUGIN_FEDERATED=ON\ - -DBUILD_WITH_CUDA_CUB=ON\ +cmake .. -GNinja \ + -DPLUGIN_FEDERATED=ON \ + -DBUILD_WITH_CUDA_CUB=ON \ -DUSE_CUDA=ON\ -DUSE_NCCL=ON ninja cd ../python-package pip install -e . # or equivalently python setup.py develop ``` +If CMake fails to locate gRPC, you may need to pass `-DCMAKE_PREFIX_PATH=<grpc path>` to CMake. 
Test Federated XGBoost ---------------------- From 22882cdbd0618ea9acc6374eb7a733fcf1a03e8f Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Tue, 11 Oct 2022 14:32:57 -0700 Subject: [PATCH 23/26] Simplify Protobuf codegen logic in CMake --- plugin/federated/CMakeLists.txt | 51 ++++++++++++--------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index f5c071383d7f..f2023a651e9b 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -4,45 +4,30 @@ set(protobuf_BUILD_SHARED_LIBS TRUE) find_package(Protobuf CONFIG REQUIRED) find_package(gRPC CONFIG REQUIRED) message(STATUS "Found gRPC: ${gRPC_CONFIG}") -set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) -set(_PROTOBUF_PROTOC $) -set(_GRPC_GRPCPP gRPC::grpc++) -set(_GRPC_CPP_PLUGIN_EXECUTABLE $) -# Proto file -get_filename_component(federated_proto "federated.proto" ABSOLUTE) -get_filename_component(federated_proto_path "${federated_proto}" PATH) +# Generate code from federated.proto +add_library(federated_proto STATIC federated.proto) +set(PROTO_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/proto-gen") +target_link_libraries(federated_proto PUBLIC + protobuf::libprotobuf gRPC::grpc++) +target_include_directories(federated_proto PUBLIC + "$") +xgboost_target_properties(federated_proto) -# Generated code from the protobuf definition. 
-set(federated_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.cc") -set(federated_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.h") -set(federated_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.cc") -set(federated_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.h") -add_custom_command( - OUTPUT "${federated_srcs}" "${federated_hdrs}" - "${federated_grpc_srcs}" "${federated_grpc_hdrs}" - COMMAND ${_PROTOBUF_PROTOC} - ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" - --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" - -I "${federated_proto_path}" - --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}" - "${federated_proto}" - DEPENDS "${federated_proto}") - -add_library(federated_proto_lib STATIC) -target_sources(federated_proto_lib PRIVATE - ${federated_srcs} ${federated_hdrs} - ${federated_grpc_srcs} ${federated_grpc_hdrs}) -target_link_libraries(federated_proto_lib PUBLIC - ${_PROTOBUF_LIBPROTOBUF} ${_GRPC_GRPCPP}) -target_include_directories(federated_proto_lib PUBLIC - ${CMAKE_CURRENT_BINARY_DIR}) -xgboost_target_properties(federated_proto_lib) +protobuf_generate( + TARGET federated_proto + PROTOC_OUT_DIR "${PROTO_BINARY_DIR}") +protobuf_generate( + TARGET federated_proto + LANGUAGE grpc + GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc + PLUGIN "protoc-gen-grpc=\$" + PROTOC_OUT_DIR "${PROTO_BINARY_DIR}") # Wrapper for the gRPC client. add_library(federated_client INTERFACE) target_sources(federated_client INTERFACE federated_client.h) -target_link_libraries(federated_client INTERFACE federated_proto_lib) +target_link_libraries(federated_client INTERFACE federated_proto) # Rabit engine for Federated Learning. 
target_sources(objxgboost PRIVATE federated_server.cc) From ad6336063a5abb8faf211741b4507526e048c411 Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Tue, 11 Oct 2022 14:35:08 -0700 Subject: [PATCH 24/26] Specify language --- plugin/federated/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/plugin/federated/CMakeLists.txt b/plugin/federated/CMakeLists.txt index f2023a651e9b..471588ba2ce9 100644 --- a/plugin/federated/CMakeLists.txt +++ b/plugin/federated/CMakeLists.txt @@ -16,6 +16,7 @@ xgboost_target_properties(federated_proto) protobuf_generate( TARGET federated_proto + LANGUAGE cpp PROTOC_OUT_DIR "${PROTO_BINARY_DIR}") protobuf_generate( TARGET federated_proto From 90a0768b359e253cb3a32ecc0542807ed235ba8d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Oct 2022 23:15:15 +0000 Subject: [PATCH 25/26] Ensure that server port is actually random --- tests/cpp/plugin/helpers.cc | 19 ++++++++++++++++++ tests/cpp/plugin/helpers.h | 10 ++++++++++ tests/cpp/plugin/test_federated_adapter.cu | 19 +++++++++++------- .../cpp/plugin/test_federated_communicator.cc | 20 +++++++++++-------- tests/cpp/plugin/test_federated_server.cc | 19 +++++++++++------- 5 files changed, 65 insertions(+), 22 deletions(-) create mode 100644 tests/cpp/plugin/helpers.cc create mode 100644 tests/cpp/plugin/helpers.h diff --git a/tests/cpp/plugin/helpers.cc b/tests/cpp/plugin/helpers.cc new file mode 100644 index 000000000000..a70479b1bb1c --- /dev/null +++ b/tests/cpp/plugin/helpers.cc @@ -0,0 +1,19 @@ +#include +#include +#include +#include + +#include "helpers.h" + +using namespace std::chrono_literals; + +int GenerateRandomPort(int low, int high) { + // Ensure unique timestamp by introducing a small artificial delay + std::this_thread::sleep_for(100ms); + auto timestamp = static_cast(std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count()); + std::mt19937_64 rng(timestamp); + std::uniform_int_distribution dist(low, high); + int port 
= dist(rng); + return port; +} diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h new file mode 100644 index 000000000000..ea72f1538af6 --- /dev/null +++ b/tests/cpp/plugin/helpers.h @@ -0,0 +1,10 @@ +/*! + * Copyright 2022 XGBoost contributors + */ + +#ifndef XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ +#define XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ + +int GenerateRandomPort(int low, int high); + +#endif // XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_ diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 5b63f74cc4f1..859e0af9b32b 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -5,25 +5,30 @@ #include #include +#include #include #include -#include "../helpers.h" +#include "./helpers.h" #include "../../../plugin/federated/federated_communicator.h" #include "../../../plugin/federated/federated_server.h" #include "../../../src/collective/device_communicator_adapter.cuh" +namespace { + +std::string GetServerAddress() { + int port = GenerateRandomPort(50000, 60000); + std::string address = std::string("localhost:") + std::to_string(port); + return address; +} + +} // anonymous namespace + namespace xgboost { namespace collective { class FederatedAdapterTest : public ::testing::Test { protected: - std::string GetServerAddress() { - SimpleLCG lcg(std::time(NULL)); - std::uniform_int_distribution dist(50000, 60000); - int port = dist(lcg); - return std::string("localhost:") + std::to_string(port); - } void SetUp() override { server_address_ = GetServerAddress(); server_thread_.reset(new std::thread([this] { diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc index 23694b1a8e4b..4ec67f38ffb6 100644 --- a/tests/cpp/plugin/test_federated_communicator.cc +++ b/tests/cpp/plugin/test_federated_communicator.cc @@ -5,23 +5,27 @@ #include #include +#include #include #include -#include "../helpers.h" +#include 
"helpers.h" #include "../../../plugin/federated/federated_communicator.h" #include "../../../plugin/federated/federated_server.h" -namespace xgboost { -namespace collective { - +namespace { + std::string GetServerAddress() { - SimpleLCG lcg(std::time(NULL)); - std::uniform_int_distribution dist(50000, 60000); - int port = dist(lcg); - return std::string("localhost:") + std::to_string(port); + int port = GenerateRandomPort(50000, 60000); + std::string address = std::string("localhost:") + std::to_string(port); + return address; } +} // anonymous namespace + +namespace xgboost { +namespace collective { + class FederatedCommunicatorTest : public ::testing::Test { public: static void VerifyAllreduce(int rank, const std::string& server_address) { diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc index 12b1eb0b8f90..2e7afe5a294d 100644 --- a/tests/cpp/plugin/test_federated_server.cc +++ b/tests/cpp/plugin/test_federated_server.cc @@ -4,13 +4,24 @@ #include #include +#include #include #include -#include "../helpers.h" +#include "helpers.h" #include "federated_client.h" #include "federated_server.h" +namespace { + +std::string GetServerAddress() { + int port = GenerateRandomPort(50000, 60000); + std::string address = std::string("localhost:") + std::to_string(port); + return address; +} + +} // anonymous namespace + namespace xgboost { class FederatedServerTest : public ::testing::Test { @@ -40,12 +51,6 @@ class FederatedServerTest : public ::testing::Test { } protected: - std::string GetServerAddress() { - SimpleLCG lcg(std::time(NULL)); - std::uniform_int_distribution dist(50000, 60000); - int port = dist(lcg); - return std::string("localhost:") + std::to_string(port); - } void SetUp() override { server_address_ = GetServerAddress(); server_thread_.reset(new std::thread([this] { From 7ea3255538070936ee582d53df8cfadf43c38350 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Oct 2022 23:17:47 +0000 Subject: [PATCH 
26/26] Add comment about use of device 0 --- tests/cpp/plugin/test_federated_adapter.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 859e0af9b32b..794c60909e76 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -67,6 +67,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { for (auto rank = 0; rank < kWorldSize; rank++) { threads.emplace_back(std::thread([rank, server_address=server_address_] { FederatedCommunicator comm{kWorldSize, rank, server_address}; + // Assign device 0 to all workers, since we run gtest in a single-GPU machine DeviceCommunicatorAdapter adapter{0, &comm}; int const count = 3; thrust::device_vector buffer(count, 0); @@ -89,6 +90,7 @@ TEST_F(FederatedAdapterTest, DeviceAllGatherV) { for (auto rank = 0; rank < kWorldSize; rank++) { threads.emplace_back(std::thread([rank, server_address=server_address_] { FederatedCommunicator comm{kWorldSize, rank, server_address}; + // Assign device 0 to all workers, since we run gtest in a single-GPU machine DeviceCommunicatorAdapter adapter{0, &comm}; int const count = rank + 2;