Skip to content

Commit

Permalink
[CI] Test federated learning plugin in the CI (#8325)
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 committed Oct 12, 2022
1 parent 97a5b08 commit 2faa744
Show file tree
Hide file tree
Showing 16 changed files with 190 additions and 117 deletions.
15 changes: 15 additions & 0 deletions CMakeLists.txt
Expand Up @@ -4,6 +4,7 @@ include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
cmake_policy(SET CMP0079 NEW)
cmake_policy(SET CMP0076 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
cmake_policy(SET CMP0063 NEW)

Expand Down Expand Up @@ -117,6 +118,20 @@ endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
if (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB))
message(SEND_ERROR "Cannot build with RMM using cub submodule.")
endif (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB))
# Federated learning plugin: report an error for every build configuration
# that it cannot be combined with. SEND_ERROR lets configuration continue, so
# all violated constraints are listed in a single run before generation is
# skipped.
if(PLUGIN_FEDERATED)
  # Cross-compiled builds are rejected.
  if(CMAKE_CROSSCOMPILING)
    message(SEND_ERROR "Cannot cross compile with federated learning support")
  endif()

  # The plugin cannot be combined with a static libxgboost.
  if(BUILD_STATIC_LIB)
    message(SEND_ERROR "Cannot build static lib with federated learning support")
  endif()

  # Neither the R package nor the JVM bindings may be enabled together with it.
  if(R_LIB OR JVM_BINDINGS)
    message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.")
  endif()

  # Windows is not a supported platform for the plugin.
  if(WIN32)
    message(SEND_ERROR "Federated learning not supported for Windows platform")
  endif()
endif()

#-- Sanitizer
if (USE_SANITIZER)
Expand Down
13 changes: 9 additions & 4 deletions plugin/federated/CMakeLists.txt
@@ -1,21 +1,26 @@
# gRPC needs to be installed first. See README.md.
set(protobuf_MODULE_COMPATIBLE TRUE)
set(protobuf_BUILD_SHARED_LIBS TRUE)
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
find_package(Threads)
message(STATUS "Found gRPC: ${gRPC_CONFIG}")

# Generated code from the protobuf definition.
add_library(federated_proto federated.proto)
target_link_libraries(federated_proto PUBLIC protobuf::libprotobuf gRPC::grpc gRPC::grpc++)
target_include_directories(federated_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
xgboost_target_properties(federated_proto)

get_target_property(grpc_cpp_plugin_location gRPC::grpc_cpp_plugin LOCATION)
protobuf_generate(TARGET federated_proto LANGUAGE cpp)
protobuf_generate(
TARGET federated_proto
LANGUAGE cpp
PROTOC_OUT_DIR "${PROTO_BINARY_DIR}")
protobuf_generate(
TARGET federated_proto
LANGUAGE grpc
GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc
PLUGIN "protoc-gen-grpc=${grpc_cpp_plugin_location}")
PLUGIN "protoc-gen-grpc=\$<TARGET_FILE:gRPC::grpc_cpp_plugin>"
PROTOC_OUT_DIR "${PROTO_BINARY_DIR}")

# Wrapper for the gRPC client.
add_library(federated_client INTERFACE)
Expand Down
19 changes: 6 additions & 13 deletions plugin/federated/README.md
Expand Up @@ -5,31 +5,24 @@ This folder contains the plugin for federated learning. Follow these steps to bu

Install gRPC
------------
```shell
sudo apt-get install build-essential autoconf libtool pkg-config cmake ninja-build
git clone -b v1.47.0 https://github.com/grpc/grpc
cd grpc
git submodule update --init
cmake -S . -B build -GNinja -DABSL_PROPAGATE_CXX_STD=ON
cmake --build build --target install
```
Refer to the [installation guide from the gRPC website](https://grpc.io/docs/languages/cpp/quickstart/).

Build the Plugin
----------------
```shell
# Under xgboost source tree.
mkdir build
cd build
# For now NCCL needs to be turned off.
cmake .. -GNinja\
-DPLUGIN_FEDERATED=ON\
cmake .. -GNinja \
-DPLUGIN_FEDERATED=ON \
-DBUILD_WITH_CUDA_CUB=ON \
-DUSE_CUDA=ON\
-DBUILD_WITH_CUDA_CUB=ON\
-DUSE_NCCL=OFF
-DUSE_NCCL=ON
ninja
cd ../python-package
pip install -e . # or equivalently python setup.py develop
```
If CMake fails to locate gRPC, you may need to pass `-DCMAKE_PREFIX_PATH=<grpc path>` to CMake.

Test Federated XGBoost
----------------------
Expand Down
9 changes: 6 additions & 3 deletions tests/buildkite/build-containers.sh
Expand Up @@ -6,7 +6,7 @@ set -x
if [ "$#" -lt 1 ]
then
echo "Usage: $0 [container to build]"
return 1
exit 1
fi
container=$1

Expand All @@ -17,18 +17,21 @@ echo "--- Build container ${container}"
BUILD_ARGS=""

case "${container}" in
cpu)
;;

gpu|rmm)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
;;

jvm_gpu_build)
gpu_build_centos7|jvm_gpu_build)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
;;

*)
echo "Unrecognized container ID: ${container}"
return 2
exit 2
;;
esac

Expand Down
3 changes: 2 additions & 1 deletion tests/buildkite/build-cpu.sh
Expand Up @@ -14,7 +14,8 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
# the configured header build/dmlc/build_config.h instead of
# include/dmlc/build_config_default.h.
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
-DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Stash XGBoost CLI executable"
Expand Down
8 changes: 4 additions & 4 deletions tests/buildkite/build-cuda.sh
Expand Up @@ -20,10 +20,10 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg

echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \
-DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \
${arch_flag}
$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
-DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
Expand Down
1 change: 1 addition & 0 deletions tests/buildkite/pipeline-mgpu.yml
Expand Up @@ -17,6 +17,7 @@ steps:
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh jvm_gpu_build"
key: build-containers
agents:
Expand Down
2 changes: 2 additions & 0 deletions tests/buildkite/pipeline.yml
Expand Up @@ -13,7 +13,9 @@ steps:
#### -------- CONTAINER BUILD --------
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh cpu"
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh rmm"
key: build-containers
agents:
Expand Down
9 changes: 9 additions & 0 deletions tests/ci_build/Dockerfile.cpu
Expand Up @@ -26,6 +26,15 @@ ENV CPP=cpp-8
ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

# Install gRPC
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 --shallow-submodules && \
pushd grpc && \
cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \
cmake --build build --target install && \
popd && \
rm -rf grpc

# Create new Conda environment
COPY conda_env/cpu_test.yml /scripts/
RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml
Expand Down
49 changes: 0 additions & 49 deletions tests/ci_build/Dockerfile.gpu_build

This file was deleted.

9 changes: 9 additions & 0 deletions tests/ci_build/Dockerfile.gpu_build_centos7
Expand Up @@ -35,6 +35,15 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp

ENV GOSU_VERSION 1.10

# Install gRPC
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 && \
pushd grpc && \
cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \
cmake --build build --target install && \
popd && \
rm -rf grpc

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
Expand Down
19 changes: 19 additions & 0 deletions tests/cpp/plugin/helpers.cc
@@ -0,0 +1,19 @@
#include <chrono>
#include <thread>
#include <random>
#include <cstdint>

#include "helpers.h"

using namespace std::chrono_literals;

/*!
 * \brief Pick a pseudo-random port number in the inclusive range [low, high].
 *
 * The generator is seeded with entropy from std::random_device mixed with the
 * current wall-clock time in milliseconds. Mixing in device entropy means two
 * calls (or two test processes) that happen within the same millisecond still
 * get different seeds, so no artificial delay is needed between calls.
 *
 * This function only draws a number; it does not check that the port is free.
 *
 * \param low  One end of the port range (inclusive).
 * \param high The other end of the port range (inclusive). If the bounds are
 *             given in reverse order they are swapped.
 * \return A port number p with min(low, high) <= p <= max(low, high).
 */
int GenerateRandomPort(int low, int high) {
  // std::uniform_int_distribution requires low <= high (otherwise the
  // behaviour is undefined), so normalise reversed bounds first.
  if (low > high) {
    int const tmp = low;
    low = high;
    high = tmp;
  }

  auto const timestamp = static_cast<std::uint64_t>(
      std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::system_clock::now().time_since_epoch()).count());

  // Two 32-bit draws from the entropy source form a 64-bit value; XOR-ing in
  // the timestamp keeps the seed varying even on platforms where
  // std::random_device is deterministic.
  std::random_device entropy;
  std::uint64_t const hi_bits = static_cast<std::uint64_t>(entropy());
  std::uint64_t const lo_bits = static_cast<std::uint64_t>(entropy());
  std::uint64_t const seed = ((hi_bits << 32) ^ lo_bits) ^ timestamp;

  std::mt19937_64 rng(seed);
  std::uniform_int_distribution<int> dist(low, high);
  return dist(rng);
}
10 changes: 10 additions & 0 deletions tests/cpp/plugin/helpers.h
@@ -0,0 +1,10 @@
/*!
* Copyright 2022 XGBoost contributors
*/

#ifndef XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
#define XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_

/*!
 * \brief Generate a pseudo-random port number for use by the plugin tests.
 *
 * The value is drawn from a uniform integer distribution over [low, high].
 * The function does not check whether the returned port is currently free.
 *
 * \param low  Lower bound of the port range (inclusive).
 * \param high Upper bound of the port range (inclusive).
 * \return The selected port number.
 */
int GenerateRandomPort(int low, int high);

#endif // XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
33 changes: 24 additions & 9 deletions tests/cpp/plugin/test_federated_adapter.cu
Expand Up @@ -5,24 +5,36 @@
#include <gtest/gtest.h>
#include <thrust/host_vector.h>

#include <iostream>
#include <thread>
#include <ctime>

#include "./helpers.h"
#include "../../../plugin/federated/federated_communicator.h"
#include "../../../plugin/federated/federated_server.h"
#include "../../../src/collective/device_communicator_adapter.cuh"

namespace {

// Compose a "localhost:<port>" address for the test server, where <port> is
// obtained from GenerateRandomPort with the bounds 50000 and 60000.
std::string GetServerAddress() {
  return "localhost:" + std::to_string(GenerateRandomPort(50000, 60000));
}

} // anonymous namespace

namespace xgboost {
namespace collective {

std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)

class FederatedAdapterTest : public ::testing::Test {
protected:
void SetUp() override {
server_address_ = GetServerAddress();
server_thread_.reset(new std::thread([this] {
grpc::ServerBuilder builder;
federated::FederatedService service{kWorldSize};
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
server_ = builder.BuildAndStart();
server_->Wait();
Expand All @@ -35,6 +47,7 @@ class FederatedAdapterTest : public ::testing::Test {
}

static int const kWorldSize{2};
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
};
Expand All @@ -52,9 +65,10 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
threads.emplace_back(std::thread([rank, server_address=server_address_] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};
int const count = 3;
thrust::device_vector<double> buffer(count, 0);
thrust::sequence(buffer.begin(), buffer.end());
Expand All @@ -74,9 +88,10 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
threads.emplace_back(std::thread([rank, server_address=server_address_] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};

int const count = rank + 2;
thrust::device_vector<char> buffer(count, 0);
Expand Down

0 comments on commit 2faa744

Please sign in to comment.