Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] Test federated learning plugin in the CI #8325

Merged
merged 27 commits into from Oct 12, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
1530ae7
Set up CMake to fetch gRPC on the fly
hcho3 Oct 9, 2022
be9edac
Always use device 0 in FederatedAdapterTest, as gtest only uses a sin…
hcho3 Oct 9, 2022
4ce6541
Disallow federated plugin when Windows is used
hcho3 Oct 9, 2022
962c30a
Add option to use gRPC from system / Conda
hcho3 Oct 9, 2022
157cc3d
Remove unused Dockerfile.gpu_build
hcho3 Oct 9, 2022
ae62808
Speed up FetchContent
hcho3 Oct 9, 2022
2b95107
Install gRPC into build containers
hcho3 Oct 9, 2022
c8bf4d4
Build CPU and GPU binary with federated learning support
hcho3 Oct 9, 2022
59a14d0
Update doc, since USE_NCCL can be used
hcho3 Oct 9, 2022
e714002
Explicitly build containers cpu / gpu_build_centos7
hcho3 Oct 9, 2022
6052cf2
Fix
hcho3 Oct 9, 2022
66b410b
Remove option for FetchContent
hcho3 Oct 9, 2022
bb6dbba
Fix build-containers.sh
hcho3 Oct 9, 2022
af7e074
Don't use ctest
hcho3 Oct 9, 2022
9238d8b
Revert "Don't use ctest"
hcho3 Oct 10, 2022
90462c0
Install gRPC in a separate Conda env
hcho3 Oct 11, 2022
69ec17e
Address reviewer's comment
hcho3 Oct 11, 2022
1604776
Use CMAKE_INSTALL_PREFIX
hcho3 Oct 11, 2022
85110d5
Randomize server port in gtest
hcho3 Oct 11, 2022
b8c71bf
Revert "Install gRPC in a separate Conda env"
hcho3 Oct 11, 2022
6ac17ca
Install grpc in /opt/grpc
hcho3 Oct 11, 2022
91280ec
Update doc
hcho3 Oct 11, 2022
22882cd
Simplify Protobuf codegen logic in CMake
hcho3 Oct 11, 2022
ad63360
Specify language
hcho3 Oct 11, 2022
90a0768
Ensure that server port is actually random
hcho3 Oct 11, 2022
7ea3255
Add comment about use of device 0
hcho3 Oct 11, 2022
c44744f
Merge remote-tracking branch 'origin/master' into setup_grpc
hcho3 Oct 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 17 additions & 0 deletions CMakeLists.txt
Expand Up @@ -4,6 +4,7 @@ include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
cmake_policy(SET CMP0079 NEW)
cmake_policy(SET CMP0076 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0063 NEW)
cmake_policy(SET CMP0063 NEW)

Expand Down Expand Up @@ -67,6 +68,8 @@ address, leak, undefined and thread.")
option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF)
option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF)
option(PLUGIN_FEDERATED "Build with Federated Learning" OFF)
option(USE_GRPC_FROM_SYSTEM "Use gRPC from the system; if OFF, CMake will download
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
gRPC automatically. Only applicable if PLUGIN_FEDERATED=ON" OFF)
## TODO: 1. Add check if DPC++ compiler is used for building
option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF)
option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)
Expand Down Expand Up @@ -117,6 +120,20 @@ endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS))
if (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB))
message(SEND_ERROR "Cannot build with RMM using cub submodule.")
endif (PLUGIN_RMM AND (NOT BUILD_WITH_CUDA_CUB))
if (PLUGIN_FEDERATED)
if (CMAKE_CROSSCOMPILING)
message(SEND_ERROR "Cannot cross compile with federated learning support")
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
endif ()
if (BUILD_STATIC_LIB)
message(SEND_ERROR "Cannot build static lib with federated learning support")
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
endif ()
if (R_LIB OR JVM_BINDINGS)
message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.")
endif ()
if (WIN32)
message(SEND_ERROR "Federated learning not supported for Windows platform")
endif ()
endif ()

#-- Sanitizer
if (USE_SANITIZER)
Expand Down
71 changes: 55 additions & 16 deletions plugin/federated/CMakeLists.txt
@@ -1,26 +1,65 @@
# gRPC needs to be installed first. See README.md.
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
find_package(Threads)
# Download and build gRPC
if (USE_GRPC_FROM_SYSTEM)
message(STATUS "Attempting to locate gRPC installation from the system...")
set(protobuf_MODULE_COMPATIBLE TRUE)
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
message(STATUS "Found gRPC: ${gRPC_CONFIG}")
set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
set(_GRPC_GRPCPP gRPC::grpc++)
set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
else ()
message(STATUS "Downloading gRPC source from GitHub...")
include(FetchContent)
FetchContent_Declare(
grpc
GIT_REPOSITORY https://github.com/grpc/grpc.git
GIT_TAG v1.49.1
GIT_SHALLOW ON
)
set(FETCHCONTENT_QUIET OFF)
FetchContent_MakeAvailable(grpc)
set(_PROTOBUF_LIBPROTOBUF libprotobuf)
set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>)
set(_GRPC_GRPCPP grpc++)
set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
endif ()

# Proto file
get_filename_component(federated_proto "federated.proto" ABSOLUTE)
get_filename_component(federated_proto_path "${federated_proto}" PATH)

# Generated code from the protobuf definition.
add_library(federated_proto federated.proto)
target_link_libraries(federated_proto PUBLIC protobuf::libprotobuf gRPC::grpc gRPC::grpc++)
target_include_directories(federated_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
set_property(TARGET federated_proto PROPERTY POSITION_INDEPENDENT_CODE ON)
set(federated_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.cc")
set(federated_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.pb.h")
set(federated_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.cc")
set(federated_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/federated.grpc.pb.h")
add_custom_command(
OUTPUT "${federated_srcs}" "${federated_hdrs}"
"${federated_grpc_srcs}" "${federated_grpc_hdrs}"
COMMAND ${_PROTOBUF_PROTOC}
ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
--cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
-I "${federated_proto_path}"
--plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
"${federated_proto}"
DEPENDS "${federated_proto}")

get_target_property(grpc_cpp_plugin_location gRPC::grpc_cpp_plugin LOCATION)
protobuf_generate(TARGET federated_proto LANGUAGE cpp)
protobuf_generate(
TARGET federated_proto
LANGUAGE grpc
GENERATE_EXTENSIONS .grpc.pb.h .grpc.pb.cc
PLUGIN "protoc-gen-grpc=${grpc_cpp_plugin_location}")
add_library(federated_proto_lib STATIC)
target_sources(federated_proto_lib PRIVATE
${federated_srcs} ${federated_hdrs}
${federated_grpc_srcs} ${federated_grpc_hdrs})
target_link_libraries(federated_proto_lib PUBLIC
${_PROTOBUF_LIBPROTOBUF} ${_GRPC_GRPCPP})
target_include_directories(federated_proto_lib PUBLIC
${CMAKE_CURRENT_BINARY_DIR})
set_property(TARGET federated_proto_lib PROPERTY POSITION_INDEPENDENT_CODE ON)

# Wrapper for the gRPC client.
add_library(federated_client INTERFACE)
target_sources(federated_client INTERFACE federated_client.h)
target_link_libraries(federated_client INTERFACE federated_proto)
target_link_libraries(federated_client INTERFACE federated_proto_lib)

# Rabit engine for Federated Learning.
target_sources(objxgboost PRIVATE federated_server.cc)
Expand Down
19 changes: 6 additions & 13 deletions plugin/federated/README.md
Expand Up @@ -3,29 +3,22 @@ XGBoost Plugin for Federated Learning

This folder contains the plugin for federated learning. Follow these steps to build and test it.

Install gRPC
------------
```shell
sudo apt-get install build-essential autoconf libtool pkg-config cmake ninja-build
git clone -b v1.47.0 https://github.com/grpc/grpc
cd grpc
git submodule update --init
cmake -S . -B build -GNinja -DABSL_PROPAGATE_CXX_STD=ON
cmake --build build --target install
```
Note: Building XGBoost with the `-DPLUGIN_FEDERATED=ON` flag will automatically download the gRPC
source code and build it, along with its dependencies. This will increase compilation time.
If you already have gRPC installed on your system, pass the additional flag
`-DUSE_GRPC_FROM_SYSTEM=ON` to speed up the build.

Build the Plugin
----------------
```shell
# Under xgboost source tree.
mkdir build
cd build
# For now NCCL needs to be turned off.
cmake .. -GNinja\
-DPLUGIN_FEDERATED=ON\
-DUSE_CUDA=ON\
-DBUILD_WITH_CUDA_CUB=ON\
-DUSE_NCCL=OFF
-DUSE_CUDA=ON\
-DUSE_NCCL=ON
ninja
cd ../python-package
pip install -e . # or equivalently python setup.py develop
Expand Down
2 changes: 1 addition & 1 deletion tests/buildkite/build-cpu.sh
Expand Up @@ -14,7 +14,7 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
# the configured header build/dmlc/build_config.h instead of
# include/dmlc/build_config_default.h.
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Stash XGBoost CLI executable"
Expand Down
6 changes: 3 additions & 3 deletions tests/buildkite/build-cuda.sh
Expand Up @@ -21,9 +21,9 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \
-DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \
${arch_flag}
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
Expand Down
1 change: 1 addition & 0 deletions tests/buildkite/pipeline-mgpu.yml
Expand Up @@ -17,6 +17,7 @@ steps:
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh jvm_gpu_build"
key: build-containers
agents:
Expand Down
2 changes: 2 additions & 0 deletions tests/buildkite/pipeline.yml
Expand Up @@ -13,7 +13,9 @@ steps:
#### -------- CONTAINER BUILD --------
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh cpu"
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh rmm"
key: build-containers
agents:
Expand Down
9 changes: 9 additions & 0 deletions tests/ci_build/Dockerfile.cpu
Expand Up @@ -26,6 +26,15 @@ ENV CPP=cpp-8
ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

# Install gRPC
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 --shallow-submodules && \
pushd grpc && \
cmake -S . -B build -GNinja && \
cmake --build build --target install && \
popd && \
rm -rf grpc

# Create new Conda environment
COPY conda_env/cpu_test.yml /scripts/
RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml
Expand Down
49 changes: 0 additions & 49 deletions tests/ci_build/Dockerfile.gpu_build

This file was deleted.

9 changes: 9 additions & 0 deletions tests/ci_build/Dockerfile.gpu_build_centos7
Expand Up @@ -35,6 +35,15 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp

ENV GOSU_VERSION 1.10

# Install gRPC
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 && \
pushd grpc && \
cmake -S . -B build -GNinja && \
cmake --build build --target install && \
popd && \
rm -rf grpc

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
Expand Down
4 changes: 2 additions & 2 deletions tests/cpp/plugin/test_federated_adapter.cu
Expand Up @@ -54,7 +54,7 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
DeviceCommunicatorAdapter adapter{0, &comm};
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
int const count = 3;
thrust::device_vector<double> buffer(count, 0);
thrust::sequence(buffer.begin(), buffer.end());
Expand All @@ -76,7 +76,7 @@ TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
DeviceCommunicatorAdapter adapter{0, &comm};

int const count = rank + 2;
thrust::device_vector<char> buffer(count, 0);
Expand Down