Skip to content

Commit

Permalink
Enable distributed GPU training over Rabit (#7930)
Browse files Browse the repository at this point in the history
  • Loading branch information
rongou committed May 30, 2022
1 parent 6275cdc commit 80339c3
Show file tree
Hide file tree
Showing 9 changed files with 452 additions and 123 deletions.
4 changes: 2 additions & 2 deletions doc/build.rst
Expand Up @@ -136,9 +136,9 @@ From the command line on Linux starting from the XGBoost directory:

To speed up compilation, the compute version specific to your GPU could be passed to cmake as, e.g., ``-DGPU_COMPUTE_VER=50``. A quick explanation and numbers for some architectures can be found `in this page <https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/>`_.

.. note:: Enabling distributed GPU training
.. note:: Faster distributed GPU training with NCCL

By default, distributed GPU training is disabled and only a single GPU will be used. To enable distributed GPU training, set the option ``USE_NCCL=ON``. Distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **distributed GPU training is available only for Linux**.
By default, distributed GPU training is enabled and uses Rabit for communication. For faster training, set the option ``USE_NCCL=ON``. Faster distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **faster distributed GPU training is available only for Linux**.

.. code-block:: bash
Expand Down
18 changes: 18 additions & 0 deletions src/common/common.h
Expand Up @@ -274,6 +274,24 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}

/**
* @brief A CRTP (curiously recurring template pattern) helper function.
*
* https://www.fluentcpp.com/2017/05/19/crtp-helper/
*
* Does two things:
* 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
* 2. Avoids having to `static_cast` in a lot of places.
*
* @tparam T The derived class in a CRTP hierarchy.
*/
template <typename T>
struct Crtp {
T &Underlying() { return static_cast<T &>(*this); }
T const &Underlying() const { return static_cast<T const &>(*this); }
};

} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
63 changes: 41 additions & 22 deletions src/common/device_helpers.cu
Expand Up @@ -30,19 +30,15 @@ std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
return ss.str();
}


void AllReducer::Init(int _device_ordinal) {
#ifdef XGBOOST_USE_NCCL
device_ordinal_ = _device_ordinal;
dh::safe_cuda(cudaSetDevice(device_ordinal_));

void NcclAllReducer::DoInit(int _device_ordinal) {
int32_t const rank = rabit::GetRank();
int32_t const world = rabit::GetWorldSize();

std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
GetCudaUUID(world, rank, device_ordinal_, s_this_uuid);
GetCudaUUID(world, rank, _device_ordinal, s_this_uuid);

// No allgather yet.
rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
Expand All @@ -66,20 +62,11 @@ void AllReducer::Init(int _device_ordinal) {
id_ = GetUniqueId();
dh::safe_nccl(ncclCommInitRank(&comm_, rabit::GetWorldSize(), id_, rank));
safe_cuda(cudaStreamCreate(&stream_));
initialised_ = true;
#else
if (rabit::IsDistributed()) {
LOG(FATAL) << "XGBoost is not compiled with NCCL.";
}
#endif // XGBOOST_USE_NCCL
}

void AllReducer::AllGather(void const *data, size_t length_bytes,
std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
void NcclAllReducer::DoAllGather(void const *data, size_t length_bytes,
std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) {
size_t world = rabit::GetWorldSize();
segments->clear();
segments->resize(world, 0);
Expand All @@ -98,11 +85,9 @@ void AllReducer::AllGather(void const *data, size_t length_bytes,
offset += as_bytes;
}
safe_nccl(ncclGroupEnd());
#endif // XGBOOST_USE_NCCL
}

AllReducer::~AllReducer() {
#ifdef XGBOOST_USE_NCCL
NcclAllReducer::~NcclAllReducer() {
if (initialised_) {
dh::safe_cuda(cudaStreamDestroy(stream_));
ncclCommDestroy(comm_);
Expand All @@ -112,7 +97,41 @@ AllReducer::~AllReducer() {
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
}
#endif // XGBOOST_USE_NCCL
}
#else
void RabitAllReducer::DoInit(int _device_ordinal) {
#if !defined(XGBOOST_USE_FEDERATED)
if (rabit::IsDistributed()) {
LOG(CONSOLE) << "XGBoost is not compiled with NCCL, falling back to Rabit.";
}
#endif
}

void RabitAllReducer::DoAllGather(void const *data, size_t length_bytes,
std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) {
size_t world = rabit::GetWorldSize();
segments->clear();
segments->resize(world, 0);
segments->at(rabit::GetRank()) = length_bytes;
rabit::Allreduce<rabit::op::Max>(segments->data(), segments->size());
auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
recvbuf->resize(total_bytes);

sendrecvbuf_.reserve(total_bytes);
auto rank = rabit::GetRank();
size_t offset = 0;
for (int32_t i = 0; i < world; ++i) {
size_t as_bytes = segments->at(i);
if (i == rank) {
safe_cuda(
cudaMemcpy(sendrecvbuf_.data() + offset, data, segments->at(rank), cudaMemcpyDefault));
}
rabit::Broadcast(sendrecvbuf_.data() + offset, as_bytes, i);
offset += as_bytes;
}
safe_cuda(cudaMemcpy(recvbuf->data().get(), sendrecvbuf_.data(), total_bytes, cudaMemcpyDefault));
}
#endif // XGBOOST_USE_NCCL

} // namespace dh

0 comments on commit 80339c3

Please sign in to comment.