Enable distributed GPU training over Rabit #7930

Merged
8 commits merged on May 30, 2022
4 changes: 2 additions & 2 deletions doc/build.rst
@@ -136,9 +136,9 @@ From the command line on Linux starting from the XGBoost directory:
 
 To speed up compilation, the compute version specific to your GPU could be passed to cmake as, e.g., ``-DGPU_COMPUTE_VER=50``. A quick explanation and numbers for some architectures can be found `in this page <https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/>`_.
 
-.. note:: Enabling distributed GPU training
+.. note:: Faster distributed GPU training with NCCL
 
-  By default, distributed GPU training is disabled and only a single GPU will be used. To enable distributed GPU training, set the option ``USE_NCCL=ON``. Distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **distributed GPU training is available only for Linux**.
+  By default, distributed GPU training is enabled and uses Rabit for communication. For faster training, set the option ``USE_NCCL=ON``. Faster distributed GPU training depends on NCCL2, available at `this link <https://developer.nvidia.com/nccl>`_. Since NCCL2 is only available for Linux machines, **faster distributed GPU training is available only for Linux**.
 
 .. code-block:: bash
18 changes: 18 additions & 0 deletions src/common/common.h
@@ -274,6 +274,24 @@ template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
   return indptr[group + 1] - 1;
 }
+
+/**
+ * @brief A CRTP (curiously recurring template pattern) helper function.
+ *
+ * https://www.fluentcpp.com/2017/05/19/crtp-helper/
+ *
+ * Does two things:
+ * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
+ * 2. Avoids having to `static_cast` in a lot of places.
+ *
+ * @tparam T The derived class in a CRTP hierarchy.
+ */
+template <typename T>
+struct Crtp {
+  T &Underlying() { return static_cast<T &>(*this); }
+  T const &Underlying() const { return static_cast<T const &>(*this); }
+};
+
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
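The Crtp helper above is easiest to see with a toy example. Here is a minimal, self-contained sketch of how a base class can dispatch to a derived implementation at compile time, with no virtual calls; the ReducerBase and LoggingReducer names are ours for illustration, not classes from this PR.

#include <iostream>

template <typename T>
struct Crtp {
  T &Underlying() { return static_cast<T &>(*this); }
  T const &Underlying() const { return static_cast<T const &>(*this); }
};

// Hypothetical CRTP base: the public Init() forwards to the derived
// class's DoInit() through Underlying(), so no vtable is involved.
template <typename Derived>
class ReducerBase : public Crtp<Derived> {
 public:
  void Init(int device_ordinal) { this->Underlying().DoInit(device_ordinal); }
};

class LoggingReducer : public ReducerBase<LoggingReducer> {
 public:
  void DoInit(int device_ordinal) {
    std::cout << "init on device " << device_ordinal << "\n";
  }
};

int main() {
  LoggingReducer r;
  r.Init(0);  // prints: init on device 0
}

Without the helper, ReducerBase::Init would need its own static_cast<Derived *>(this) at every call site; inheriting from Crtp<Derived> centralizes that cast and makes the CRTP relationship explicit.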
63 changes: 41 additions & 22 deletions src/common/device_helpers.cu
@@ -30,19 +30,15 @@ std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
   return ss.str();
 }
 
 
-void AllReducer::Init(int _device_ordinal) {
-#ifdef XGBOOST_USE_NCCL
-  device_ordinal_ = _device_ordinal;
-  dh::safe_cuda(cudaSetDevice(device_ordinal_));
-
+void NcclAllReducer::DoInit(int _device_ordinal) {
   int32_t const rank = rabit::GetRank();
   int32_t const world = rabit::GetWorldSize();
 
   std::vector<uint64_t> uuids(world * kUuidLength, 0);
   auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
   auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
-  GetCudaUUID(world, rank, device_ordinal_, s_this_uuid);
+  GetCudaUUID(world, rank, _device_ordinal, s_this_uuid);
 
   // No allgather yet.
   rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
@@ -66,20 +62,11 @@ void AllReducer::Init(int _device_ordinal) {
   id_ = GetUniqueId();
   dh::safe_nccl(ncclCommInitRank(&comm_, rabit::GetWorldSize(), id_, rank));
   safe_cuda(cudaStreamCreate(&stream_));
-  initialised_ = true;
-#else
-  if (rabit::IsDistributed()) {
-    LOG(FATAL) << "XGBoost is not compiled with NCCL.";
-  }
-#endif  // XGBOOST_USE_NCCL
 }
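The "No allgather yet." comment above is doing real work: Rabit has no allgather primitive, so DoInit emulates one. Each rank zero-fills a world-sized buffer, writes only its own slice, and a sum Allreduce then leaves every rank holding every slice. A minimal standalone sketch of that trick, assuming the usual rabit/rabit.h header and an already-initialised Rabit tracker (the function name is ours):

#include <rabit/rabit.h>

#include <cstdint>
#include <vector>

// Gather one value from every worker using only Allreduce, mirroring the
// zero-fill-then-sum trick in NcclAllReducer::DoInit above.
std::vector<uint64_t> GatherOneValuePerRank(uint64_t my_value) {
  auto const rank = rabit::GetRank();
  auto const world = rabit::GetWorldSize();
  std::vector<uint64_t> all(world, 0);  // zeros everywhere ...
  all[rank] = my_value;                 // ... except this rank's own slot
  // Summing buffers that are zero outside each owner's slot is an allgather:
  // every slot receives exactly one non-zero contribution.
  rabit::Allreduce<rabit::op::Sum, uint64_t>(all.data(), all.size());
  return all;
}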

void AllReducer::AllGather(void const *data, size_t length_bytes,
std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised_);
dh::safe_cuda(cudaSetDevice(device_ordinal_));
void NcclAllReducer::DoAllGather(void const *data, size_t length_bytes,
std::vector<size_t> *segments,
dh::caching_device_vector<char> *recvbuf) {
size_t world = rabit::GetWorldSize();
segments->clear();
segments->resize(world, 0);
@@ -98,11 +85,9 @@ void AllReducer::AllGather(void const *data, size_t length_bytes,
     offset += as_bytes;
   }
   safe_nccl(ncclGroupEnd());
-#endif  // XGBOOST_USE_NCCL
 }
 
-AllReducer::~AllReducer() {
-#ifdef XGBOOST_USE_NCCL
+NcclAllReducer::~NcclAllReducer() {
   if (initialised_) {
     dh::safe_cuda(cudaStreamDestroy(stream_));
     ncclCommDestroy(comm_);
@@ -112,7 +97,41 @@ AllReducer::~AllReducer() {
     LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
     LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
   }
-#endif  // XGBOOST_USE_NCCL
 }
+#else
+void RabitAllReducer::DoInit(int _device_ordinal) {
+#if !defined(XGBOOST_USE_FEDERATED)
+  if (rabit::IsDistributed()) {
+    LOG(CONSOLE) << "XGBoost is not compiled with NCCL, falling back to Rabit.";
+  }
+#endif
+}
+
+void RabitAllReducer::DoAllGather(void const *data, size_t length_bytes,
+                                  std::vector<size_t> *segments,
+                                  dh::caching_device_vector<char> *recvbuf) {
+  size_t world = rabit::GetWorldSize();
+  segments->clear();
+  segments->resize(world, 0);
+  segments->at(rabit::GetRank()) = length_bytes;
+  rabit::Allreduce<rabit::op::Max>(segments->data(), segments->size());
+  auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL);
+  recvbuf->resize(total_bytes);
+
+  sendrecvbuf_.reserve(total_bytes);
+  auto rank = rabit::GetRank();
+  size_t offset = 0;
+  for (int32_t i = 0; i < world; ++i) {
+    size_t as_bytes = segments->at(i);
+    if (i == rank) {
+      safe_cuda(
+          cudaMemcpy(sendrecvbuf_.data() + offset, data, segments->at(rank), cudaMemcpyDefault));
+    }
+    rabit::Broadcast(sendrecvbuf_.data() + offset, as_bytes, i);
+    offset += as_bytes;
+  }
+  safe_cuda(cudaMemcpy(recvbuf->data().get(), sendrecvbuf_.data(), total_bytes, cudaMemcpyDefault));
+}
+#endif  // XGBOOST_USE_NCCL
 
 }  // namespace dh
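Taken together, the #ifdef XGBOOST_USE_NCCL / #else split above suggests the concrete reducer is selected at compile time: NcclAllReducer when NCCL is available, RabitAllReducer (each rank broadcasts its slice in turn, one Rabit round per worker) otherwise. The diff does not show how callers name the selected type; a hedged sketch of one way the selection could look, where the AllReducer alias is our assumption rather than code from this PR:

// Hypothetical compile-time selection of the reducer backend. Only the two
// class names and the XGBOOST_USE_NCCL guard appear in the diff above.
#ifdef XGBOOST_USE_NCCL
using AllReducer = NcclAllReducer;   // NCCL-backed fast path (Linux only)
#else
using AllReducer = RabitAllReducer;  // portable fallback over Rabit
#endif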