From 755251c7e04a7e766393ececca9780b909ea8b9c Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Mon, 13 Jun 2022 16:38:48 -0700
Subject: [PATCH 1/5] remove rabit reducer

---
 rabit/include/rabit/internal/rabit-inl.h | 21 -------------
 rabit/include/rabit/rabit.h              | 39 ------------------------
 src/tree/hist/histogram.h                |  5 ++-
 src/tree/updater_histmaker.cc            |  6 ++--
 src/tree/updater_refresh.cc              |  5 ++-
 5 files changed, 6 insertions(+), 70 deletions(-)

diff --git a/rabit/include/rabit/internal/rabit-inl.h b/rabit/include/rabit/internal/rabit-inl.h
index 9289ea8806e6..88cf2128af37 100644
--- a/rabit/include/rabit/internal/rabit-inl.h
+++ b/rabit/include/rabit/internal/rabit-inl.h
@@ -254,22 +254,6 @@ inline void ReducerAlignImpl(const void *src_, void *dst_,
     freduce(pdst[i], psrc[i]);
   }
 }
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline Reducer<DType, freduce>::Reducer() {
-  // it is safe to directly use handle for aligned data types
-  if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
-    this->handle_.Init(ReducerAlignImpl<DType, freduce>, sizeof(DType));
-  } else {
-    this->handle_.Init(ReducerSafeImpl<DType, freduce>, sizeof(DType));
-  }
-}
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg) {
-  handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
-                    prepare_arg);
-}
 // function to perform reduction for SerializeReducer
 template<typename DType>
 inline void SerializeReducerFuncImpl(const void *src_, void *dst_,
@@ -331,11 +315,6 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
   }
 }
 
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               std::function<void()> prepare_fun) {
-  this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun);
-}
 template<typename DType>
 inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
                                                size_t max_nbytes, size_t count,
diff --git a/rabit/include/rabit/rabit.h b/rabit/include/rabit/rabit.h
index 8f10cf3f3b32..1f2b16ea1126 100644
--- a/rabit/include/rabit/rabit.h
+++ b/rabit/include/rabit/rabit.h
@@ -279,45 +279,6 @@ inline int VersionNumber();
 namespace engine {
 class ReduceHandle;
 }  // namespace engine
-/*!
- * \brief template class to make customized reduce and all reduce easy
- *   Do not use reducer directly in the function you call Finalize,
- *   because the destructor can execute after Finalize
- * \tparam DType data type that to be reduced
- * \tparam freduce the customized reduction function
- *   DType must be a struct, with no pointer
- */
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-class Reducer {
- public:
-  Reducer();
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param count number of elements to be reduced
-   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *   will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
-   *   If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        void (*prepare_fun)(void *) = nullptr,
-                        void *prepare_arg = nullptr);
-#if DMLC_USE_CXX11
-  /*!
-   * \brief customized in-place all reduce operation, with lambda function as preprocessor
-   * \param sendrecvbuf pointer to the array of objects to be reduced
-   * \param count number of elements to be reduced
-   * \param prepare_fun lambda function executed to prepare the data, if necessary
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        std::function<void()> prepare_fun);
-#endif  // DMLC_USE_CXX11
-
- private:
-  /*! \brief function handle to do reduce */
-  engine::ReduceHandle handle_;
-};
 /*!
  * \brief template class to make customized reduce,
  *   this class defines complex reducer handles all the data structure that can be
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index 266086af11a4..6b93577a0c6e 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -24,7 +24,6 @@ class HistogramBuilder {
   common::HistCollection hist_local_worker_;
   common::GHistBuilder builder_;
   common::ParallelGHistBuilder buffer_;
-  rabit::Reducer<GradientPairPrecise, GradientPairPrecise::Reduce> reducer_;
   BatchParam param_;
   int32_t n_threads_{-1};
   size_t n_batches_{0};
@@ -199,8 +198,8 @@ class HistogramBuilder {
       }
     });
 
-    reducer_.Allreduce(this->hist_[starting_index].data(),
-                       builder_.GetNumBins() * sync_count);
+    rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<double *>(this->hist_[starting_index].data()),
+                                     builder_.GetNumBins() * sync_count * 2);
     ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
                             nodes_for_subtraction_trick, p_tree);
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index 1c625954922e..4832ffa5e998 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -101,8 +101,6 @@ class HistMaker: public BaseMaker {
   };
   // workspace of thread
   ThreadWSpace wspace_;
-  // reducer for histogram
-  rabit::Reducer<GradStats, GradStats::Reduce> histred_;
   // set of working features
   std::vector<bst_feature_t> selected_features_;
   // update function implementation
@@ -359,8 +357,8 @@ class CQHistMaker : public HistMaker {
       }
     };
     // sync the histogram
-    this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
-                             this->wspace_.hset[0].data.size(), lazy_get_hist);
+    rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(this->wspace_.hset[0].data)->sum_grad,
+                                     this->wspace_.hset[0].data.size() * 2, lazy_get_hist);
   }
 
   void ResetPositionAfterSplit(DMatrix *,
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 72a81c730884..19bfe7d00195 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -100,7 +100,8 @@ class TreeRefresher : public TreeUpdater {
         }
       });
     };
-    reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+    rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(stemp[0])->sum_grad, stemp[0].size() * 2,
+                                     lazy_get_stats);
     // rescale learning rate according to size of trees
     float lr = param_.learning_rate;
     param_.learning_rate = lr / trees.size();
@@ -154,8 +155,6 @@ class TreeRefresher : public TreeUpdater {
   }
   // training parameter
   TrainParam param_;
-  // reducer
-  rabit::Reducer<GradStats, GradStats::Reduce> reducer_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
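The substitution in this patch works only because the reduced types are trivially copyable pairs of doubles, so an array of them can be summed as a flat ``double`` buffer with a plain sum allreduce — hence the casts and the ``* 2`` element counts at each call site. A minimal sketch of the equivalence; ``PairT`` and ``SumAllreduce`` are stand-ins for the real gradient-pair type and ``rabit::Allreduce<rabit::op::Sum>``, not names from the codebase:

```cpp
#include <cstddef>
#include <vector>

// Stand-in for GradStats / GradientPairPrecise: trivially copyable, two doubles.
struct PairT {
  double sum_grad{0.0};
  double sum_hess{0.0};
};
static_assert(sizeof(PairT) == 2 * sizeof(double), "layout must be exactly two doubles");

// Stand-in for rabit::Allreduce<rabit::op::Sum>: element-wise sum of `count`
// doubles; `peer` plays the role of another worker's buffer.
void SumAllreduce(double *sendrecvbuf, std::size_t count, const double *peer) {
  for (std::size_t i = 0; i < count; ++i) sendrecvbuf[i] += peer[i];
}

// Mirrors the patched call sites: reinterpret the pair array as doubles and
// pass twice the element count, since each pair contributes grad and hess.
void SyncStats(std::vector<PairT> *stats, const std::vector<PairT> &peer) {
  SumAllreduce(reinterpret_cast<double *>(stats->data()), stats->size() * 2,
               reinterpret_cast<const double *>(peer.data()));
}
```

Because the old custom reducer also summed each member independently, the flattened sum produces identical results while avoiding the ``ReduceHandle`` machinery that the following patches delete.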
From 39a0147307b0907cc0311ac2b2baab4198780ebd Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 15 Jun 2022 17:43:09 -0700
Subject: [PATCH 2/5] remove histmaker

---
 amalgamation/xgboost-all0.cc             |   1 -
 plugin/federated/engine_federated.cc     |  30 --
 rabit/include/rabit/internal/engine.h    |  45 --
 rabit/include/rabit/internal/rabit-inl.h |  96 ----
 rabit/include/rabit/rabit.h              |  55 --
 rabit/src/engine.cc                      |  22 -
 src/tree/tree_updater.cc                 |   1 -
 src/tree/updater_histmaker.cc            | 627 -----------------------
 8 files changed, 877 deletions(-)
 delete mode 100644 src/tree/updater_histmaker.cc

diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index ef3c2ffde8a7..cf02b07a097e 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -55,7 +55,6 @@
 #include "../src/tree/tree_updater.cc"
 #include "../src/tree/updater_approx.cc"
 #include "../src/tree/updater_colmaker.cc"
-#include "../src/tree/updater_histmaker.cc"
 #include "../src/tree/updater_prune.cc"
 #include "../src/tree/updater_quantile_hist.cc"
 #include "../src/tree/updater_refresh.cc"
diff --git a/plugin/federated/engine_federated.cc b/plugin/federated/engine_federated.cc
index d18e9e095d0d..8fd396d94744 100644
--- a/plugin/federated/engine_federated.cc
+++ b/plugin/federated/engine_federated.cc
@@ -238,35 +238,5 @@ void Allreduce_(void *sendrecvbuf, size_t type_nbytes, size_t count, IEngine::Re
   if (engine.GetWorldSize() == 1) return;
   engine.Allreduce(sendrecvbuf, type_nbytes * count, dtype, op);
 }
-
-ReduceHandle::ReduceHandle() = default;
-ReduceHandle::~ReduceHandle() = default;
-
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) { return static_cast<int>(dtype.type_size); }
-
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc,
-                        __attribute__((unused)) size_t type_nbytes) {
-  utils::Assert(redfunc_ == nullptr, "cannot initialize reduce handle twice");
-  redfunc_ = redfunc;
-}
-
-void ReduceHandle::Allreduce(void *sendrecvbuf, size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun, void *prepare_arg) {
-  utils::Assert(redfunc_ != nullptr, "must initialize handle to call AllReduce");
-  if (prepare_fun != nullptr) prepare_fun(prepare_arg);
-  if (engine.GetWorldSize() == 1) return;
-
-  // Gather all the buffers and call the reduce function locally.
-  auto const buffer_size = type_nbytes * count;
-  auto const gathered = engine.Allgather(sendrecvbuf, buffer_size);
-  auto const *data = gathered.data();
-  for (int i = 0; i < engine.GetWorldSize(); i++) {
-    if (i != engine.GetRank()) {
-      redfunc_(data + buffer_size * i, sendrecvbuf, static_cast<int>(count),
-               MPI::Datatype(type_nbytes));
-    }
-  }
-}
-
 }  // namespace engine
 }  // namespace rabit
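The federated ``ReduceHandle`` deleted above emulated a custom allreduce on top of allgather: every worker receives all buffers, then folds the other ranks' contributions into its own. A condensed sketch of that pattern, with a simplified reduce-function signature and the gathered buffers assumed concatenated in rank order (the names here are illustrative, not from the plugin):

```cpp
#include <cstddef>
#include <vector>

// Simplified reduce callback: fold `count` elements of `src` into `dst`.
using ReduceFn = void (*)(const void *src, void *dst, std::size_t count);

// Allreduce emulated via allgather, as the removed handle did: our own
// contribution already sits in sendrecvbuf, so skip our slot and reduce
// every peer's copy into it.
void AllgatherAllreduce(void *sendrecvbuf, std::size_t buffer_size,
                        std::size_t count, int rank, int world_size,
                        const std::vector<char> &gathered, ReduceFn redfunc) {
  for (int i = 0; i < world_size; ++i) {
    if (i == rank) continue;
    redfunc(gathered.data() + buffer_size * i, sendrecvbuf, count);
  }
}
```

Each call moves roughly ``world_size × buffer_size`` bytes instead of the near-constant per-worker traffic of a tree or ring allreduce, which is one more reason the custom-reducer path was worth retiring.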
diff --git a/rabit/include/rabit/internal/engine.h b/rabit/include/rabit/internal/engine.h
index 50b452f8db1a..84d92143a1d2 100644
--- a/rabit/include/rabit/internal/engine.h
+++ b/rabit/include/rabit/internal/engine.h
@@ -245,51 +245,6 @@ void Allreduce_(void *sendrecvbuf,  // NOLINT
                 mpi::OpType op,
                 IEngine::PreprocFunction prepare_fun = nullptr,
                 void *prepare_arg = nullptr);
-/*!
- * \brief handle for customized reducer, used to handle customized reduce
- *   this class is mainly created for compatiblity issues with MPI's customized reduce
- */
-class ReduceHandle {
- public:
-  // constructor
-  ReduceHandle();
-  // destructor
-  ~ReduceHandle();
-  /*!
-   * \brief initialize the reduce function,
-   *   with the type the reduce function needs to deal with
-   *   the reduce function MUST be communicative
-   */
-  void Init(IEngine::ReduceFunction redfunc, size_t type_nbytes);
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param type_n4bytes size of the type, in terms of 4bytes
-   * \param count number of elements to send
-   * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
-   *   will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
-   *   If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  void Allreduce(void *sendrecvbuf,
-                 size_t type_nbytes,
-                 size_t count,
-                 IEngine::PreprocFunction prepare_fun = nullptr,
-                 void *prepare_arg = nullptr);
-
-  /*! \return the number of bytes occupied by the type */
-  static int TypeSize(const MPI::Datatype &dtype);
-
- protected:
-  // handle function field
-  void *handle_ {nullptr};
-  // reduce function of the reducer
-  IEngine::ReduceFunction *redfunc_{nullptr};
-  // handle to the type field
-  void *htype_{nullptr};
-  // the created type in 4 bytes
-  size_t created_type_nbytes_;
-};
 }  // namespace engine
 }  // namespace rabit
 #endif  // RABIT_INTERNAL_ENGINE_H_
diff --git a/rabit/include/rabit/internal/rabit-inl.h b/rabit/include/rabit/internal/rabit-inl.h
index 88cf2128af37..6ad296f79aab 100644
--- a/rabit/include/rabit/internal/rabit-inl.h
+++ b/rabit/include/rabit/internal/rabit-inl.h
@@ -225,101 +225,5 @@ inline void LazyCheckPoint(const Serializable *global_model) {
 inline int VersionNumber() {
   return engine::GetEngine()->VersionNumber();
 }
-// ---------------------------------
-// Code to handle customized Reduce
-// ---------------------------------
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>
-inline void ReducerSafeImpl(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
-  const size_t kUnit = sizeof(DType);
-  const char *psrc = reinterpret_cast<const char*>(src_);
-  char *pdst = reinterpret_cast<char*>(dst_);
-
-  for (int i = 0; i < len_; ++i) {
-    DType tdst, tsrc;
-    // use memcpy to avoid alignment issue
-    std::memcpy(&tdst, pdst + (i * kUnit), sizeof(tdst));
-    std::memcpy(&tsrc, psrc + (i * kUnit), sizeof(tsrc));
-    freduce(tdst, tsrc);
-    std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
-  }
-}
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void ReducerAlignImpl(const void *src_, void *dst_,
-                             int len_, const MPI::Datatype &dtype) {
-  const DType *psrc = reinterpret_cast<const DType*>(src_);
-  DType *pdst = reinterpret_cast<DType*>(dst_);
-  for (int i = 0; i < len_; ++i) {
-    freduce(pdst[i], psrc[i]);
-  }
-}
-// function to perform reduction for SerializeReducer
-template<typename DType>
-inline void SerializeReducerFuncImpl(const void *src_, void *dst_,
-                                     int len_, const MPI::Datatype &dtype) {
-  int nbytes = engine::ReduceHandle::TypeSize(dtype);
-  // temp space
-  for (int i = 0; i < len_; ++i) {
-    DType tsrc, tdst;
-    utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);  // NOLINT(*)
-    utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);  // NOLINT(*)
-    tsrc.Load(fsrc);
-    tdst.Load(fdst);
-    // govern const check
-    tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
-    fdst.Seek(0);
-    tdst.Save(fdst);
-  }
-}
-template<typename DType>
-inline SerializeReducer<DType>::SerializeReducer() {
-  handle_.Init(SerializeReducerFuncImpl<DType>, sizeof(DType));
-}
-// closure to call Allreduce
-template<typename DType>
-struct SerializeReduceClosure {
-  DType *sendrecvobj;
-  size_t max_nbyte, count;
-  void (*prepare_fun)(void *arg);
-  void *prepare_arg;
-  std::string *p_buffer;
-  // invoke the closure
-  inline void Run() {
-    if (prepare_fun != nullptr) prepare_fun(prepare_arg);
-    for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(*p_buffer) + i * max_nbyte, max_nbyte);
-      sendrecvobj[i].Save(fs);
-    }
-  }
-  inline static void Invoke(void *c) {
-    static_cast<SerializeReduceClosure<DType>*>(c)->Run();
-  }
-};
-template<typename DType>
-inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
-                                               size_t
max_nbyte, size_t count, - void (*prepare_fun)(void *arg), - void *prepare_arg) { - buffer_.resize(max_nbyte * count); - // setup closure - SerializeReduceClosure c; - c.sendrecvobj = sendrecvobj; c.max_nbyte = max_nbyte; c.count = count; - c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_; - // invoke here - handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count, - SerializeReduceClosure::Invoke, &c); - for (size_t i = 0; i < count; ++i) { - utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte); - sendrecvobj[i].Load(fs); - } -} - -template -inline void SerializeReducer::Allreduce(DType *sendrecvobj, - size_t max_nbytes, size_t count, - std::function prepare_fun) { - this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun); -} } // namespace rabit #endif // RABIT_INTERNAL_RABIT_INL_H_ diff --git a/rabit/include/rabit/rabit.h b/rabit/include/rabit/rabit.h index 1f2b16ea1126..3da8ca268dde 100644 --- a/rabit/include/rabit/rabit.h +++ b/rabit/include/rabit/rabit.h @@ -274,61 +274,6 @@ inline void LazyCheckPoint(const Serializable *global_model); * \sa LoadCheckPoint, CheckPoint */ inline int VersionNumber(); -// ----- extensions that allow customized reducer ------ -// helper class to do customized reduce, user do not need to know the type -namespace engine { -class ReduceHandle; -} // namespace engine -/*! - * \brief template class to make customized reduce, - * this class defines complex reducer handles all the data structure that can be - * serialized/deserialized into fixed size buffer - * Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize - * - * \tparam DType data type that to be reduced, DType must contain the following functions: - * \tparam freduce the customized reduction function - * (1) Save(IStream &fs) (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte) - */ -template -class SerializeReducer { - public: - SerializeReducer(); - /*! - * \brief customized in-place all reduce operation - * \param sendrecvobj pointer to the array of objects to be reduced - * \param max_nbyte maximum amount of memory needed to serialize each object - * this includes budget limit for intermediate and final result - * \param count number of elements to be reduced - * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg) - * will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf. - * If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called - * \param prepare_arg argument used to pass into the lazy preprocessing function - */ - inline void Allreduce(DType *sendrecvobj, - size_t max_nbyte, size_t count, - void (*prepare_fun)(void *) = nullptr, - void *prepare_arg = nullptr); -// C++11 support for lambda prepare function -#if DMLC_USE_CXX11 - /*! - * \brief customized in-place all reduce operation, with lambda function as preprocessor - * \param sendrecvobj pointer to the array of objects to be reduced - * \param max_nbyte maximum amount of memory needed to serialize each object - * this includes budget limit for intermediate and final result - * \param count number of elements to be reduced - * \param prepare_fun lambda function executed to prepare the data, if necessary - */ - inline void Allreduce(DType *sendrecvobj, - size_t max_nbyte, size_t count, - std::function prepare_fun); -#endif // DMLC_USE_CXX11 - - private: - /*! 
\brief function handle to do reduce */ - engine::ReduceHandle handle_; - /*! \brief temporal buffer used to do reduce*/ - std::string buffer_; -}; } // namespace rabit // implementation of template functions #include "./internal/rabit-inl.h" diff --git a/rabit/src/engine.cc b/rabit/src/engine.cc index 36e28a1771c1..5e325383c299 100644 --- a/rabit/src/engine.cc +++ b/rabit/src/engine.cc @@ -103,27 +103,5 @@ void Allreduce_(void *sendrecvbuf, // NOLINT GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun, prepare_arg); } - -// code for reduce handle -ReduceHandle::ReduceHandle() = default; -ReduceHandle::~ReduceHandle() = default; - -int ReduceHandle::TypeSize(const MPI::Datatype &dtype) { - return static_cast(dtype.type_size); -} - -void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t ) { - utils::Assert(redfunc_ == nullptr, "cannot initialize reduce handle twice"); - redfunc_ = redfunc; -} - -void ReduceHandle::Allreduce(void *sendrecvbuf, - size_t type_nbytes, size_t count, - IEngine::PreprocFunction prepare_fun, - void *prepare_arg) { - utils::Assert(redfunc_ != nullptr, "must initialize handle to call AllReduce"); - GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, - redfunc_, prepare_fun, prepare_arg); -} } // namespace engine } // namespace rabit diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc index ee5659636305..190a1e02080e 100644 --- a/src/tree/tree_updater.cc +++ b/src/tree/tree_updater.cc @@ -33,7 +33,6 @@ DMLC_REGISTRY_LINK_TAG(updater_colmaker); DMLC_REGISTRY_LINK_TAG(updater_refresh); DMLC_REGISTRY_LINK_TAG(updater_prune); DMLC_REGISTRY_LINK_TAG(updater_quantile_hist); -DMLC_REGISTRY_LINK_TAG(updater_histmaker); DMLC_REGISTRY_LINK_TAG(updater_approx); DMLC_REGISTRY_LINK_TAG(updater_sync); #ifdef XGBOOST_USE_CUDA diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc deleted file mode 100644 index 4832ffa5e998..000000000000 --- a/src/tree/updater_histmaker.cc +++ /dev/null @@ -1,627 +0,0 @@ -/*! - * Copyright 2014-2022 by XGBoost Contributors - * \file updater_histmaker.cc - * \brief use histogram counting to construct a tree - * \author Tianqi Chen - */ -#include -#include -#include - -#include "xgboost/tree_updater.h" -#include "xgboost/base.h" -#include "xgboost/logging.h" - -#include "../common/quantile.h" -#include "../common/group_data.h" -#include "./updater_basemaker-inl.h" -#include "constraints.h" - -namespace xgboost { -namespace tree { - -DMLC_REGISTRY_FILE_TAG(updater_histmaker); - -class HistMaker: public BaseMaker { - public: - explicit HistMaker(GenericParameter const *ctx) : BaseMaker(ctx) {} - void Update(HostDeviceVector *gpair, DMatrix *p_fmat, - common::Span> out_position, - const std::vector &trees) override { - interaction_constraints_.Configure(param_, p_fmat->Info().num_col_); - // rescale learning rate according to size of trees - float lr = param_.learning_rate; - param_.learning_rate = lr / trees.size(); - // build tree - for (auto tree : trees) { - this->UpdateTree(gpair->ConstHostVector(), p_fmat, tree); - } - param_.learning_rate = lr; - } - char const* Name() const override { - return "grow_histmaker"; - } - - protected: - /*! \brief a single column of histogram cuts */ - struct HistUnit { - /*! \brief cutting point of histogram, contains maximum point */ - const float *cut; - /*! \brief content of statistics data */ - GradStats *data; - /*! 
\brief size of histogram */ - uint32_t size; - // default constructor - HistUnit() = default; - // constructor - HistUnit(const float *cut, GradStats *data, uint32_t size) - : cut{cut}, data{data}, size{size} {} - /*! \brief add a histogram to data */ - }; - /*! \brief a set of histograms from different index */ - struct HistSet { - /*! \brief the index pointer of each histunit */ - const uint32_t *rptr; - /*! \brief cutting points in each histunit */ - const bst_float *cut; - /*! \brief data in different hist unit */ - std::vector data; - /*! \brief return a column of histogram cuts */ - inline HistUnit operator[](size_t fid) { - return {cut + rptr[fid], &data[0] + rptr[fid], rptr[fid+1] - rptr[fid]}; - } - }; - // thread workspace - struct ThreadWSpace { - /*! \brief actual unit pointer */ - std::vector rptr; - /*! \brief cut field */ - std::vector cut; - // per thread histset - std::vector hset; - // initialize the hist set - inline void Configure(int nthread) { - hset.resize(nthread); - // cleanup statistics - for (int tid = 0; tid < nthread; ++tid) { - for (auto& d : hset[tid].data) { d = GradStats(); } - hset[tid].rptr = dmlc::BeginPtr(rptr); - hset[tid].cut = dmlc::BeginPtr(cut); - hset[tid].data.resize(cut.size(), GradStats()); - } - } - /*! \brief clear the workspace */ - inline void Clear() { - cut.clear(); rptr.resize(1); rptr[0] = 0; - } - /*! \brief total size */ - inline size_t Size() const { - return rptr.size() - 1; - } - }; - // workspace of thread - ThreadWSpace wspace_; - // set of working features - std::vector selected_features_; - // update function implementation - virtual void UpdateTree(const std::vector &gpair, - DMatrix *p_fmat, - RegTree *p_tree) { - CHECK(param_.max_depth > 0) << "max_depth must be larger than 0"; - this->InitData(gpair, *p_fmat, *p_tree); - this->InitWorkSet(p_fmat, *p_tree, &selected_features_); - // mark root node as fresh. 
- (*p_tree)[0].SetLeaf(0.0f, 0); - - for (int depth = 0; depth < param_.max_depth; ++depth) { - // reset and propose candidate split - this->ResetPosAndPropose(gpair, p_fmat, selected_features_, *p_tree); - // create histogram - this->CreateHist(gpair, p_fmat, selected_features_, *p_tree); - // find split based on histogram statistics - this->FindSplit(selected_features_, p_tree); - // reset position after split - this->ResetPositionAfterSplit(p_fmat, *p_tree); - this->UpdateQueueExpand(*p_tree); - // if nothing left to be expand, break - if (qexpand_.size() == 0) break; - } - for (int const nid : qexpand_) { - (*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate); - } - } - // this function does two jobs - // (1) reset the position in array position, to be the latest leaf id - // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly - virtual void ResetPosAndPropose(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) = 0; - // initialize the current working set of features in this round - virtual void InitWorkSet(DMatrix *, - const RegTree &tree, - std::vector *p_fset) { - p_fset->resize(tree.param.num_feature); - for (size_t i = 0; i < p_fset->size(); ++i) { - (*p_fset)[i] = static_cast(i); - } - } - // reset position after split, this is not a must, depending on implementation - virtual void ResetPositionAfterSplit(DMatrix *p_fmat, - const RegTree &tree) { - } - virtual void CreateHist(const std::vector &gpair, - DMatrix *, - const std::vector &fset, - const RegTree &) = 0; - - private: - void EnumerateSplit(const HistUnit &hist, - const GradStats &node_sum, - bst_uint fid, - SplitEntry *best, - GradStats *left_sum) const { - if (hist.size == 0) return; - - double root_gain = CalcGain(param_, node_sum.GetGrad(), node_sum.GetHess()); - GradStats s, c; - for (bst_uint i = 0; i < hist.size; ++i) { - s.Add(hist.data[i]); - if (s.sum_hess >= param_.min_child_weight) { - c.SetSubstract(node_sum, s); - if (c.sum_hess >= param_.min_child_weight) { - double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) + - CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain; - if (best->Update(static_cast(loss_chg), fid, hist.cut[i], - false, false, s, c)) { - *left_sum = s; - } - } - } - } - s = GradStats(); - for (bst_uint i = hist.size - 1; i != 0; --i) { - s.Add(hist.data[i]); - if (s.sum_hess >= param_.min_child_weight) { - c.SetSubstract(node_sum, s); - if (c.sum_hess >= param_.min_child_weight) { - double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) + - CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain; - if (best->Update(static_cast(loss_chg), fid, - hist.cut[i - 1], true, false, c, s)) { - *left_sum = c; - } - } - } - } - } - - void FindSplit(const std::vector &feature_set, - RegTree *p_tree) { - const size_t num_feature = feature_set.size(); - // get the best split condition for each node - std::vector sol(qexpand_.size()); - std::vector left_sum(qexpand_.size()); - auto nexpand = qexpand_.size(); - common::ParallelFor(nexpand, ctx_->Threads(), common::Sched::Dyn(1), [&](auto wid) { - const int nid = qexpand_[wid]; - CHECK_EQ(node2workindex_[nid], static_cast(wid)); - SplitEntry &best = sol[wid]; - GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0]; - for (size_t i = 0; i < feature_set.size(); ++i) { - // Query is thread safe as it's a const function. 
- if (!this->interaction_constraints_.Query(nid, feature_set[i])) { - continue; - } - - EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature + 1)], node_sum, feature_set[i], - &best, &left_sum[wid]); - } - }); - // get the best result, we can synchronize the solution - for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { - const bst_node_t nid = qexpand_[wid]; - SplitEntry const& best = sol[wid]; - const GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0]; - this->SetStats(p_tree, nid, node_sum); - // set up the values - p_tree->Stat(nid).loss_chg = best.loss_chg; - // now we know the solution in snode[nid], set split - if (best.loss_chg > kRtEps) { - bst_float base_weight = CalcWeight(param_, node_sum); - bst_float left_leaf_weight = - CalcWeight(param_, best.left_sum.sum_grad, best.left_sum.sum_hess) * - param_.learning_rate; - bst_float right_leaf_weight = - CalcWeight(param_, best.right_sum.sum_grad, - best.right_sum.sum_hess) * - param_.learning_rate; - p_tree->ExpandNode(nid, best.SplitIndex(), best.split_value, - best.DefaultLeft(), base_weight, left_leaf_weight, - right_leaf_weight, best.loss_chg, - node_sum.sum_hess, - best.left_sum.GetHess(), best.right_sum.GetHess()); - GradStats right_sum; - right_sum.SetSubstract(node_sum, left_sum[wid]); - auto left_child = (*p_tree)[nid].LeftChild(); - auto right_child = (*p_tree)[nid].RightChild(); - this->SetStats(p_tree, left_child, left_sum[wid]); - this->SetStats(p_tree, right_child, right_sum); - this->interaction_constraints_.Split(nid, best.SplitIndex(), left_child, right_child); - } else { - (*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate); - } - } - } - - inline void SetStats(RegTree *p_tree, int nid, const GradStats &node_sum) { - p_tree->Stat(nid).base_weight = - static_cast(CalcWeight(param_, node_sum)); - p_tree->Stat(nid).sum_hess = static_cast(node_sum.sum_hess); - } -}; - -class CQHistMaker : public HistMaker { - public: - explicit CQHistMaker(GenericParameter const *ctx) : HistMaker(ctx) {} - char const *Name() const override { return "grow_local_histmaker"; } - - protected: - struct HistEntry { - HistMaker::HistUnit hist; - unsigned istart; - /*! - * \brief add a histogram to data, - * do linear scan, start from istart - */ - inline void Add(bst_float fv, - const std::vector &gpair, - const bst_uint ridx) { - while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; - CHECK_NE(istart, hist.size); - hist.data[istart].Add(gpair[ridx]); - } - /*! 
- * \brief add a histogram to data, - * do linear scan, start from istart - */ - inline void Add(bst_float fv, - GradientPair gstats) { - if (fv < hist.cut[istart]) { - hist.data[istart].Add(gstats); - } else { - while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; - if (istart != hist.size) { - hist.data[istart].Add(gstats); - } else { - LOG(INFO) << "fv=" << fv << ", hist.size=" << hist.size; - for (size_t i = 0; i < hist.size; ++i) { - LOG(INFO) << "hist[" << i << "]=" << hist.cut[i]; - } - LOG(FATAL) << "fv=" << fv << ", hist.last=" << hist.cut[hist.size - 1]; - } - } - } - }; - // sketch type used for this - using WXQSketch = common::WXQuantileSketch; - // initialize the work set of tree - void InitWorkSet(DMatrix *p_fmat, - const RegTree &tree, - std::vector *p_fset) override { - if (p_fmat != cache_dmatrix_) { - feat_helper_.InitByCol(p_fmat, tree); - cache_dmatrix_ = p_fmat; - } - feat_helper_.SyncInfo(); - feat_helper_.SampleCol(this->param_.colsample_bytree, p_fset); - } - // code to create histogram - void CreateHist(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) override { - const MetaInfo &info = p_fmat->Info(); - // fill in reverse map - feat2workindex_.resize(tree.param.num_feature); - std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1); - for (size_t i = 0; i < fset.size(); ++i) { - feat2workindex_[fset[i]] = static_cast(i); - } - // start to work - this->wspace_.Configure(1); - // if it is C++11, use lazy evaluation for Allreduce, - // to gain speedup in recovery - auto lazy_get_hist = [&]() { - thread_hist_.resize(ctx_->Threads()); - // start accumulating statistics - for (const auto &batch : p_fmat->GetBatches()) { - auto page = batch.GetView(); - // start enumeration - common::ParallelFor(fset.size(), ctx_->Threads(), common::Sched::Dyn(1), [&](auto i) { - int fid = fset[i]; - int offset = feat2workindex_[fid]; - if (offset >= 0) { - this->UpdateHistCol(gpair, page[fid], info, tree, fset, offset, - &thread_hist_[omp_get_thread_num()]); - } - }); - } - // update node statistics. 
- this->GetNodeStats(gpair, *p_fmat, tree, - &thread_stats_, &node_stats_); - for (int const nid : this->qexpand_) { - const int wid = this->node2workindex_[nid]; - this->wspace_.hset[0][fset.size() + wid * (fset.size() + 1)] - .data[0] = node_stats_[nid]; - } - }; - // sync the histogram - rabit::Allreduce(&dmlc::BeginPtr(this->wspace_.hset[0].data)->sum_grad, - this->wspace_.hset[0].data.size() * 2, lazy_get_hist); - } - - void ResetPositionAfterSplit(DMatrix *, - const RegTree &tree) override { - this->GetSplitSet(this->qexpand_, tree, &fsplit_set_); - } - void ResetPosAndPropose(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) override { - const MetaInfo &info = p_fmat->Info(); - // fill in reverse map - feat2workindex_.resize(tree.param.num_feature); - std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1); - work_set_.clear(); - for (auto fidx : fset) { - if (feat_helper_.Type(fidx) == 2) { - feat2workindex_[fidx] = static_cast(work_set_.size()); - work_set_.push_back(fidx); - } else { - feat2workindex_[fidx] = -2; - } - } - const size_t work_set_size = work_set_.size(); - - sketchs_.resize(this->qexpand_.size() * work_set_size); - for (auto& sketch : sketchs_) { - sketch.Init(info.num_row_, this->param_.sketch_eps); - } - // initialize the summary array - summary_array_.resize(sketchs_.size()); - // setup maximum size - unsigned max_size = this->param_.MaxSketchSize(); - for (size_t i = 0; i < sketchs_.size(); ++i) { - summary_array_[i].Reserve(max_size); - } - { - // get summary - thread_sketch_.resize(ctx_->Threads()); - - // TWOPASS: use the real set + split set in the column iteration. - this->SetDefaultPostion(p_fmat, tree); - work_set_.insert(work_set_.end(), fsplit_set_.begin(), fsplit_set_.end()); - std::sort(work_set_.begin(), work_set_.end()); - work_set_.resize(std::unique(work_set_.begin(), work_set_.end()) - work_set_.begin()); - - // start accumulating statistics - for (const auto &batch : p_fmat->GetBatches()) { - // TWOPASS: use the real set + split set in the column iteration. 
- this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree); - auto page = batch.GetView(); - // start enumeration - common::ParallelFor(work_set_.size(), ctx_->Threads(), common::Sched::Dyn(1), - [&](auto i) { - int fid = work_set_[i]; - int offset = feat2workindex_[fid]; - if (offset >= 0) { - this->UpdateSketchCol(gpair, page[fid], tree, work_set_size, offset, - &thread_sketch_[omp_get_thread_num()]); - } - }); - } - for (size_t i = 0; i < sketchs_.size(); ++i) { - common::WXQuantileSketch::SummaryContainer out; - sketchs_[i].GetSummary(&out); - summary_array_[i].SetPrune(out, max_size); - } - CHECK_EQ(summary_array_.size(), sketchs_.size()); - } - if (summary_array_.size() != 0) { - size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); - sreducer_.Allreduce(dmlc::BeginPtr(summary_array_), nbytes, summary_array_.size()); - } - // now we get the final result of sketch, setup the cut - this->wspace_.cut.clear(); - this->wspace_.rptr.clear(); - this->wspace_.rptr.push_back(0); - for (size_t wid = 0; wid < this->qexpand_.size(); ++wid) { - for (unsigned int i : fset) { - int offset = feat2workindex_[i]; - if (offset >= 0) { - const WXQSketch::Summary &a = summary_array_[wid * work_set_size + offset]; - for (size_t i = 1; i < a.size; ++i) { - bst_float cpt = a.data[i].value - kRtEps; - if (i == 1 || cpt > this->wspace_.cut.back()) { - this->wspace_.cut.push_back(cpt); - } - } - // push a value that is greater than anything - if (a.size != 0) { - bst_float cpt = a.data[a.size - 1].value; - // this must be bigger than last value in a scale - bst_float last = cpt + fabs(cpt) + kRtEps; - this->wspace_.cut.push_back(last); - } - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } else { - CHECK_EQ(offset, -2); - bst_float cpt = feat_helper_.MaxValue(i); - this->wspace_.cut.push_back(cpt + fabs(cpt) + kRtEps); - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } - } - // reserve last value for global statistics - this->wspace_.cut.push_back(0.0f); - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } - CHECK_EQ(this->wspace_.rptr.size(), - (fset.size() + 1) * this->qexpand_.size() + 1); - } - - inline void UpdateHistCol(const std::vector &gpair, - const SparsePage::Inst &col, - const MetaInfo &info, - const RegTree &tree, - const std::vector &fset, - bst_uint fid_offset, - std::vector *p_temp) { - if (col.size() == 0) return; - // initialize sbuilder for use - std::vector &hbuilder = *p_temp; - hbuilder.resize(tree.param.num_nodes); - for (int const nid : this->qexpand_) { - const unsigned wid = this->node2workindex_[nid]; - hbuilder[nid].istart = 0; - hbuilder[nid].hist = this->wspace_.hset[0][fid_offset + wid * (fset.size()+1)]; - } - if (this->param_.cache_opt != 0) { - constexpr bst_uint kBuffer = 32; - bst_uint align_length = col.size() / kBuffer * kBuffer; - int buf_position[kBuffer]; - GradientPair buf_gpair[kBuffer]; - for (bst_uint j = 0; j < align_length; j += kBuffer) { - for (bst_uint i = 0; i < kBuffer; ++i) { - bst_uint ridx = col[j + i].index; - buf_position[i] = this->position_[ridx]; - buf_gpair[i] = gpair[ridx]; - } - for (bst_uint i = 0; i < kBuffer; ++i) { - const int nid = buf_position[i]; - if (nid >= 0) { - hbuilder[nid].Add(col[j + i].fvalue, buf_gpair[i]); - } - } - } - for (bst_uint j = align_length; j < col.size(); ++j) { - const bst_uint ridx = col[j].index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(col[j].fvalue, gpair[ridx]); - } - } - } else { - 
for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(c.fvalue, gpair, ridx); - } - } - } - } - inline void UpdateSketchCol(const std::vector &gpair, - const SparsePage::Inst &col, - const RegTree &tree, - size_t work_set_size, - bst_uint offset, - std::vector *p_temp) { - if (col.size() == 0) return; - // initialize sbuilder for use - std::vector &sbuilder = *p_temp; - sbuilder.resize(tree.param.num_nodes); - for (int const nid : this->qexpand_) { - const unsigned wid = this->node2workindex_[nid]; - sbuilder[nid].sum_total = 0.0f; - sbuilder[nid].sketch = &sketchs_[wid * work_set_size + offset]; - } - // first pass, get sum of weight, TODO, optimization to skip first pass - for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].sum_total += gpair[ridx].GetHess(); - } - } - // if only one value, no need to do second pass - if (col[0].fvalue == col[col.size()-1].fvalue) { - for (int const nid : this->qexpand_) { - sbuilder[nid].sketch->Push( - col[0].fvalue, static_cast(sbuilder[nid].sum_total)); - } - return; - } - // two pass scan - unsigned max_size = this->param_.MaxSketchSize(); - for (int const nid : this->qexpand_) { - sbuilder[nid].Init(max_size); - } - // second pass, build the sketch - if (this->param_.cache_opt != 0) { - constexpr bst_uint kBuffer = 32; - bst_uint align_length = col.size() / kBuffer * kBuffer; - int buf_position[kBuffer]; - bst_float buf_hess[kBuffer]; - for (bst_uint j = 0; j < align_length; j += kBuffer) { - for (bst_uint i = 0; i < kBuffer; ++i) { - bst_uint ridx = col[j + i].index; - buf_position[i] = this->position_[ridx]; - buf_hess[i] = gpair[ridx].GetHess(); - } - for (bst_uint i = 0; i < kBuffer; ++i) { - const int nid = buf_position[i]; - if (nid >= 0) { - sbuilder[nid].Push(col[j + i].fvalue, buf_hess[i], max_size); - } - } - } - for (bst_uint j = align_length; j < col.size(); ++j) { - const bst_uint ridx = col[j].index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(col[j].fvalue, gpair[ridx].GetHess(), max_size); - } - } - } else { - for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(c.fvalue, gpair[ridx].GetHess(), max_size); - } - } - } - for (int const nid : this->qexpand_) { sbuilder[nid].Finalize(max_size); } - } - // cached dmatrix where we initialized the feature on. - const DMatrix* cache_dmatrix_{nullptr}; - // feature helper - BaseMaker::FMetaHelper feat_helper_; - // temp space to map feature id to working index - std::vector feat2workindex_; - // set of index from fset that are current work set - std::vector work_set_; - // set of index from that are split candidates. 
- std::vector fsplit_set_; - // thread temp data - std::vector > thread_sketch_; - // used to hold statistics - std::vector > thread_stats_; - // used to hold start pointer - std::vector > thread_hist_; - // node statistics - std::vector node_stats_; - // summary array - std::vector summary_array_; - // reducer for summary - rabit::SerializeReducer sreducer_; - // per node, per feature sketch - std::vector > sketchs_; -}; - -XGBOOST_REGISTER_TREE_UPDATER(LocalHistMaker, "grow_local_histmaker") - .describe("Tree constructor that uses approximate histogram construction.") - .set_body([](GenericParameter const *ctx, ObjInfo) { return new CQHistMaker(ctx); }); -} // namespace tree -} // namespace xgboost From 594a963624b60365fcaf0c623f19a124d9783c15 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 16 Jun 2022 12:20:51 -0700 Subject: [PATCH 3/5] fix mpi engine --- rabit/src/engine_mpi.cc | 87 ----------------------------------------- 1 file changed, 87 deletions(-) diff --git a/rabit/src/engine_mpi.cc b/rabit/src/engine_mpi.cc index c5811cb76a6c..d5ed2f454993 100644 --- a/rabit/src/engine_mpi.cc +++ b/rabit/src/engine_mpi.cc @@ -158,92 +158,5 @@ void Allreduce_(void *sendrecvbuf, MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, GetType(dtype), GetOp(op)); } - -// code for reduce handle -ReduceHandle::ReduceHandle(void) - : handle_(NULL), redfunc_(NULL), htype_(NULL) { -} -ReduceHandle::~ReduceHandle(void) { - /* !WARNING! - - A handle can be held by a tree method/Learner from xgboost. The booster might not be - freed until program exit, while (good) users call rabit.finalize() before reaching - the end of program. So op->Free() might be called after finalization and results - into following error: - - ``` - Attempting to use an MPI routine after finalizing MPICH - ``` - - Here we skip calling Free if MPI has already been finalized to workaround the issue. - It can be a potential leak of memory. The best way to resolve it is to eliminate all - use of long living handle. 
-   */
-  int finalized = 0;
-  CHECK_EQ(MPI_Finalized(&finalized), MPI_SUCCESS);
-  if (handle_ != NULL) {
-    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-    if (!finalized) {
-      op->Free();
-    }
-    delete op;
-  }
-  if (htype_ != NULL) {
-    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-    if (!finalized) {
-      dtype->Free();
-    }
-    delete dtype;
-  }
-}
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return dtype.Get_size();
-}
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
-  utils::Assert(handle_ == NULL, "cannot initialize reduce handle twice");
-  if (type_nbytes != 0) {
-    MPI::Datatype *dtype = new MPI::Datatype();
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-    htype_ = dtype;
-  }
-  MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = redfunc;
-  op->Init(pf, true);
-  handle_ = op;
-}
-void ReduceHandle::Allreduce(void *sendrecvbuf,
-                             size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg) {
-  utils::Assert(handle_ != NULL, "must initialize handle to call AllReduce");
-  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-  if (created_type_nbytes_ != type_nbytes || dtype == NULL) {
-    if (dtype == NULL) {
-      dtype = new MPI::Datatype();
-    } else {
-      dtype->Free();
-    }
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-  }
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
-}
 }  // namespace engine
 }  // namespace rabit
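The handle deleted above wrapped the reduced struct in a contiguous MPI datatype sized by ``type_nbytes``, preferring the widest base type that divides the size evenly. The same idea restated in the plain MPI C API — a sketch assuming an initialized MPI environment, not code from this repository:

```cpp
#include <cstddef>
#include <mpi.h>

// Build and commit a contiguous datatype covering one type_nbytes-sized
// struct, mirroring the branching of the removed ReduceHandle::Init.
MPI_Datatype MakeContiguousType(std::size_t type_nbytes) {
  MPI_Datatype dtype;
  if (type_nbytes % 8 == 0) {
    MPI_Type_contiguous(static_cast<int>(type_nbytes / sizeof(long)), MPI_LONG, &dtype);
  } else if (type_nbytes % 4 == 0) {
    MPI_Type_contiguous(static_cast<int>(type_nbytes / sizeof(int)), MPI_INT, &dtype);
  } else {
    MPI_Type_contiguous(static_cast<int>(type_nbytes), MPI_CHAR, &dtype);
  }
  MPI_Type_commit(&dtype);
  return dtype;
}
```

Committing the type once and caching it (as ``created_type_nbytes_`` did) avoids rebuilding it on every ``Allreduce`` call that uses the same element size.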
From e4784a4364aaa806ac4c86853a54d2a7de5ecc3e Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 17 Jun 2022 11:02:16 -0700
Subject: [PATCH 4/5] remove sketch_eps and update docs

---
 doc/parameter.rst                          | 10 ---
 doc/treemethod.rst                         | 62 ++++++++---------
 .../scala/spark/params/BoosterParams.scala | 14 -----
 src/tree/param.h                           | 12 ----
 4 files changed, 33 insertions(+), 65 deletions(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 3a35666338dd..71ada71f7612 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -151,15 +151,6 @@ Parameters for Tree Booster
   - ``hist``: Faster histogram optimized approximate greedy algorithm.
   - ``gpu_hist``: GPU implementation of ``hist`` algorithm.
 
-* ``sketch_eps`` [default=0.03]
-
-  - Only used for ``updater=grow_local_histmaker``.
-  - This roughly translates into ``O(1 / sketch_eps)`` number of bins.
-    Compared to directly select number of bins, this comes with theoretical guarantee with sketch accuracy.
-  - Usually user does not have to tune this.
-    But consider setting to a lower number for more accurate enumeration of split candidates.
-  - range: (0, 1)
-
 * ``scale_pos_weight`` [default=1]
 
   - Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R `_, `py1 `_, `py2 `_, `py3 `_.
 
@@ -170,7 +161,6 @@ Parameters for Tree Booster
 
   - ``grow_colmaker``: non-distributed column-based construction of trees.
   - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
-  - ``grow_local_histmaker``: based on local histogram counting.
   - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
   - ``grow_gpu_hist``: Grow tree with GPU.
   - ``sync``: synchronizes trees in all distributed nodes.
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index 91f546c36a9c..254eafb281db 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -5,7 +5,7 @@ Tree Methods
 For training boosted tree models, there are 2 parameters used for choosing algorithms,
 namely ``updater`` and ``tree_method``.  XGBoost has 4 builtin tree methods, namely
 ``exact``, ``approx``, ``hist`` and ``gpu_hist``.  Along with these tree methods, there
-are also some free standing updaters including ``grow_local_histmaker``, ``refresh``,
+are also some free standing updaters including ``refresh``,
 ``prune`` and ``sync``.  The parameter ``updater`` is more primitive than ``tree_method``
 as the latter is just a pre-configuration of the former.  The difference is mostly due to
 historical reasons that each updater requires some specific configurations and might has
@@ -37,27 +37,18 @@ approximated training algorithms.  These algorithms build a gradient histogram f
 node and iterate through the histogram instead of real dataset.  Here we introduce the
 implementations in XGBoost below.
 
-1. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
-   paper <http://arxiv.org/abs/1603.02754>`_. This updater is rarely used in practice so
-   it's still an updater rather than tree method. During split finding, it first runs a
-   weighted GK sketching for data points belong to current node to find split candidates,
-   using hessian as weights. The histogram is built upon this per-node sketch. It's
-   faster than ``exact`` in some applications, but still slow in computation.
-
-2. ``approx`` tree method: An approximation tree method described in `reference paper
-   <http://arxiv.org/abs/1603.02754>`_. Different from ``grow_local_histmaker``, it runs
-   sketching before building each tree using all the rows (rows belonging to the root)
-   instead of per-node dataset. Similar to ``grow_local_histmaker`` updater, hessian is
-   used as weights during sketch. The algorithm can be accessed by setting
-   ``tree_method`` to ``approx``.
+1. ``approx`` tree method: An approximation tree method described in the `reference paper
+   <http://arxiv.org/abs/1603.02754>`_. It runs sketching before building each tree
+   using all the rows (rows belonging to the root). Hessian is used as weights during
+   sketching. The algorithm can be accessed by setting ``tree_method`` to ``approx``.
 
-3. ``hist`` tree method: An approximation tree method used in LightGBM with slight
+2. ``hist`` tree method: An approximation tree method used in LightGBM with slight
    differences in implementation.  It runs sketching before training using only user
   provided weights instead of hessian.  The subsequent per-node histogram is built upon
   this global sketch.  This is the fastest algorithm as it runs sketching only once.  The
   algorithm can be accessed by setting ``tree_method`` to ``hist``.
 
-4. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
+3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
    ``hist``, with additional support for gradient based sampling.  The algorithm can be
    accessed by setting ``tree_method`` to ``gpu_hist``.
 
@@ -102,19 +93,32 @@ Other Updaters
 Removed Updaters
 ****************
 
-2 Updaters were removed during development due to maintainability. We describe them here
-solely for the interest of documentation. First one is distributed colmaker, which was a
-distributed version of exact tree method. It required specialization for column based
-splitting strategy and a different prediction procedure. As the exact tree method is slow
-by itself and scaling is even less efficient, we removed it entirely. Second one is
-``skmaker``. Per-node weighted sketching employed by ``grow_local_histmaker`` is slow,
-the ``skmaker`` was unmaintained and seems to be a workaround trying to eliminate the
-histogram creation step and uses sketching values directly during split evaluation. It
-was never tested and contained some unknown bugs, we decided to remove it and focus our
-resources on more promising algorithms instead. For accuracy, most of the time
-``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so removing
-them don't have any real practical impact.
-
+Three updaters were removed during development due to maintainability concerns. We
+describe them here solely for the interest of documentation.
+
+1. Distributed colmaker, which was a distributed version of the exact tree method. It
+   required specialization for the column-based splitting strategy and a different
+   prediction procedure. As the exact tree method is slow by itself and scaling is even
+   less efficient, we removed it entirely.
+
+2. ``skmaker``. The per-node weighted sketching employed by ``grow_local_histmaker`` is
+   slow; ``skmaker`` was unmaintained and appears to have been a workaround that tried to
+   eliminate the histogram creation step and use sketch values directly during split
+   evaluation. It was never tested and contained some unknown bugs, so we decided to
+   remove it and focus our resources on more promising algorithms instead. For accuracy,
+   ``approx``, ``hist`` and ``gpu_hist`` are usually sufficient with some parameter
+   tuning, so removing these updaters has no real practical impact.
+
+3. ``grow_local_histmaker`` updater: An approximation tree method described in the
+   `reference paper <http://arxiv.org/abs/1603.02754>`_. This updater was rarely used in
+   practice, so it remained an updater rather than a tree method. During split finding,
+   it first ran a weighted GK sketch over the data points belonging to the current node
+   to find split candidates, using the hessian as weights. The histogram was built upon
+   this per-node sketch. It was faster than ``exact`` in some applications, but still
+   slow in computation. It was removed because it depended on Rabit's customized
+   reduction function, which handles any data structure that can be serialized and
+   deserialized into a fixed-size buffer; that is not directly supported by NCCL or
+   federated learning gRPC, making it hard to refactor into a common allreduce interface.
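To make the "hessian as weights" sketching mentioned in the documentation changes above concrete, here is a minimal, non-streaming illustration that places split candidates at evenly spaced quantiles of the hessian-weighted CDF. The real implementation uses mergeable, prunable GK summaries; this sketch only shows the weighting idea, and the function name is illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Pick up to max_bins split candidates from (feature value, hessian) pairs by
// walking the hessian-weighted CDF of the sorted feature values.
std::vector<float> WeightedCuts(std::vector<std::pair<float, float>> fv_hess,
                                std::size_t max_bins) {
  std::vector<float> cuts;
  if (fv_hess.empty() || max_bins == 0) return cuts;
  std::sort(fv_hess.begin(), fv_hess.end());
  double total = 0.0;
  for (auto const &p : fv_hess) total += p.second;
  double const step = total / static_cast<double>(max_bins);
  double acc = 0.0, next = step;
  for (auto const &p : fv_hess) {
    acc += p.second;  // advance the weighted CDF by this row's hessian
    if (acc >= next && (cuts.empty() || p.first > cuts.back())) {
      cuts.push_back(p.first);
      next += step;
    }
  }
  return cuts;
}
```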
************** Feature Matrix diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala index e4223e953812..21a77341c7e5 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala @@ -182,20 +182,6 @@ private[spark] trait BoosterParams extends Params { final def getSinglePrecisionHistogram: Boolean = $(singlePrecisionHistogram) - /** - * This is only used for approximate greedy algorithm. - * This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select - * number of bins, this comes with theoretical guarantee with sketch accuracy. - * [default=0.03] range: (0, 1) - */ - final val sketchEps = new DoubleParam(this, "sketchEps", - "This is only used for approximate greedy algorithm. This roughly translated into" + - " O(1 / sketch_eps) number of bins. Compared to directly select number of bins, this comes" + - " with theoretical guarantee with sketch accuracy.", - (value: Double) => value < 1 && value > 0) - - final def getSketchEps: Double = $(sketchEps) - /** * Control the balance of positive and negative weights, useful for unbalanced classes. A typical * value to consider: sum(negative cases) / sum(positive cases). [default=1] diff --git a/src/tree/param.h b/src/tree/param.h index 5c6c2e11375d..c94437732299 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -65,8 +65,6 @@ struct TrainParam : public XGBoostParameter { // whether to subsample columns during tree construction float colsample_bytree; // accuracy of sketch - float sketch_eps; - // accuracy of sketch float sketch_ratio; // option to open cacheline optimization bool cache_opt; @@ -162,10 +160,6 @@ struct TrainParam : public XGBoostParameter { .set_range(0.0f, 1.0f) .set_default(1.0f) .describe("Subsample ratio of columns, resample on each tree construction."); - DMLC_DECLARE_FIELD(sketch_eps) - .set_range(0.0f, 1.0f) - .set_default(0.03f) - .describe("EXP Param: Sketch accuracy of approximate algorithm."); DMLC_DECLARE_FIELD(sketch_ratio) .set_lower_bound(0.0f) .set_default(2.0f) @@ -203,12 +197,6 @@ struct TrainParam : public XGBoostParameter { return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth); } - /*! 
\brief maximum sketch size */
-  inline unsigned MaxSketchSize() const {
-    auto ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
-    CHECK_GT(ret, 0U);
-    return ret;
-  }
 
   bst_node_t MaxNodes() const {
     if (this->max_depth == 0 && this->max_leaves == 0) {

From 6248c4025734e018742b7fcbc8f65ebacd37883f Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 17 Jun 2022 11:17:33 -0700
Subject: [PATCH 5/5] remove setSketchEps

---
 .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala | 2 --
 .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala  | 2 --
 2 files changed, 4 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
index acc9febff6e4..a10394837a68 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
@@ -100,8 +100,6 @@ class XGBoostClassifier (
 
   def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)
 
-  def setSketchEps(value: Double): this.type = set(sketchEps, value)
-
   def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)
 
   def setSampleType(value: String): this.type = set(sampleType, value)
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
index 77e0ac6b0901..82f8346b062a 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
@@ -102,8 +102,6 @@ class XGBoostRegressor (
 
   def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)
 
-  def setSketchEps(value: Double): this.type = set(sketchEps, value)
-
   def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)
 
   def setSampleType(value: String): this.type = set(sampleType, value)
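For reference, the budget that ``sketch_eps`` controlled is the ``TrainParam::MaxSketchSize()`` helper deleted in patch 4: ``sketch_ratio / sketch_eps`` summary entries, so the defaults (``sketch_ratio = 2``, ``sketch_eps = 0.03``) gave ``static_cast<unsigned>(2 / 0.03) = 66`` entries — the ``O(1 / sketch_eps)`` bins described in the deleted parameter documentation. A standalone restatement of that arithmetic:

```cpp
#include <cstdint>

// The removed helper restated outside TrainParam: with the defaults
// 2.0f / 0.03f this yields a budget of 66 summary entries per sketch.
inline uint32_t MaxSketchSize(float sketch_ratio = 2.0f, float sketch_eps = 0.03f) {
  return static_cast<uint32_t>(sketch_ratio / sketch_eps);
}
```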