[Breaking] Remove rabit support for custom reductions and grow_local_histmaker updater #7992

Merged: 7 commits, Jun 21, 2022
1 change: 0 additions & 1 deletion amalgamation/xgboost-all0.cc
@@ -55,7 +55,6 @@
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_approx.cc"
#include "../src/tree/updater_colmaker.cc"
#include "../src/tree/updater_histmaker.cc"
#include "../src/tree/updater_prune.cc"
#include "../src/tree/updater_quantile_hist.cc"
#include "../src/tree/updater_refresh.cc"
10 changes: 0 additions & 10 deletions doc/parameter.rst
@@ -151,15 +151,6 @@ Parameters for Tree Booster
- ``hist``: Faster histogram optimized approximate greedy algorithm.
- ``gpu_hist``: GPU implementation of ``hist`` algorithm.

* ``sketch_eps`` [default=0.03]

- Only used for ``updater=grow_local_histmaker``.
- This roughly translates into ``O(1 / sketch_eps)`` number of bins.
Compared to directly select number of bins, this comes with theoretical guarantee with sketch accuracy.
- Usually user does not have to tune this.
But consider setting to a lower number for more accurate enumeration of split candidates.
- range: (0, 1)
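
For the record, the bin-count relationship above is simple arithmetic; a minimal illustration follows. The helper name is hypothetical, not part of XGBoost.

#include <cmath>
#include <cstdio>

// Hypothetical helper: approximate number of histogram bins implied by
// sketch_eps, following the O(1 / sketch_eps) relationship documented above.
int ApproxBinCount(double sketch_eps) {
  return static_cast<int>(std::ceil(1.0 / sketch_eps));
}

int main() {
  std::printf("%d\n", ApproxBinCount(0.03));  // default 0.03 -> about 34 bins
  return 0;
}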

* ``scale_pos_weight`` [default=1]

- Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-train.R>`_, `py1 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-numpy.py>`_, `py2 <https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py>`_, `py3 <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py>`_.
@@ -170,7 +161,6 @@ Parameters for Tree Booster

- ``grow_colmaker``: non-distributed column-based construction of trees.
- ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
- ``grow_local_histmaker``: based on local histogram counting.
- ``grow_quantile_histmaker``: Grow tree using quantized histogram.
- ``grow_gpu_hist``: Grow tree with GPU.
- ``sync``: synchronizes trees in all distributed nodes.
62 changes: 33 additions & 29 deletions doc/treemethod.rst
@@ -5,7 +5,7 @@ Tree Methods
For training boosted tree models, there are 2 parameters used for choosing algorithms,
namely ``updater`` and ``tree_method``. XGBoost has 4 builtin tree methods, namely
``exact``, ``approx``, ``hist`` and ``gpu_hist``. Along with these tree methods, there
are also some free standing updaters including ``grow_local_histmaker``, ``refresh``,
are also some free-standing updaters including ``refresh``,
``prune`` and ``sync``. The parameter ``updater`` is more primitive than ``tree_method``
as the latter is just a pre-configuration of the former. The difference is mostly due to
historical reasons that each updater requires some specific configurations and might have
@@ -37,27 +37,18 @@ approximated training algorithms. These algorithms build a gradient histogram for each
node and iterate through the histogram instead of the real dataset. Here we introduce the
implementations in XGBoost below.

1. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
paper <http://arxiv.org/abs/1603.02754>`_. This updater is rarely used in practice so
it's still an updater rather than tree method. During split finding, it first runs a
weighted GK sketching for data points belong to current node to find split candidates,
using hessian as weights. The histogram is built upon this per-node sketch. It's
faster than ``exact`` in some applications, but still slow in computation.

2. ``approx`` tree method: An approximation tree method described in `reference paper
<http://arxiv.org/abs/1603.02754>`_. Different from ``grow_local_histmaker``, it runs
sketching before building each tree using all the rows (rows belonging to the root)
instead of per-node dataset. Similar to ``grow_local_histmaker`` updater, hessian is
used as weights during sketch. The algorithm can be accessed by setting
``tree_method`` to ``approx``.
1. ``approx`` tree method: An approximation tree method described in the `reference paper
<http://arxiv.org/abs/1603.02754>`_. It runs sketching before building each tree,
using all the rows (the rows belonging to the root). The hessian is used as the weight
during sketching. The algorithm can be accessed by setting ``tree_method`` to ``approx``.

3. ``hist`` tree method: An approximation tree method used in LightGBM with slight
2. ``hist`` tree method: An approximation tree method used in LightGBM with slight
differences in implementation. It runs sketching before training, using only
user-provided weights instead of the hessian. The subsequent per-node histogram is built upon
this global sketch. This is the fastest algorithm as it runs sketching only once. The
algorithm can be accessed by setting ``tree_method`` to ``hist``.

4. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
``hist``, with additional support for gradient based sampling. The algorithm can be
accessed by setting ``tree_method`` to ``gpu_hist``.
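
As a quick illustration of how these methods are selected in practice, here is a minimal sketch using XGBoost's public C API (``XGBoosterSetParam``, ``XGBoosterCreate`` and friends are the real C API; the training file path is hypothetical and error checking is elided):

#include <xgboost/c_api.h>

int main() {
  // Hypothetical libsvm-format training file.
  DMatrixHandle dtrain;
  XGDMatrixCreateFromFile("train.libsvm", 1, &dtrain);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);

  // Pick one of the built-in tree methods described above.
  XGBoosterSetParam(booster, "tree_method", "hist");

  for (int iter = 0; iter < 10; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}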

@@ -102,19 +93,32 @@ Other Updaters
Removed Updaters
****************

2 Updaters were removed during development due to maintainability. We describe them here
solely for the interest of documentation. First one is distributed colmaker, which was a
distributed version of exact tree method. It required specialization for column based
splitting strategy and a different prediction procedure. As the exact tree method is slow
by itself and scaling is even less efficient, we removed it entirely. Second one is
``skmaker``. Per-node weighted sketching employed by ``grow_local_histmaker`` is slow,
the ``skmaker`` was unmaintained and seems to be a workaround trying to eliminate the
histogram creation step and uses sketching values directly during split evaluation. It
was never tested and contained some unknown bugs, we decided to remove it and focus our
resources on more promising algorithms instead. For accuracy, most of the time
``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so removing
them don't have any real practical impact.

Three updaters were removed during development due to maintainability concerns. We
describe them here solely for documentation purposes.

1. Distributed colmaker, which was a distributed version of the exact tree method. It
required specialization for a column-based splitting strategy and a different prediction
procedure. As the exact tree method is slow by itself and scaling is even less
efficient, we removed it entirely.

2. ``skmaker``. The per-node weighted sketching employed by ``grow_local_histmaker`` is
slow; ``skmaker`` was unmaintained and appears to have been a workaround that eliminates
the histogram creation step and uses sketching values directly during split evaluation.
It was never tested and contained some unknown bugs, so we decided to remove it and
focus our resources on more promising algorithms instead. In terms of accuracy,
``approx``, ``hist`` and ``gpu_hist`` are usually sufficient with some parameter tuning,
so removing these updaters has no real practical impact.

3. ``grow_local_histmaker`` updater: An approximation tree method described in the
`reference paper <http://arxiv.org/abs/1603.02754>`_. This updater was rarely used in
practice, which is why it remained an updater rather than a tree method. During split
finding, it first runs a weighted GK sketch over the data points belonging to the
current node to find split candidates, using the hessian as weights. The histogram is
built upon this per-node sketch. It was faster than ``exact`` in some applications, but
still slow in computation. It was removed because it depended on Rabit's customized
reduction function, which handles any data structure that can be serialized/deserialized
into a fixed-size buffer. Such reductions are not directly supported by NCCL or the
federated-learning gRPC backend, which made it hard to refactor into a common allreduce
interface. A sketch of this kind of reduction follows this list.
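
As referenced in item 3, below is a minimal sketch (not XGBoost code; the summary struct is illustrative) of the kind of reduction the removed Rabit interface supported: a user-supplied commutative function applied element-wise over a byte buffer holding a fixed-size, user-defined type. Collectives such as NCCL expose only a fixed set of arithmetic ops over primitive types, so this shape of reduction has no direct equivalent there.

#include <cstddef>
#include <cstring>

// A user-defined summary that fits in a fixed-size buffer.
struct SketchSummary {
  double min_value;
  double max_value;
  int count;
};

// A commutative merge in the style Rabit's custom reducers required:
// fold `src` into `dst`.
void MergeSummary(SketchSummary &dst, const SketchSummary &src) {
  dst.min_value = src.min_value < dst.min_value ? src.min_value : dst.min_value;
  dst.max_value = src.max_value > dst.max_value ? src.max_value : dst.max_value;
  dst.count += src.count;
}

// Element-wise reduction over raw buffers, as a custom-reduction callback
// would see them; memcpy avoids alignment assumptions.
void ReduceBuffers(const void *src, void *dst, int len) {
  for (int i = 0; i < len; ++i) {
    SketchSummary a, b;
    std::memcpy(&a, static_cast<const char *>(src) + i * sizeof(SketchSummary), sizeof(a));
    std::memcpy(&b, static_cast<char *>(dst) + i * sizeof(SketchSummary), sizeof(b));
    MergeSummary(b, a);
    std::memcpy(static_cast<char *>(dst) + i * sizeof(SketchSummary), &b, sizeof(b));
  }
}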

**************
Feature Matrix
@@ -100,8 +100,6 @@ class XGBoostClassifier (

def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)

def setSketchEps(value: Double): this.type = set(sketchEps, value)

def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)

def setSampleType(value: String): this.type = set(sampleType, value)
@@ -102,8 +102,6 @@ class XGBoostRegressor (

def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)

def setSketchEps(value: Double): this.type = set(sketchEps, value)

def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)

def setSampleType(value: String): this.type = set(sampleType, value)
@@ -182,20 +182,6 @@ private[spark] trait BoosterParams extends Params {

final def getSinglePrecisionHistogram: Boolean = $(singlePrecisionHistogram)

/**
* This is only used for approximate greedy algorithm.
* This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select
* number of bins, this comes with theoretical guarantee with sketch accuracy.
* [default=0.03] range: (0, 1)
*/
final val sketchEps = new DoubleParam(this, "sketchEps",
"This is only used for approximate greedy algorithm. This roughly translated into" +
" O(1 / sketch_eps) number of bins. Compared to directly select number of bins, this comes" +
" with theoretical guarantee with sketch accuracy.",
(value: Double) => value < 1 && value > 0)

final def getSketchEps: Double = $(sketchEps)

/**
* Control the balance of positive and negative weights, useful for unbalanced classes. A typical
* value to consider: sum(negative cases) / sum(positive cases). [default=1]
30 changes: 0 additions & 30 deletions plugin/federated/engine_federated.cc
@@ -238,35 +238,5 @@ void Allreduce_(void *sendrecvbuf, size_t type_nbytes, size_t count, IEngine::Re
if (engine.GetWorldSize() == 1) return;
engine.Allreduce(sendrecvbuf, type_nbytes * count, dtype, op);
}

ReduceHandle::ReduceHandle() = default;
ReduceHandle::~ReduceHandle() = default;

int ReduceHandle::TypeSize(const MPI::Datatype &dtype) { return static_cast<int>(dtype.type_size); }

void ReduceHandle::Init(IEngine::ReduceFunction redfunc,
__attribute__((unused)) size_t type_nbytes) {
utils::Assert(redfunc_ == nullptr, "cannot initialize reduce handle twice");
redfunc_ = redfunc;
}

void ReduceHandle::Allreduce(void *sendrecvbuf, size_t type_nbytes, size_t count,
IEngine::PreprocFunction prepare_fun, void *prepare_arg) {
utils::Assert(redfunc_ != nullptr, "must initialize handle to call AllReduce");
if (prepare_fun != nullptr) prepare_fun(prepare_arg);
if (engine.GetWorldSize() == 1) return;

// Gather all the buffers and call the reduce function locally.
auto const buffer_size = type_nbytes * count;
auto const gathered = engine.Allgather(sendrecvbuf, buffer_size);
auto const *data = gathered.data();
for (int i = 0; i < engine.GetWorldSize(); i++) {
if (i != engine.GetRank()) {
redfunc_(data + buffer_size * i, sendrecvbuf, static_cast<int>(count),
MPI::Datatype(type_nbytes));
}
}
}

} // namespace engine
} // namespace rabit
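
The removed federated ``ReduceHandle::Allreduce`` above emulates a custom allreduce by allgathering every worker's buffer and applying the reduce function locally. A standalone sketch of that pattern, independent of the rabit/gRPC plumbing and written under the assumption that the gathered buffers are laid out back to back, might look like this:

#include <cstddef>
#include <functional>
#include <vector>

// `gathered` holds world_size buffers back to back, as an allgather would
// produce; `local` is this rank's in-place buffer.
using ReduceFn = std::function<void(const void *src, void *dst, std::size_t nbytes)>;

void EmulatedAllreduce(std::vector<char> &local, int rank, int world_size,
                       const std::vector<char> &gathered, const ReduceFn &reduce) {
  std::size_t const buffer_size = local.size();
  for (int i = 0; i < world_size; ++i) {
    if (i == rank) continue;  // our own contribution is already in `local`
    reduce(gathered.data() + buffer_size * i, local.data(), buffer_size);
  }
}
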
45 changes: 0 additions & 45 deletions rabit/include/rabit/internal/engine.h
@@ -245,51 +245,6 @@ void Allreduce_(void *sendrecvbuf, // NOLINT
mpi::OpType op,
IEngine::PreprocFunction prepare_fun = nullptr,
void *prepare_arg = nullptr);
/*!
* \brief handle for customized reducer, used to handle customized reduce
* this class is mainly created for compatibility issues with MPI's customized reduce
*/
class ReduceHandle {
public:
// constructor
ReduceHandle();
// destructor
~ReduceHandle();
/*!
* \brief initialize the reduce function,
* with the type the reduce function needs to deal with
* the reduce function MUST be commutative
*/
void Init(IEngine::ReduceFunction redfunc, size_t type_nbytes);
/*!
* \brief customized in-place all reduce operation
* \param sendrecvbuf the in place send-recv buffer
* \param type_nbytes size of the type in bytes
* \param count number of elements to send
* \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
* will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
* \param prepare_arg argument used to pass into the lazy preprocessing function
*/
void Allreduce(void *sendrecvbuf,
size_t type_nbytes,
size_t count,
IEngine::PreprocFunction prepare_fun = nullptr,
void *prepare_arg = nullptr);

/*! \return the number of bytes occupied by the type */
static int TypeSize(const MPI::Datatype &dtype);

protected:
// handle function field
void *handle_ {nullptr};
// reduce function of the reducer
IEngine::ReduceFunction *redfunc_{nullptr};
// handle to the type field
void *htype_{nullptr};
// the created type in 4 bytes
size_t created_type_nbytes_;
};
} // namespace engine
} // namespace rabit
#endif // RABIT_INTERNAL_ENGINE_H_
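
For documentation purposes, here is a hedged sketch of how the handle declared above was driven. It assumes a pre-removal rabit checkout (the class no longer exists after this PR), and the reduce callback is illustrative:

#include "rabit/internal/engine.h"

// Commutative callback matching IEngine::ReduceFunction: fold `src_` into
// `dst_`, element by element.
void SumDoubles(const void *src_, void *dst_, int len, const MPI::Datatype &) {
  auto const *src = static_cast<const double *>(src_);
  auto *dst = static_cast<double *>(dst_);
  for (int i = 0; i < len; ++i) dst[i] += src[i];
}

void Example() {
  rabit::engine::ReduceHandle handle;
  handle.Init(SumDoubles, sizeof(double));   // register the reducer once
  double buf[4] = {1, 2, 3, 4};
  handle.Allreduce(buf, sizeof(double), 4);  // in-place across all workers
}
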
117 changes: 0 additions & 117 deletions rabit/include/rabit/internal/rabit-inl.h
@@ -225,122 +225,5 @@ inline void LazyCheckPoint(const Serializable *global_model) {
inline int VersionNumber() {
return engine::GetEngine()->VersionNumber();
}
// ---------------------------------
// Code to handle customized Reduce
// ---------------------------------
// function to perform reduction for Reducer
template<typename DType, void (*freduce)(DType &dst, const DType &src)>
inline void ReducerSafeImpl(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
const size_t kUnit = sizeof(DType);
const char *psrc = reinterpret_cast<const char*>(src_);
char *pdst = reinterpret_cast<char*>(dst_);

for (int i = 0; i < len_; ++i) {
DType tdst, tsrc;
// use memcpy to avoid alignment issue
std::memcpy(&tdst, pdst + (i * kUnit), sizeof(tdst));
std::memcpy(&tsrc, psrc + (i * kUnit), sizeof(tsrc));
freduce(tdst, tsrc);
std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
}
}
// function to perform reduction for Reducer
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline void ReducerAlignImpl(const void *src_, void *dst_,
int len_, const MPI::Datatype &dtype) {
const DType *psrc = reinterpret_cast<const DType*>(src_);
DType *pdst = reinterpret_cast<DType*>(dst_);
for (int i = 0; i < len_; ++i) {
freduce(pdst[i], psrc[i]);
}
}
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline Reducer<DType, freduce>::Reducer() {
// it is safe to directly use handle for aligned data types
if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
this->handle_.Init(ReducerAlignImpl<DType, freduce>, sizeof(DType));
} else {
this->handle_.Init(ReducerSafeImpl<DType, freduce>, sizeof(DType));
}
}
template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg) {
handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
prepare_arg);
}
// function to perform reduction for SerializeReducer
template<typename DType>
inline void SerializeReducerFuncImpl(const void *src_, void *dst_,
int len_, const MPI::Datatype &dtype) {
int nbytes = engine::ReduceHandle::TypeSize(dtype);
// temp space
for (int i = 0; i < len_; ++i) {
DType tsrc, tdst;
utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes); // NOLINT(*)
utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes); // NOLINT(*)
tsrc.Load(fsrc);
tdst.Load(fdst);
// govern const check
tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
fdst.Seek(0);
tdst.Save(fdst);
}
}
template<typename DType>
inline SerializeReducer<DType>::SerializeReducer() {
handle_.Init(SerializeReducerFuncImpl<DType>, sizeof(DType));
}
// closure to call Allreduce
template<typename DType>
struct SerializeReduceClosure {
DType *sendrecvobj;
size_t max_nbyte, count;
void (*prepare_fun)(void *arg);
void *prepare_arg;
std::string *p_buffer;
// invoke the closure
inline void Run() {
if (prepare_fun != nullptr) prepare_fun(prepare_arg);
for (size_t i = 0; i < count; ++i) {
utils::MemoryFixSizeBuffer fs(BeginPtr(*p_buffer) + i * max_nbyte, max_nbyte);
sendrecvobj[i].Save(fs);
}
}
inline static void Invoke(void *c) {
static_cast<SerializeReduceClosure<DType>*>(c)->Run();
}
};
template<typename DType>
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
size_t max_nbyte, size_t count,
void (*prepare_fun)(void *arg),
void *prepare_arg) {
buffer_.resize(max_nbyte * count);
// setup closure
SerializeReduceClosure<DType> c;
c.sendrecvobj = sendrecvobj; c.max_nbyte = max_nbyte; c.count = count;
c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_;
// invoke here
handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count,
SerializeReduceClosure<DType>::Invoke, &c);
for (size_t i = 0; i < count; ++i) {
utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte);
sendrecvobj[i].Load(fs);
}
}

template<typename DType, void (*freduce)(DType &dst, const DType &src)> // NOLINT(*)
inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
std::function<void()> prepare_fun) {
this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun);
}
template<typename DType>
inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
size_t max_nbytes, size_t count,
std::function<void()> prepare_fun) {
this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun);
}
} // namespace rabit
#endif // RABIT_INTERNAL_RABIT_INL_H_
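
To round out the record, a sketch of how user code drove the typed ``Reducer`` wrapper implemented above (again assuming a pre-removal rabit checkout; the payload type is illustrative):

#include <rabit/rabit.h>

struct MaxEntry {
  double value;
};

// Element-wise reduce function required by the Reducer template.
inline void ReduceMax(MaxEntry &dst, const MaxEntry &src) {  // NOLINT(*)
  if (src.value > dst.value) dst.value = src.value;
}

void Example() {
  // sizeof(MaxEntry) == 8, so this instantiates ReducerAlignImpl; odd-sized
  // types would fall back to the memcpy-based ReducerSafeImpl shown above.
  rabit::Reducer<MaxEntry, ReduceMax> reducer;
  MaxEntry entries[8] = {};
  reducer.Allreduce(entries, 8);  // in-place custom allreduce
}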