From 755251c7e04a7e766393ececca9780b909ea8b9c Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Mon, 13 Jun 2022 16:38:48 -0700
Subject: [PATCH 1/5] remove rabit reducer

---
 rabit/include/rabit/internal/rabit-inl.h | 21 -------------
 rabit/include/rabit/rabit.h              | 39 ------------------------
 src/tree/hist/histogram.h                |  5 ++-
 src/tree/updater_histmaker.cc            |  6 ++--
 src/tree/updater_refresh.cc              |  5 ++-
 5 files changed, 6 insertions(+), 70 deletions(-)

diff --git a/rabit/include/rabit/internal/rabit-inl.h b/rabit/include/rabit/internal/rabit-inl.h
index 9289ea8806e6..88cf2128af37 100644
--- a/rabit/include/rabit/internal/rabit-inl.h
+++ b/rabit/include/rabit/internal/rabit-inl.h
@@ -254,22 +254,6 @@ inline void ReducerAlignImpl(const void *src_, void *dst_,
     freduce(pdst[i], psrc[i]);
   }
 }
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline Reducer<DType, freduce>::Reducer() {
-  // it is safe to directly use handle for aligned data types
-  if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) {
-    this->handle_.Init(ReducerAlignImpl<DType, freduce>, sizeof(DType));
-  } else {
-    this->handle_.Init(ReducerSafeImpl<DType, freduce>, sizeof(DType));
-  }
-}
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               void (*prepare_fun)(void *arg),
-                                               void *prepare_arg) {
-  handle_.Allreduce(sendrecvbuf, sizeof(DType), count, prepare_fun,
-                    prepare_arg);
-}
 // function to perform reduction for SerializeReducer
 template<typename DType>
 inline void SerializeReducerFuncImpl(const void *src_, void *dst_,
@@ -331,11 +315,6 @@ inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
   }
 }
 
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void Reducer<DType, freduce>::Allreduce(DType *sendrecvbuf, size_t count,
-                                               std::function<void()> prepare_fun) {
-  this->Allreduce(sendrecvbuf, count, InvokeLambda, &prepare_fun);
-}
 template<typename DType>
 inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
                                                size_t max_nbytes, size_t count,
diff --git a/rabit/include/rabit/rabit.h b/rabit/include/rabit/rabit.h
index 8f10cf3f3b32..1f2b16ea1126 100644
--- a/rabit/include/rabit/rabit.h
+++ b/rabit/include/rabit/rabit.h
@@ -279,45 +279,6 @@ inline int VersionNumber();
 namespace engine {
 class ReduceHandle;
 }  // namespace engine
-/*!
- * \brief template class to make customized reduce and all reduce easy
- *   Do not use reducer directly in the function you call Finalize,
- *   because the destructor can execute after Finalize
- * \tparam DType data type that to be reduced
- * \tparam freduce the customized reduction function
- *   DType must be a struct, with no pointer
- */
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-class Reducer {
- public:
-  Reducer();
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param count number of elements to be reduced
-   * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg)
-   *   will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf.
-   *   If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        void (*prepare_fun)(void *) = nullptr,
-                        void *prepare_arg = nullptr);
-#if DMLC_USE_CXX11
-  /*!
-   * \brief customized in-place all reduce operation, with lambda function as preprocessor
-   * \param sendrecvbuf pointer to the array of objects to be reduced
-   * \param count number of elements to be reduced
-   * \param prepare_fun lambda function executed to prepare the data, if necessary
-   */
-  inline void Allreduce(DType *sendrecvbuf, size_t count,
-                        std::function<void()> prepare_fun);
-#endif  // DMLC_USE_CXX11
-
- private:
-  /*! \brief function handle to do reduce */
-  engine::ReduceHandle handle_;
-};
 /*!
  * \brief template class to make customized reduce,
  *   this class defines complex reducer handles all the data structure that can be
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index 266086af11a4..6b93577a0c6e 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -24,7 +24,6 @@ class HistogramBuilder {
   common::HistCollection hist_local_worker_;
   common::GHistBuilder builder_;
   common::ParallelGHistBuilder buffer_;
-  rabit::Reducer<GradientPairPrecise, GradientPairPrecise::Reduce> reducer_;
   BatchParam param_;
   int32_t n_threads_{-1};
   size_t n_batches_{0};
@@ -199,8 +198,8 @@ class HistogramBuilder {
       }
     });
 
-    reducer_.Allreduce(this->hist_[starting_index].data(),
-                       builder_.GetNumBins() * sync_count);
+    rabit::Allreduce<rabit::op::Sum>(reinterpret_cast<double *>(this->hist_[starting_index].data()),
+                                     builder_.GetNumBins() * sync_count * 2);
     ParallelSubtractionHist(space, nodes_for_explicit_hist_build,
                             nodes_for_subtraction_trick, p_tree);
diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc
index 1c625954922e..4832ffa5e998 100644
--- a/src/tree/updater_histmaker.cc
+++ b/src/tree/updater_histmaker.cc
@@ -101,8 +101,6 @@ class HistMaker: public BaseMaker {
   };
   // workspace of thread
   ThreadWSpace wspace_;
-  // reducer for histogram
-  rabit::Reducer<GradStats, GradStats::Reduce> histred_;
   // set of working features
   std::vector<bst_feature_t> selected_features_;
   // update function implementation
@@ -359,8 +357,8 @@ class CQHistMaker : public HistMaker {
       }
     };
     // sync the histogram
-    this->histred_.Allreduce(dmlc::BeginPtr(this->wspace_.hset[0].data),
-                             this->wspace_.hset[0].data.size(), lazy_get_hist);
+    rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(this->wspace_.hset[0].data)->sum_grad,
+                                     this->wspace_.hset[0].data.size() * 2, lazy_get_hist);
   }
 
   void ResetPositionAfterSplit(DMatrix *,
diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc
index 72a81c730884..19bfe7d00195 100644
--- a/src/tree/updater_refresh.cc
+++ b/src/tree/updater_refresh.cc
@@ -100,7 +100,8 @@ class TreeRefresher : public TreeUpdater {
         }
       });
     };
-    reducer_.Allreduce(dmlc::BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
+    rabit::Allreduce<rabit::op::Sum>(&dmlc::BeginPtr(stemp[0])->sum_grad, stemp[0].size() * 2,
+                                     lazy_get_stats);
     // rescale learning rate according to size of trees
     float lr = param_.learning_rate;
     param_.learning_rate = lr / trees.size();
@@ -154,8 +155,6 @@ class TreeRefresher : public TreeUpdater {
   }
   // training parameter
   TrainParam param_;
-  // reducer
-  rabit::Reducer<GradStats, GradStats::Reduce> reducer_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh")
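The substitution in this patch works only because the reduced types are trivially copyable pairs of doubles, so an array of them can be summed as a flat ``double`` buffer with a plain sum allreduce — hence the casts and the ``* 2`` element counts at each call site. A minimal sketch of the equivalence; ``PairT`` and ``SumAllreduce`` are stand-ins for the real gradient-pair type and ``rabit::Allreduce<rabit::op::Sum>``, not names from the codebase:

```cpp
#include <cstddef>
#include <vector>

// Stand-in for GradStats / GradientPairPrecise: trivially copyable, two doubles.
struct PairT {
  double sum_grad{0.0};
  double sum_hess{0.0};
};
static_assert(sizeof(PairT) == 2 * sizeof(double), "layout must be exactly two doubles");

// Stand-in for rabit::Allreduce<rabit::op::Sum>: element-wise sum of `count`
// doubles; `peer` plays the role of another worker's buffer.
void SumAllreduce(double *sendrecvbuf, std::size_t count, const double *peer) {
  for (std::size_t i = 0; i < count; ++i) sendrecvbuf[i] += peer[i];
}

// Mirrors the patched call sites: reinterpret the pair array as doubles and
// pass twice the element count, since each pair contributes grad and hess.
void SyncStats(std::vector<PairT> *stats, const std::vector<PairT> &peer) {
  SumAllreduce(reinterpret_cast<double *>(stats->data()), stats->size() * 2,
               reinterpret_cast<const double *>(peer.data()));
}
```

Because the old custom reducer also summed each member independently, the flattened sum produces identical results while avoiding the ``ReduceHandle`` machinery that the following patches delete.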
From 39a0147307b0907cc0311ac2b2baab4198780ebd Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 15 Jun 2022 17:43:09 -0700
Subject: [PATCH 2/5] remove histmaker

---
 amalgamation/xgboost-all0.cc             |   1 -
 plugin/federated/engine_federated.cc     |  30 --
 rabit/include/rabit/internal/engine.h    |  45 --
 rabit/include/rabit/internal/rabit-inl.h |  96 ----
 rabit/include/rabit/rabit.h              |  55 --
 rabit/src/engine.cc                      |  22 -
 src/tree/tree_updater.cc                 |   1 -
 src/tree/updater_histmaker.cc            | 627 -----------------------
 8 files changed, 877 deletions(-)
 delete mode 100644 src/tree/updater_histmaker.cc

diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index ef3c2ffde8a7..cf02b07a097e 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -55,7 +55,6 @@
 #include "../src/tree/tree_updater.cc"
 #include "../src/tree/updater_approx.cc"
 #include "../src/tree/updater_colmaker.cc"
-#include "../src/tree/updater_histmaker.cc"
 #include "../src/tree/updater_prune.cc"
 #include "../src/tree/updater_quantile_hist.cc"
 #include "../src/tree/updater_refresh.cc"
diff --git a/plugin/federated/engine_federated.cc b/plugin/federated/engine_federated.cc
index d18e9e095d0d..8fd396d94744 100644
--- a/plugin/federated/engine_federated.cc
+++ b/plugin/federated/engine_federated.cc
@@ -238,35 +238,5 @@ void Allreduce_(void *sendrecvbuf, size_t type_nbytes, size_t count, IEngine::Re
   if (engine.GetWorldSize() == 1) return;
   engine.Allreduce(sendrecvbuf, type_nbytes * count, dtype, op);
 }
-
-ReduceHandle::ReduceHandle() = default;
-ReduceHandle::~ReduceHandle() = default;
-
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) { return static_cast<int>(dtype.type_size); }
-
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc,
-                        __attribute__((unused)) size_t type_nbytes) {
-  utils::Assert(redfunc_ == nullptr, "cannot initialize reduce handle twice");
-  redfunc_ = redfunc;
-}
-
-void ReduceHandle::Allreduce(void *sendrecvbuf, size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun, void *prepare_arg) {
-  utils::Assert(redfunc_ != nullptr, "must initialize handle to call AllReduce");
-  if (prepare_fun != nullptr) prepare_fun(prepare_arg);
-  if (engine.GetWorldSize() == 1) return;
-
-  // Gather all the buffers and call the reduce function locally.
-  auto const buffer_size = type_nbytes * count;
-  auto const gathered = engine.Allgather(sendrecvbuf, buffer_size);
-  auto const *data = gathered.data();
-  for (int i = 0; i < engine.GetWorldSize(); i++) {
-    if (i != engine.GetRank()) {
-      redfunc_(data + buffer_size * i, sendrecvbuf, static_cast<int>(count),
-               MPI::Datatype(type_nbytes));
-    }
-  }
-}
-
 }  // namespace engine
 }  // namespace rabit
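The federated ``ReduceHandle`` deleted above emulated a custom allreduce on top of allgather: every worker receives all buffers, then folds the other ranks' contributions into its own. A condensed sketch of that pattern, with a simplified reduce-function signature and the gathered buffers assumed concatenated in rank order (the names here are illustrative, not from the plugin):

```cpp
#include <cstddef>
#include <vector>

// Simplified reduce callback: fold `count` elements of `src` into `dst`.
using ReduceFn = void (*)(const void *src, void *dst, std::size_t count);

// Allreduce emulated via allgather, as the removed handle did: our own
// contribution already sits in sendrecvbuf, so skip our slot and reduce
// every peer's copy into it.
void AllgatherAllreduce(void *sendrecvbuf, std::size_t buffer_size,
                        std::size_t count, int rank, int world_size,
                        const std::vector<char> &gathered, ReduceFn redfunc) {
  for (int i = 0; i < world_size; ++i) {
    if (i == rank) continue;
    redfunc(gathered.data() + buffer_size * i, sendrecvbuf, count);
  }
}
```

Each call moves roughly ``world_size × buffer_size`` bytes instead of the near-constant per-worker traffic of a tree or ring allreduce, which is one more reason the custom-reducer path was worth retiring.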
diff --git a/rabit/include/rabit/internal/engine.h b/rabit/include/rabit/internal/engine.h
index 50b452f8db1a..84d92143a1d2 100644
--- a/rabit/include/rabit/internal/engine.h
+++ b/rabit/include/rabit/internal/engine.h
@@ -245,51 +245,6 @@ void Allreduce_(void *sendrecvbuf,  // NOLINT
                 mpi::OpType op,
                 IEngine::PreprocFunction prepare_fun = nullptr,
                 void *prepare_arg = nullptr);
-/*!
- * \brief handle for customized reducer, used to handle customized reduce
- *   this class is mainly created for compatiblity issues with MPI's customized reduce
- */
-class ReduceHandle {
- public:
-  // constructor
-  ReduceHandle();
-  // destructor
-  ~ReduceHandle();
-  /*!
-   * \brief initialize the reduce function,
-   *   with the type the reduce function needs to deal with
-   *   the reduce function MUST be communicative
-   */
-  void Init(IEngine::ReduceFunction redfunc, size_t type_nbytes);
-  /*!
-   * \brief customized in-place all reduce operation
-   * \param sendrecvbuf the in place send-recv buffer
-   * \param type_n4bytes size of the type, in terms of 4bytes
-   * \param count number of elements to send
-   * \param prepare_func Lazy preprocessing function, lazy prepare_fun(prepare_arg)
-   *   will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf_.
-   *   If the result of Allreduce can be recovered directly, then prepare_func will NOT be called
-   * \param prepare_arg argument used to pass into the lazy preprocessing function
-   */
-  void Allreduce(void *sendrecvbuf,
-                 size_t type_nbytes,
-                 size_t count,
-                 IEngine::PreprocFunction prepare_fun = nullptr,
-                 void *prepare_arg = nullptr);
-
-  /*! \return the number of bytes occupied by the type */
-  static int TypeSize(const MPI::Datatype &dtype);
-
- protected:
-  // handle function field
-  void *handle_ {nullptr};
-  // reduce function of the reducer
-  IEngine::ReduceFunction *redfunc_{nullptr};
-  // handle to the type field
-  void *htype_{nullptr};
-  // the created type in 4 bytes
-  size_t created_type_nbytes_;
-};
 }  // namespace engine
 }  // namespace rabit
 #endif  // RABIT_INTERNAL_ENGINE_H_
diff --git a/rabit/include/rabit/internal/rabit-inl.h b/rabit/include/rabit/internal/rabit-inl.h
index 88cf2128af37..6ad296f79aab 100644
--- a/rabit/include/rabit/internal/rabit-inl.h
+++ b/rabit/include/rabit/internal/rabit-inl.h
@@ -225,101 +225,5 @@ inline void LazyCheckPoint(const Serializable *global_model) {
 inline int VersionNumber() {
   return engine::GetEngine()->VersionNumber();
 }
-// ---------------------------------
-// Code to handle customized Reduce
-// ---------------------------------
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>
-inline void ReducerSafeImpl(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) {
-  const size_t kUnit = sizeof(DType);
-  const char *psrc = reinterpret_cast<const char*>(src_);
-  char *pdst = reinterpret_cast<char*>(dst_);
-
-  for (int i = 0; i < len_; ++i) {
-    DType tdst, tsrc;
-    // use memcpy to avoid alignment issue
-    std::memcpy(&tdst, pdst + (i * kUnit), sizeof(tdst));
-    std::memcpy(&tsrc, psrc + (i * kUnit), sizeof(tsrc));
-    freduce(tdst, tsrc);
-    std::memcpy(pdst + i * kUnit, &tdst, sizeof(tdst));
-  }
-}
-// function to perform reduction for Reducer
-template<typename DType, void (*freduce)(DType &dst, const DType &src)>  // NOLINT(*)
-inline void ReducerAlignImpl(const void *src_, void *dst_,
-                             int len_, const MPI::Datatype &dtype) {
-  const DType *psrc = reinterpret_cast<const DType*>(src_);
-  DType *pdst = reinterpret_cast<DType*>(dst_);
-  for (int i = 0; i < len_; ++i) {
-    freduce(pdst[i], psrc[i]);
-  }
-}
-// function to perform reduction for SerializeReducer
-template<typename DType>
-inline void SerializeReducerFuncImpl(const void *src_, void *dst_,
-                                     int len_, const MPI::Datatype &dtype) {
-  int nbytes = engine::ReduceHandle::TypeSize(dtype);
-  // temp space
-  for (int i = 0; i < len_; ++i) {
-    DType tsrc, tdst;
-    utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes);  // NOLINT(*)
-    utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes);  // NOLINT(*)
-    tsrc.Load(fsrc);
-    tdst.Load(fdst);
-    // govern const check
-    tdst.Reduce(static_cast<const DType &>(tsrc), nbytes);
-    fdst.Seek(0);
-    tdst.Save(fdst);
-  }
-}
-template<typename DType>
-inline SerializeReducer<DType>::SerializeReducer() {
-  handle_.Init(SerializeReducerFuncImpl<DType>, sizeof(DType));
-}
-// closure to call Allreduce
-template<typename DType>
-struct SerializeReduceClosure {
-  DType *sendrecvobj;
-  size_t max_nbyte, count;
-  void (*prepare_fun)(void *arg);
-  void *prepare_arg;
-  std::string *p_buffer;
-  // invoke the closure
-  inline void Run() {
-    if (prepare_fun != nullptr) prepare_fun(prepare_arg);
-    for (size_t i = 0; i < count; ++i) {
-      utils::MemoryFixSizeBuffer fs(BeginPtr(*p_buffer) + i * max_nbyte, max_nbyte);
-      sendrecvobj[i].Save(fs);
-    }
-  }
-  inline static void Invoke(void *c) {
-    static_cast<SerializeReduceClosure<DType>*>(c)->Run();
-  }
-};
-template<typename DType>
-inline void SerializeReducer<DType>::Allreduce(DType *sendrecvobj,
-                                               size_t
max_nbyte, size_t count, - void (*prepare_fun)(void *arg), - void *prepare_arg) { - buffer_.resize(max_nbyte * count); - // setup closure - SerializeReduceClosure c; - c.sendrecvobj = sendrecvobj; c.max_nbyte = max_nbyte; c.count = count; - c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_; - // invoke here - handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count, - SerializeReduceClosure::Invoke, &c); - for (size_t i = 0; i < count; ++i) { - utils::MemoryFixSizeBuffer fs(BeginPtr(buffer_) + i * max_nbyte, max_nbyte); - sendrecvobj[i].Load(fs); - } -} - -template -inline void SerializeReducer::Allreduce(DType *sendrecvobj, - size_t max_nbytes, size_t count, - std::function prepare_fun) { - this->Allreduce(sendrecvobj, max_nbytes, count, InvokeLambda, &prepare_fun); -} } // namespace rabit #endif // RABIT_INTERNAL_RABIT_INL_H_ diff --git a/rabit/include/rabit/rabit.h b/rabit/include/rabit/rabit.h index 1f2b16ea1126..3da8ca268dde 100644 --- a/rabit/include/rabit/rabit.h +++ b/rabit/include/rabit/rabit.h @@ -274,61 +274,6 @@ inline void LazyCheckPoint(const Serializable *global_model); * \sa LoadCheckPoint, CheckPoint */ inline int VersionNumber(); -// ----- extensions that allow customized reducer ------ -// helper class to do customized reduce, user do not need to know the type -namespace engine { -class ReduceHandle; -} // namespace engine -/*! - * \brief template class to make customized reduce, - * this class defines complex reducer handles all the data structure that can be - * serialized/deserialized into fixed size buffer - * Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize - * - * \tparam DType data type that to be reduced, DType must contain the following functions: - * \tparam freduce the customized reduction function - * (1) Save(IStream &fs) (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte) - */ -template -class SerializeReducer { - public: - SerializeReducer(); - /*! - * \brief customized in-place all reduce operation - * \param sendrecvobj pointer to the array of objects to be reduced - * \param max_nbyte maximum amount of memory needed to serialize each object - * this includes budget limit for intermediate and final result - * \param count number of elements to be reduced - * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg) - * will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf. - * If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called - * \param prepare_arg argument used to pass into the lazy preprocessing function - */ - inline void Allreduce(DType *sendrecvobj, - size_t max_nbyte, size_t count, - void (*prepare_fun)(void *) = nullptr, - void *prepare_arg = nullptr); -// C++11 support for lambda prepare function -#if DMLC_USE_CXX11 - /*! - * \brief customized in-place all reduce operation, with lambda function as preprocessor - * \param sendrecvobj pointer to the array of objects to be reduced - * \param max_nbyte maximum amount of memory needed to serialize each object - * this includes budget limit for intermediate and final result - * \param count number of elements to be reduced - * \param prepare_fun lambda function executed to prepare the data, if necessary - */ - inline void Allreduce(DType *sendrecvobj, - size_t max_nbyte, size_t count, - std::function prepare_fun); -#endif // DMLC_USE_CXX11 - - private: - /*! 
\brief function handle to do reduce */ - engine::ReduceHandle handle_; - /*! \brief temporal buffer used to do reduce*/ - std::string buffer_; -}; } // namespace rabit // implementation of template functions #include "./internal/rabit-inl.h" diff --git a/rabit/src/engine.cc b/rabit/src/engine.cc index 36e28a1771c1..5e325383c299 100644 --- a/rabit/src/engine.cc +++ b/rabit/src/engine.cc @@ -103,27 +103,5 @@ void Allreduce_(void *sendrecvbuf, // NOLINT GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, red, prepare_fun, prepare_arg); } - -// code for reduce handle -ReduceHandle::ReduceHandle() = default; -ReduceHandle::~ReduceHandle() = default; - -int ReduceHandle::TypeSize(const MPI::Datatype &dtype) { - return static_cast(dtype.type_size); -} - -void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t ) { - utils::Assert(redfunc_ == nullptr, "cannot initialize reduce handle twice"); - redfunc_ = redfunc; -} - -void ReduceHandle::Allreduce(void *sendrecvbuf, - size_t type_nbytes, size_t count, - IEngine::PreprocFunction prepare_fun, - void *prepare_arg) { - utils::Assert(redfunc_ != nullptr, "must initialize handle to call AllReduce"); - GetEngine()->Allreduce(sendrecvbuf, type_nbytes, count, - redfunc_, prepare_fun, prepare_arg); -} } // namespace engine } // namespace rabit diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc index ee5659636305..190a1e02080e 100644 --- a/src/tree/tree_updater.cc +++ b/src/tree/tree_updater.cc @@ -33,7 +33,6 @@ DMLC_REGISTRY_LINK_TAG(updater_colmaker); DMLC_REGISTRY_LINK_TAG(updater_refresh); DMLC_REGISTRY_LINK_TAG(updater_prune); DMLC_REGISTRY_LINK_TAG(updater_quantile_hist); -DMLC_REGISTRY_LINK_TAG(updater_histmaker); DMLC_REGISTRY_LINK_TAG(updater_approx); DMLC_REGISTRY_LINK_TAG(updater_sync); #ifdef XGBOOST_USE_CUDA diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc deleted file mode 100644 index 4832ffa5e998..000000000000 --- a/src/tree/updater_histmaker.cc +++ /dev/null @@ -1,627 +0,0 @@ -/*! - * Copyright 2014-2022 by XGBoost Contributors - * \file updater_histmaker.cc - * \brief use histogram counting to construct a tree - * \author Tianqi Chen - */ -#include -#include -#include - -#include "xgboost/tree_updater.h" -#include "xgboost/base.h" -#include "xgboost/logging.h" - -#include "../common/quantile.h" -#include "../common/group_data.h" -#include "./updater_basemaker-inl.h" -#include "constraints.h" - -namespace xgboost { -namespace tree { - -DMLC_REGISTRY_FILE_TAG(updater_histmaker); - -class HistMaker: public BaseMaker { - public: - explicit HistMaker(GenericParameter const *ctx) : BaseMaker(ctx) {} - void Update(HostDeviceVector *gpair, DMatrix *p_fmat, - common::Span> out_position, - const std::vector &trees) override { - interaction_constraints_.Configure(param_, p_fmat->Info().num_col_); - // rescale learning rate according to size of trees - float lr = param_.learning_rate; - param_.learning_rate = lr / trees.size(); - // build tree - for (auto tree : trees) { - this->UpdateTree(gpair->ConstHostVector(), p_fmat, tree); - } - param_.learning_rate = lr; - } - char const* Name() const override { - return "grow_histmaker"; - } - - protected: - /*! \brief a single column of histogram cuts */ - struct HistUnit { - /*! \brief cutting point of histogram, contains maximum point */ - const float *cut; - /*! \brief content of statistics data */ - GradStats *data; - /*! 
\brief size of histogram */ - uint32_t size; - // default constructor - HistUnit() = default; - // constructor - HistUnit(const float *cut, GradStats *data, uint32_t size) - : cut{cut}, data{data}, size{size} {} - /*! \brief add a histogram to data */ - }; - /*! \brief a set of histograms from different index */ - struct HistSet { - /*! \brief the index pointer of each histunit */ - const uint32_t *rptr; - /*! \brief cutting points in each histunit */ - const bst_float *cut; - /*! \brief data in different hist unit */ - std::vector data; - /*! \brief return a column of histogram cuts */ - inline HistUnit operator[](size_t fid) { - return {cut + rptr[fid], &data[0] + rptr[fid], rptr[fid+1] - rptr[fid]}; - } - }; - // thread workspace - struct ThreadWSpace { - /*! \brief actual unit pointer */ - std::vector rptr; - /*! \brief cut field */ - std::vector cut; - // per thread histset - std::vector hset; - // initialize the hist set - inline void Configure(int nthread) { - hset.resize(nthread); - // cleanup statistics - for (int tid = 0; tid < nthread; ++tid) { - for (auto& d : hset[tid].data) { d = GradStats(); } - hset[tid].rptr = dmlc::BeginPtr(rptr); - hset[tid].cut = dmlc::BeginPtr(cut); - hset[tid].data.resize(cut.size(), GradStats()); - } - } - /*! \brief clear the workspace */ - inline void Clear() { - cut.clear(); rptr.resize(1); rptr[0] = 0; - } - /*! \brief total size */ - inline size_t Size() const { - return rptr.size() - 1; - } - }; - // workspace of thread - ThreadWSpace wspace_; - // set of working features - std::vector selected_features_; - // update function implementation - virtual void UpdateTree(const std::vector &gpair, - DMatrix *p_fmat, - RegTree *p_tree) { - CHECK(param_.max_depth > 0) << "max_depth must be larger than 0"; - this->InitData(gpair, *p_fmat, *p_tree); - this->InitWorkSet(p_fmat, *p_tree, &selected_features_); - // mark root node as fresh. 
- (*p_tree)[0].SetLeaf(0.0f, 0); - - for (int depth = 0; depth < param_.max_depth; ++depth) { - // reset and propose candidate split - this->ResetPosAndPropose(gpair, p_fmat, selected_features_, *p_tree); - // create histogram - this->CreateHist(gpair, p_fmat, selected_features_, *p_tree); - // find split based on histogram statistics - this->FindSplit(selected_features_, p_tree); - // reset position after split - this->ResetPositionAfterSplit(p_fmat, *p_tree); - this->UpdateQueueExpand(*p_tree); - // if nothing left to be expand, break - if (qexpand_.size() == 0) break; - } - for (int const nid : qexpand_) { - (*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate); - } - } - // this function does two jobs - // (1) reset the position in array position, to be the latest leaf id - // (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly - virtual void ResetPosAndPropose(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) = 0; - // initialize the current working set of features in this round - virtual void InitWorkSet(DMatrix *, - const RegTree &tree, - std::vector *p_fset) { - p_fset->resize(tree.param.num_feature); - for (size_t i = 0; i < p_fset->size(); ++i) { - (*p_fset)[i] = static_cast(i); - } - } - // reset position after split, this is not a must, depending on implementation - virtual void ResetPositionAfterSplit(DMatrix *p_fmat, - const RegTree &tree) { - } - virtual void CreateHist(const std::vector &gpair, - DMatrix *, - const std::vector &fset, - const RegTree &) = 0; - - private: - void EnumerateSplit(const HistUnit &hist, - const GradStats &node_sum, - bst_uint fid, - SplitEntry *best, - GradStats *left_sum) const { - if (hist.size == 0) return; - - double root_gain = CalcGain(param_, node_sum.GetGrad(), node_sum.GetHess()); - GradStats s, c; - for (bst_uint i = 0; i < hist.size; ++i) { - s.Add(hist.data[i]); - if (s.sum_hess >= param_.min_child_weight) { - c.SetSubstract(node_sum, s); - if (c.sum_hess >= param_.min_child_weight) { - double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) + - CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain; - if (best->Update(static_cast(loss_chg), fid, hist.cut[i], - false, false, s, c)) { - *left_sum = s; - } - } - } - } - s = GradStats(); - for (bst_uint i = hist.size - 1; i != 0; --i) { - s.Add(hist.data[i]); - if (s.sum_hess >= param_.min_child_weight) { - c.SetSubstract(node_sum, s); - if (c.sum_hess >= param_.min_child_weight) { - double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) + - CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain; - if (best->Update(static_cast(loss_chg), fid, - hist.cut[i - 1], true, false, c, s)) { - *left_sum = c; - } - } - } - } - } - - void FindSplit(const std::vector &feature_set, - RegTree *p_tree) { - const size_t num_feature = feature_set.size(); - // get the best split condition for each node - std::vector sol(qexpand_.size()); - std::vector left_sum(qexpand_.size()); - auto nexpand = qexpand_.size(); - common::ParallelFor(nexpand, ctx_->Threads(), common::Sched::Dyn(1), [&](auto wid) { - const int nid = qexpand_[wid]; - CHECK_EQ(node2workindex_[nid], static_cast(wid)); - SplitEntry &best = sol[wid]; - GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0]; - for (size_t i = 0; i < feature_set.size(); ++i) { - // Query is thread safe as it's a const function. 
- if (!this->interaction_constraints_.Query(nid, feature_set[i])) { - continue; - } - - EnumerateSplit(this->wspace_.hset[0][i + wid * (num_feature + 1)], node_sum, feature_set[i], - &best, &left_sum[wid]); - } - }); - // get the best result, we can synchronize the solution - for (bst_omp_uint wid = 0; wid < nexpand; ++wid) { - const bst_node_t nid = qexpand_[wid]; - SplitEntry const& best = sol[wid]; - const GradStats &node_sum = wspace_.hset[0][num_feature + wid * (num_feature + 1)].data[0]; - this->SetStats(p_tree, nid, node_sum); - // set up the values - p_tree->Stat(nid).loss_chg = best.loss_chg; - // now we know the solution in snode[nid], set split - if (best.loss_chg > kRtEps) { - bst_float base_weight = CalcWeight(param_, node_sum); - bst_float left_leaf_weight = - CalcWeight(param_, best.left_sum.sum_grad, best.left_sum.sum_hess) * - param_.learning_rate; - bst_float right_leaf_weight = - CalcWeight(param_, best.right_sum.sum_grad, - best.right_sum.sum_hess) * - param_.learning_rate; - p_tree->ExpandNode(nid, best.SplitIndex(), best.split_value, - best.DefaultLeft(), base_weight, left_leaf_weight, - right_leaf_weight, best.loss_chg, - node_sum.sum_hess, - best.left_sum.GetHess(), best.right_sum.GetHess()); - GradStats right_sum; - right_sum.SetSubstract(node_sum, left_sum[wid]); - auto left_child = (*p_tree)[nid].LeftChild(); - auto right_child = (*p_tree)[nid].RightChild(); - this->SetStats(p_tree, left_child, left_sum[wid]); - this->SetStats(p_tree, right_child, right_sum); - this->interaction_constraints_.Split(nid, best.SplitIndex(), left_child, right_child); - } else { - (*p_tree)[nid].SetLeaf(p_tree->Stat(nid).base_weight * param_.learning_rate); - } - } - } - - inline void SetStats(RegTree *p_tree, int nid, const GradStats &node_sum) { - p_tree->Stat(nid).base_weight = - static_cast(CalcWeight(param_, node_sum)); - p_tree->Stat(nid).sum_hess = static_cast(node_sum.sum_hess); - } -}; - -class CQHistMaker : public HistMaker { - public: - explicit CQHistMaker(GenericParameter const *ctx) : HistMaker(ctx) {} - char const *Name() const override { return "grow_local_histmaker"; } - - protected: - struct HistEntry { - HistMaker::HistUnit hist; - unsigned istart; - /*! - * \brief add a histogram to data, - * do linear scan, start from istart - */ - inline void Add(bst_float fv, - const std::vector &gpair, - const bst_uint ridx) { - while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; - CHECK_NE(istart, hist.size); - hist.data[istart].Add(gpair[ridx]); - } - /*! 
- * \brief add a histogram to data, - * do linear scan, start from istart - */ - inline void Add(bst_float fv, - GradientPair gstats) { - if (fv < hist.cut[istart]) { - hist.data[istart].Add(gstats); - } else { - while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; - if (istart != hist.size) { - hist.data[istart].Add(gstats); - } else { - LOG(INFO) << "fv=" << fv << ", hist.size=" << hist.size; - for (size_t i = 0; i < hist.size; ++i) { - LOG(INFO) << "hist[" << i << "]=" << hist.cut[i]; - } - LOG(FATAL) << "fv=" << fv << ", hist.last=" << hist.cut[hist.size - 1]; - } - } - } - }; - // sketch type used for this - using WXQSketch = common::WXQuantileSketch; - // initialize the work set of tree - void InitWorkSet(DMatrix *p_fmat, - const RegTree &tree, - std::vector *p_fset) override { - if (p_fmat != cache_dmatrix_) { - feat_helper_.InitByCol(p_fmat, tree); - cache_dmatrix_ = p_fmat; - } - feat_helper_.SyncInfo(); - feat_helper_.SampleCol(this->param_.colsample_bytree, p_fset); - } - // code to create histogram - void CreateHist(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) override { - const MetaInfo &info = p_fmat->Info(); - // fill in reverse map - feat2workindex_.resize(tree.param.num_feature); - std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1); - for (size_t i = 0; i < fset.size(); ++i) { - feat2workindex_[fset[i]] = static_cast(i); - } - // start to work - this->wspace_.Configure(1); - // if it is C++11, use lazy evaluation for Allreduce, - // to gain speedup in recovery - auto lazy_get_hist = [&]() { - thread_hist_.resize(ctx_->Threads()); - // start accumulating statistics - for (const auto &batch : p_fmat->GetBatches()) { - auto page = batch.GetView(); - // start enumeration - common::ParallelFor(fset.size(), ctx_->Threads(), common::Sched::Dyn(1), [&](auto i) { - int fid = fset[i]; - int offset = feat2workindex_[fid]; - if (offset >= 0) { - this->UpdateHistCol(gpair, page[fid], info, tree, fset, offset, - &thread_hist_[omp_get_thread_num()]); - } - }); - } - // update node statistics. 
- this->GetNodeStats(gpair, *p_fmat, tree, - &thread_stats_, &node_stats_); - for (int const nid : this->qexpand_) { - const int wid = this->node2workindex_[nid]; - this->wspace_.hset[0][fset.size() + wid * (fset.size() + 1)] - .data[0] = node_stats_[nid]; - } - }; - // sync the histogram - rabit::Allreduce(&dmlc::BeginPtr(this->wspace_.hset[0].data)->sum_grad, - this->wspace_.hset[0].data.size() * 2, lazy_get_hist); - } - - void ResetPositionAfterSplit(DMatrix *, - const RegTree &tree) override { - this->GetSplitSet(this->qexpand_, tree, &fsplit_set_); - } - void ResetPosAndPropose(const std::vector &gpair, - DMatrix *p_fmat, - const std::vector &fset, - const RegTree &tree) override { - const MetaInfo &info = p_fmat->Info(); - // fill in reverse map - feat2workindex_.resize(tree.param.num_feature); - std::fill(feat2workindex_.begin(), feat2workindex_.end(), -1); - work_set_.clear(); - for (auto fidx : fset) { - if (feat_helper_.Type(fidx) == 2) { - feat2workindex_[fidx] = static_cast(work_set_.size()); - work_set_.push_back(fidx); - } else { - feat2workindex_[fidx] = -2; - } - } - const size_t work_set_size = work_set_.size(); - - sketchs_.resize(this->qexpand_.size() * work_set_size); - for (auto& sketch : sketchs_) { - sketch.Init(info.num_row_, this->param_.sketch_eps); - } - // initialize the summary array - summary_array_.resize(sketchs_.size()); - // setup maximum size - unsigned max_size = this->param_.MaxSketchSize(); - for (size_t i = 0; i < sketchs_.size(); ++i) { - summary_array_[i].Reserve(max_size); - } - { - // get summary - thread_sketch_.resize(ctx_->Threads()); - - // TWOPASS: use the real set + split set in the column iteration. - this->SetDefaultPostion(p_fmat, tree); - work_set_.insert(work_set_.end(), fsplit_set_.begin(), fsplit_set_.end()); - std::sort(work_set_.begin(), work_set_.end()); - work_set_.resize(std::unique(work_set_.begin(), work_set_.end()) - work_set_.begin()); - - // start accumulating statistics - for (const auto &batch : p_fmat->GetBatches()) { - // TWOPASS: use the real set + split set in the column iteration. 
- this->CorrectNonDefaultPositionByBatch(batch, fsplit_set_, tree); - auto page = batch.GetView(); - // start enumeration - common::ParallelFor(work_set_.size(), ctx_->Threads(), common::Sched::Dyn(1), - [&](auto i) { - int fid = work_set_[i]; - int offset = feat2workindex_[fid]; - if (offset >= 0) { - this->UpdateSketchCol(gpair, page[fid], tree, work_set_size, offset, - &thread_sketch_[omp_get_thread_num()]); - } - }); - } - for (size_t i = 0; i < sketchs_.size(); ++i) { - common::WXQuantileSketch::SummaryContainer out; - sketchs_[i].GetSummary(&out); - summary_array_[i].SetPrune(out, max_size); - } - CHECK_EQ(summary_array_.size(), sketchs_.size()); - } - if (summary_array_.size() != 0) { - size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_size); - sreducer_.Allreduce(dmlc::BeginPtr(summary_array_), nbytes, summary_array_.size()); - } - // now we get the final result of sketch, setup the cut - this->wspace_.cut.clear(); - this->wspace_.rptr.clear(); - this->wspace_.rptr.push_back(0); - for (size_t wid = 0; wid < this->qexpand_.size(); ++wid) { - for (unsigned int i : fset) { - int offset = feat2workindex_[i]; - if (offset >= 0) { - const WXQSketch::Summary &a = summary_array_[wid * work_set_size + offset]; - for (size_t i = 1; i < a.size; ++i) { - bst_float cpt = a.data[i].value - kRtEps; - if (i == 1 || cpt > this->wspace_.cut.back()) { - this->wspace_.cut.push_back(cpt); - } - } - // push a value that is greater than anything - if (a.size != 0) { - bst_float cpt = a.data[a.size - 1].value; - // this must be bigger than last value in a scale - bst_float last = cpt + fabs(cpt) + kRtEps; - this->wspace_.cut.push_back(last); - } - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } else { - CHECK_EQ(offset, -2); - bst_float cpt = feat_helper_.MaxValue(i); - this->wspace_.cut.push_back(cpt + fabs(cpt) + kRtEps); - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } - } - // reserve last value for global statistics - this->wspace_.cut.push_back(0.0f); - this->wspace_.rptr.push_back(static_cast(this->wspace_.cut.size())); - } - CHECK_EQ(this->wspace_.rptr.size(), - (fset.size() + 1) * this->qexpand_.size() + 1); - } - - inline void UpdateHistCol(const std::vector &gpair, - const SparsePage::Inst &col, - const MetaInfo &info, - const RegTree &tree, - const std::vector &fset, - bst_uint fid_offset, - std::vector *p_temp) { - if (col.size() == 0) return; - // initialize sbuilder for use - std::vector &hbuilder = *p_temp; - hbuilder.resize(tree.param.num_nodes); - for (int const nid : this->qexpand_) { - const unsigned wid = this->node2workindex_[nid]; - hbuilder[nid].istart = 0; - hbuilder[nid].hist = this->wspace_.hset[0][fid_offset + wid * (fset.size()+1)]; - } - if (this->param_.cache_opt != 0) { - constexpr bst_uint kBuffer = 32; - bst_uint align_length = col.size() / kBuffer * kBuffer; - int buf_position[kBuffer]; - GradientPair buf_gpair[kBuffer]; - for (bst_uint j = 0; j < align_length; j += kBuffer) { - for (bst_uint i = 0; i < kBuffer; ++i) { - bst_uint ridx = col[j + i].index; - buf_position[i] = this->position_[ridx]; - buf_gpair[i] = gpair[ridx]; - } - for (bst_uint i = 0; i < kBuffer; ++i) { - const int nid = buf_position[i]; - if (nid >= 0) { - hbuilder[nid].Add(col[j + i].fvalue, buf_gpair[i]); - } - } - } - for (bst_uint j = align_length; j < col.size(); ++j) { - const bst_uint ridx = col[j].index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(col[j].fvalue, gpair[ridx]); - } - } - } else { - 
for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - hbuilder[nid].Add(c.fvalue, gpair, ridx); - } - } - } - } - inline void UpdateSketchCol(const std::vector &gpair, - const SparsePage::Inst &col, - const RegTree &tree, - size_t work_set_size, - bst_uint offset, - std::vector *p_temp) { - if (col.size() == 0) return; - // initialize sbuilder for use - std::vector &sbuilder = *p_temp; - sbuilder.resize(tree.param.num_nodes); - for (int const nid : this->qexpand_) { - const unsigned wid = this->node2workindex_[nid]; - sbuilder[nid].sum_total = 0.0f; - sbuilder[nid].sketch = &sketchs_[wid * work_set_size + offset]; - } - // first pass, get sum of weight, TODO, optimization to skip first pass - for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].sum_total += gpair[ridx].GetHess(); - } - } - // if only one value, no need to do second pass - if (col[0].fvalue == col[col.size()-1].fvalue) { - for (int const nid : this->qexpand_) { - sbuilder[nid].sketch->Push( - col[0].fvalue, static_cast(sbuilder[nid].sum_total)); - } - return; - } - // two pass scan - unsigned max_size = this->param_.MaxSketchSize(); - for (int const nid : this->qexpand_) { - sbuilder[nid].Init(max_size); - } - // second pass, build the sketch - if (this->param_.cache_opt != 0) { - constexpr bst_uint kBuffer = 32; - bst_uint align_length = col.size() / kBuffer * kBuffer; - int buf_position[kBuffer]; - bst_float buf_hess[kBuffer]; - for (bst_uint j = 0; j < align_length; j += kBuffer) { - for (bst_uint i = 0; i < kBuffer; ++i) { - bst_uint ridx = col[j + i].index; - buf_position[i] = this->position_[ridx]; - buf_hess[i] = gpair[ridx].GetHess(); - } - for (bst_uint i = 0; i < kBuffer; ++i) { - const int nid = buf_position[i]; - if (nid >= 0) { - sbuilder[nid].Push(col[j + i].fvalue, buf_hess[i], max_size); - } - } - } - for (bst_uint j = align_length; j < col.size(); ++j) { - const bst_uint ridx = col[j].index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(col[j].fvalue, gpair[ridx].GetHess(), max_size); - } - } - } else { - for (const auto& c : col) { - const bst_uint ridx = c.index; - const int nid = this->position_[ridx]; - if (nid >= 0) { - sbuilder[nid].Push(c.fvalue, gpair[ridx].GetHess(), max_size); - } - } - } - for (int const nid : this->qexpand_) { sbuilder[nid].Finalize(max_size); } - } - // cached dmatrix where we initialized the feature on. - const DMatrix* cache_dmatrix_{nullptr}; - // feature helper - BaseMaker::FMetaHelper feat_helper_; - // temp space to map feature id to working index - std::vector feat2workindex_; - // set of index from fset that are current work set - std::vector work_set_; - // set of index from that are split candidates. 
- std::vector fsplit_set_; - // thread temp data - std::vector > thread_sketch_; - // used to hold statistics - std::vector > thread_stats_; - // used to hold start pointer - std::vector > thread_hist_; - // node statistics - std::vector node_stats_; - // summary array - std::vector summary_array_; - // reducer for summary - rabit::SerializeReducer sreducer_; - // per node, per feature sketch - std::vector > sketchs_; -}; - -XGBOOST_REGISTER_TREE_UPDATER(LocalHistMaker, "grow_local_histmaker") - .describe("Tree constructor that uses approximate histogram construction.") - .set_body([](GenericParameter const *ctx, ObjInfo) { return new CQHistMaker(ctx); }); -} // namespace tree -} // namespace xgboost From 594a963624b60365fcaf0c623f19a124d9783c15 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 16 Jun 2022 12:20:51 -0700 Subject: [PATCH 3/5] fix mpi engine --- rabit/src/engine_mpi.cc | 87 ----------------------------------------- 1 file changed, 87 deletions(-) diff --git a/rabit/src/engine_mpi.cc b/rabit/src/engine_mpi.cc index c5811cb76a6c..d5ed2f454993 100644 --- a/rabit/src/engine_mpi.cc +++ b/rabit/src/engine_mpi.cc @@ -158,92 +158,5 @@ void Allreduce_(void *sendrecvbuf, MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, GetType(dtype), GetOp(op)); } - -// code for reduce handle -ReduceHandle::ReduceHandle(void) - : handle_(NULL), redfunc_(NULL), htype_(NULL) { -} -ReduceHandle::~ReduceHandle(void) { - /* !WARNING! - - A handle can be held by a tree method/Learner from xgboost. The booster might not be - freed until program exit, while (good) users call rabit.finalize() before reaching - the end of program. So op->Free() might be called after finalization and results - into following error: - - ``` - Attempting to use an MPI routine after finalizing MPICH - ``` - - Here we skip calling Free if MPI has already been finalized to workaround the issue. - It can be a potential leak of memory. The best way to resolve it is to eliminate all - use of long living handle. 
-   */
-  int finalized = 0;
-  CHECK_EQ(MPI_Finalized(&finalized), MPI_SUCCESS);
-  if (handle_ != NULL) {
-    MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-    if (!finalized) {
-      op->Free();
-    }
-    delete op;
-  }
-  if (htype_ != NULL) {
-    MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-    if (!finalized) {
-      dtype->Free();
-    }
-    delete dtype;
-  }
-}
-int ReduceHandle::TypeSize(const MPI::Datatype &dtype) {
-  return dtype.Get_size();
-}
-void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) {
-  utils::Assert(handle_ == NULL, "cannot initialize reduce handle twice");
-  if (type_nbytes != 0) {
-    MPI::Datatype *dtype = new MPI::Datatype();
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-    htype_ = dtype;
-  }
-  MPI::Op *op = new MPI::Op();
-  MPI::User_function *pf = redfunc;
-  op->Init(pf, true);
-  handle_ = op;
-}
-void ReduceHandle::Allreduce(void *sendrecvbuf,
-                             size_t type_nbytes, size_t count,
-                             IEngine::PreprocFunction prepare_fun,
-                             void *prepare_arg) {
-  utils::Assert(handle_ != NULL, "must initialize handle to call AllReduce");
-  MPI::Op *op = reinterpret_cast<MPI::Op*>(handle_);
-  MPI::Datatype *dtype = reinterpret_cast<MPI::Datatype*>(htype_);
-  if (created_type_nbytes_ != type_nbytes || dtype == NULL) {
-    if (dtype == NULL) {
-      dtype = new MPI::Datatype();
-    } else {
-      dtype->Free();
-    }
-    if (type_nbytes % 8 == 0) {
-      *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long));  // NOLINT(*)
-    } else if (type_nbytes % 4 == 0) {
-      *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int));
-    } else {
-      *dtype = MPI::CHAR.Create_contiguous(type_nbytes);
-    }
-    dtype->Commit();
-    created_type_nbytes_ = type_nbytes;
-  }
-  if (prepare_fun != NULL) prepare_fun(prepare_arg);
-  MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, count, *dtype, *op);
-}
 }  // namespace engine
 }  // namespace rabit
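The handle deleted above wrapped the reduced struct in a contiguous MPI datatype sized by ``type_nbytes``, preferring the widest base type that divides the size evenly. The same idea restated in the plain MPI C API — a sketch assuming an initialized MPI environment, not code from this repository:

```cpp
#include <cstddef>
#include <mpi.h>

// Build and commit a contiguous datatype covering one type_nbytes-sized
// struct, mirroring the branching of the removed ReduceHandle::Init.
MPI_Datatype MakeContiguousType(std::size_t type_nbytes) {
  MPI_Datatype dtype;
  if (type_nbytes % 8 == 0) {
    MPI_Type_contiguous(static_cast<int>(type_nbytes / sizeof(long)), MPI_LONG, &dtype);
  } else if (type_nbytes % 4 == 0) {
    MPI_Type_contiguous(static_cast<int>(type_nbytes / sizeof(int)), MPI_INT, &dtype);
  } else {
    MPI_Type_contiguous(static_cast<int>(type_nbytes), MPI_CHAR, &dtype);
  }
  MPI_Type_commit(&dtype);
  return dtype;
}
```

Committing the type once and caching it (as ``created_type_nbytes_`` did) avoids rebuilding it on every ``Allreduce`` call that uses the same element size.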
From e4784a4364aaa806ac4c86853a54d2a7de5ecc3e Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 17 Jun 2022 11:02:16 -0700
Subject: [PATCH 4/5] remove sketch_eps and update docs

---
 doc/parameter.rst                          | 10 ---
 doc/treemethod.rst                         | 62 ++++++++---------
 .../scala/spark/params/BoosterParams.scala | 14 -----
 src/tree/param.h                           | 12 ----
 4 files changed, 33 insertions(+), 65 deletions(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 3a35666338dd..71ada71f7612 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -151,15 +151,6 @@ Parameters for Tree Booster
   - ``hist``: Faster histogram optimized approximate greedy algorithm.
   - ``gpu_hist``: GPU implementation of ``hist`` algorithm.
 
-* ``sketch_eps`` [default=0.03]
-
-  - Only used for ``updater=grow_local_histmaker``.
-  - This roughly translates into ``O(1 / sketch_eps)`` number of bins.
-    Compared to directly select number of bins, this comes with theoretical guarantee with sketch accuracy.
-  - Usually user does not have to tune this.
-    But consider setting to a lower number for more accurate enumeration of split candidates.
-  - range: (0, 1)
-
 * ``scale_pos_weight`` [default=1]
 
   - Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider: ``sum(negative instances) / sum(positive instances)``. See :doc:`Parameters Tuning </tutorials/param_tuning>` for more discussion. Also, see Higgs Kaggle competition demo for examples: `R `_, `py1 `_, `py2 `_, `py3 `_.
 
@@ -170,7 +161,6 @@ Parameters for Tree Booster
 
   - ``grow_colmaker``: non-distributed column-based construction of trees.
   - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
-  - ``grow_local_histmaker``: based on local histogram counting.
   - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
   - ``grow_gpu_hist``: Grow tree with GPU.
   - ``sync``: synchronizes trees in all distributed nodes.
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index 91f546c36a9c..254eafb281db 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -5,7 +5,7 @@ Tree Methods
 For training boosted tree models, there are 2 parameters used for choosing algorithms,
 namely ``updater`` and ``tree_method``.  XGBoost has 4 builtin tree methods, namely
 ``exact``, ``approx``, ``hist`` and ``gpu_hist``.  Along with these tree methods, there
-are also some free standing updaters including ``grow_local_histmaker``, ``refresh``,
+are also some free standing updaters including ``refresh``,
 ``prune`` and ``sync``.  The parameter ``updater`` is more primitive than ``tree_method``
 as the latter is just a pre-configuration of the former.  The difference is mostly due to
 historical reasons that each updater requires some specific configurations and might has
@@ -37,27 +37,18 @@ approximated training algorithms.  These algorithms build a gradient histogram f
 node and iterate through the histogram instead of real dataset.  Here we introduce the
 implementations in XGBoost below.
 
-1. ``grow_local_histmaker`` updater: An approximation tree method described in `reference
-   paper <http://arxiv.org/abs/1603.02754>`_. This updater is rarely used in practice so
-   it's still an updater rather than tree method. During split finding, it first runs a
-   weighted GK sketching for data points belong to current node to find split candidates,
-   using hessian as weights. The histogram is built upon this per-node sketch. It's
-   faster than ``exact`` in some applications, but still slow in computation.
-
-2. ``approx`` tree method: An approximation tree method described in `reference paper
-   <http://arxiv.org/abs/1603.02754>`_. Different from ``grow_local_histmaker``, it runs
-   sketching before building each tree using all the rows (rows belonging to the root)
-   instead of per-node dataset. Similar to ``grow_local_histmaker`` updater, hessian is
-   used as weights during sketch. The algorithm can be accessed by setting
-   ``tree_method`` to ``approx``.
+1. ``approx`` tree method: An approximation tree method described in the `reference paper
+   <http://arxiv.org/abs/1603.02754>`_. It runs sketching before building each tree
+   using all the rows (rows belonging to the root). Hessian is used as weights during
+   sketching. The algorithm can be accessed by setting ``tree_method`` to ``approx``.
 
-3. ``hist`` tree method: An approximation tree method used in LightGBM with slight
+2. ``hist`` tree method: An approximation tree method used in LightGBM with slight
    differences in implementation.  It runs sketching before training using only user
   provided weights instead of hessian.  The subsequent per-node histogram is built upon
   this global sketch.  This is the fastest algorithm as it runs sketching only once.  The
   algorithm can be accessed by setting ``tree_method`` to ``hist``.
 
-4. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
+3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of
    ``hist``, with additional support for gradient based sampling.  The algorithm can be
    accessed by setting ``tree_method`` to ``gpu_hist``.
 
@@ -102,19 +93,32 @@ Other Updaters
 Removed Updaters
 ****************
 
-2 Updaters were removed during development due to maintainability. We describe them here
-solely for the interest of documentation. First one is distributed colmaker, which was a
-distributed version of exact tree method. It required specialization for column based
-splitting strategy and a different prediction procedure. As the exact tree method is slow
-by itself and scaling is even less efficient, we removed it entirely. Second one is
-``skmaker``. Per-node weighted sketching employed by ``grow_local_histmaker`` is slow,
-the ``skmaker`` was unmaintained and seems to be a workaround trying to eliminate the
-histogram creation step and uses sketching values directly during split evaluation. It
-was never tested and contained some unknown bugs, we decided to remove it and focus our
-resources on more promising algorithms instead. For accuracy, most of the time
-``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so removing
-them don't have any real practical impact.
-
+Three updaters were removed during development due to maintainability concerns. We
+describe them here solely for the interest of documentation.
+
+1. Distributed colmaker, which was a distributed version of the exact tree method. It
+   required specialization for the column-based splitting strategy and a different
+   prediction procedure. As the exact tree method is slow by itself and scaling is even
+   less efficient, we removed it entirely.
+
+2. ``skmaker``. The per-node weighted sketching employed by ``grow_local_histmaker`` is
+   slow; ``skmaker`` was unmaintained and appears to have been a workaround that tried to
+   eliminate the histogram creation step and use sketch values directly during split
+   evaluation. It was never tested and contained some unknown bugs, so we decided to
+   remove it and focus our resources on more promising algorithms instead. For accuracy,
+   ``approx``, ``hist`` and ``gpu_hist`` are usually sufficient with some parameter
+   tuning, so removing these updaters has no real practical impact.
+
+3. ``grow_local_histmaker`` updater: An approximation tree method described in the
+   `reference paper <http://arxiv.org/abs/1603.02754>`_. This updater was rarely used in
+   practice, so it remained an updater rather than a tree method. During split finding,
+   it first ran a weighted GK sketch over the data points belonging to the current node
+   to find split candidates, using the hessian as weights. The histogram was built upon
+   this per-node sketch. It was faster than ``exact`` in some applications, but still
+   slow in computation. It was removed because it depended on Rabit's customized
+   reduction function, which handles any data structure that can be serialized and
+   deserialized into a fixed-size buffer; that is not directly supported by NCCL or
+   federated learning gRPC, making it hard to refactor into a common allreduce interface.
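To make the "hessian as weights" sketching mentioned in the documentation changes above concrete, here is a minimal, non-streaming illustration that places split candidates at evenly spaced quantiles of the hessian-weighted CDF. The real implementation uses mergeable, prunable GK summaries; this sketch only shows the weighting idea, and the function name is illustrative:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

// Pick up to max_bins split candidates from (feature value, hessian) pairs by
// walking the hessian-weighted CDF of the sorted feature values.
std::vector<float> WeightedCuts(std::vector<std::pair<float, float>> fv_hess,
                                std::size_t max_bins) {
  std::vector<float> cuts;
  if (fv_hess.empty() || max_bins == 0) return cuts;
  std::sort(fv_hess.begin(), fv_hess.end());
  double total = 0.0;
  for (auto const &p : fv_hess) total += p.second;
  double const step = total / static_cast<double>(max_bins);
  double acc = 0.0, next = step;
  for (auto const &p : fv_hess) {
    acc += p.second;  // advance the weighted CDF by this row's hessian
    if (acc >= next && (cuts.empty() || p.first > cuts.back())) {
      cuts.push_back(p.first);
      next += step;
    }
  }
  return cuts;
}
```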
************** Feature Matrix diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala index e4223e953812..21a77341c7e5 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala @@ -182,20 +182,6 @@ private[spark] trait BoosterParams extends Params { final def getSinglePrecisionHistogram: Boolean = $(singlePrecisionHistogram) - /** - * This is only used for approximate greedy algorithm. - * This roughly translated into O(1 / sketch_eps) number of bins. Compared to directly select - * number of bins, this comes with theoretical guarantee with sketch accuracy. - * [default=0.03] range: (0, 1) - */ - final val sketchEps = new DoubleParam(this, "sketchEps", - "This is only used for approximate greedy algorithm. This roughly translated into" + - " O(1 / sketch_eps) number of bins. Compared to directly select number of bins, this comes" + - " with theoretical guarantee with sketch accuracy.", - (value: Double) => value < 1 && value > 0) - - final def getSketchEps: Double = $(sketchEps) - /** * Control the balance of positive and negative weights, useful for unbalanced classes. A typical * value to consider: sum(negative cases) / sum(positive cases). [default=1] diff --git a/src/tree/param.h b/src/tree/param.h index 5c6c2e11375d..c94437732299 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -65,8 +65,6 @@ struct TrainParam : public XGBoostParameter { // whether to subsample columns during tree construction float colsample_bytree; // accuracy of sketch - float sketch_eps; - // accuracy of sketch float sketch_ratio; // option to open cacheline optimization bool cache_opt; @@ -162,10 +160,6 @@ struct TrainParam : public XGBoostParameter { .set_range(0.0f, 1.0f) .set_default(1.0f) .describe("Subsample ratio of columns, resample on each tree construction."); - DMLC_DECLARE_FIELD(sketch_eps) - .set_range(0.0f, 1.0f) - .set_default(0.03f) - .describe("EXP Param: Sketch accuracy of approximate algorithm."); DMLC_DECLARE_FIELD(sketch_ratio) .set_lower_bound(0.0f) .set_default(2.0f) @@ -203,12 +197,6 @@ struct TrainParam : public XGBoostParameter { return loss_chg < this->min_split_loss || (this->max_depth != 0 && depth > this->max_depth); } - /*! 
\brief maximum sketch size */
-  inline unsigned MaxSketchSize() const {
-    auto ret = static_cast<unsigned>(sketch_ratio / sketch_eps);
-    CHECK_GT(ret, 0U);
-    return ret;
-  }
 
   bst_node_t MaxNodes() const {
     if (this->max_depth == 0 && this->max_leaves == 0) {

From 6248c4025734e018742b7fcbc8f65ebacd37883f Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 17 Jun 2022 11:17:33 -0700
Subject: [PATCH 5/5] remove setSketchEps

---
 .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala | 2 --
 .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala  | 2 --
 2 files changed, 4 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
index acc9febff6e4..a10394837a68 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala
@@ -100,8 +100,6 @@ class XGBoostClassifier (
 
   def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)
 
-  def setSketchEps(value: Double): this.type = set(sketchEps, value)
-
   def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)
 
   def setSampleType(value: String): this.type = set(sampleType, value)
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
index 77e0ac6b0901..82f8346b062a 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala
@@ -102,8 +102,6 @@ class XGBoostRegressor (
 
   def setMaxLeaves(value: Int): this.type = set(maxLeaves, value)
 
-  def setSketchEps(value: Double): this.type = set(sketchEps, value)
-
   def setScalePosWeight(value: Double): this.type = set(scalePosWeight, value)
 
   def setSampleType(value: String): this.type = set(sampleType, value)
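For reference, the budget that ``sketch_eps`` controlled is the ``TrainParam::MaxSketchSize()`` helper deleted in patch 4: ``sketch_ratio / sketch_eps`` summary entries, so the defaults (``sketch_ratio = 2``, ``sketch_eps = 0.03``) gave ``static_cast<unsigned>(2 / 0.03) = 66`` entries — the ``O(1 / sketch_eps)`` bins described in the deleted parameter documentation. A standalone restatement of that arithmetic:

```cpp
#include <cstdint>

// The removed helper restated outside TrainParam: with the defaults
// 2.0f / 0.03f this yields a budget of 66 summary entries per sketch.
inline uint32_t MaxSketchSize(float sketch_ratio = 2.0f, float sketch_eps = 0.03f) {
  return static_cast<uint32_t>(sketch_ratio / sketch_eps);
}
```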